Merge branch 'issues/65' into tantivy-imhotep

Conflicts:
	src/core/segment_reader.rs
	src/fastfield/reader.rs
This commit is contained in:
Paul Masurel
2017-04-21 09:53:14 +09:00
45 changed files with 2228 additions and 458 deletions

View File

@@ -74,7 +74,7 @@ pub mod tests {
use Score;
use core::SegmentReader;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use schema::Field;
/// Stores all of the doc ids.
@@ -125,9 +125,9 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u32>,
vals: Vec<u64>,
field: Field,
ff_reader: Option<U32FastFieldReader>,
ff_reader: Option<U64FastFieldReader>,
}
impl FastFieldTestCollector {
@@ -139,7 +139,7 @@ pub mod tests {
}
}
pub fn vals(self,) -> Vec<u32> {
pub fn vals(self,) -> Vec<u64> {
self.vals
}
}

View File

@@ -4,8 +4,14 @@ use common::serialize::BinarySerializable;
use std::mem;
pub fn compute_num_bits(amplitude: u32) -> u8 {
(32u32 - amplitude.leading_zeros()) as u8
pub fn compute_num_bits(amplitude: u64) -> u8 {
let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
}
else {
64
}
}
pub struct BitPacker {
@@ -15,7 +21,7 @@ pub struct BitPacker {
written_size: usize,
}
impl BitPacker {
impl BitPacker {
pub fn new(num_bits: usize) -> BitPacker {
BitPacker {
@@ -26,7 +32,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: Write>(&mut self, val: u32, output: &mut TWrite) -> io::Result<()> {
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
let val_u64 = val as u64;
if self.mini_buffer_written + self.num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
@@ -67,22 +73,29 @@ impl BitPacker {
pub struct BitUnpacker {
num_bits: usize,
mask: u32,
mask: u64,
data_ptr: *const u8,
data_len: usize,
}
impl BitUnpacker {
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
let mask: u64 =
if num_bits == 64 {
!0u64
}
else {
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: num_bits,
mask: (1u32 << num_bits) - 1u32,
mask: mask,
data_ptr: data.as_ptr(),
data_len: data.len()
}
}
pub fn get(&self, idx: usize) -> u32 {
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0;
}
@@ -101,7 +114,7 @@ impl BitUnpacker {
}
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & self.mask)
}
@@ -123,13 +136,14 @@ mod test {
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new(num_bits);
let max_val: u32 = (1 << num_bits) - 1;
let vals: Vec<u32> = (0u32..len as u32).map(|i| {
let max_val: u64 = (1 << num_bits) - 1;
let vals: Vec<u64> = (0u64..len as u64).map(|i| {
if max_val == 0 {
0
}

View File

@@ -110,7 +110,7 @@ pub mod tests {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_unsorted(&data);
assert_eq!(compressed.len(), 19_790);
assert!(compressed.len() <= 19_794);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_unsorted(&compressed, data.len());
for i in 0..data.len() {
@@ -123,7 +123,7 @@ pub mod tests {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_sorted(&data);
assert_eq!(compressed.len(), 7_822);
assert!(compressed.len() <= 7_826);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_sorted(&compressed, data.len());
for i in 0..data.len() {

View File

@@ -4,16 +4,32 @@
mod composite;
pub use self::composite::{CompositeEncoder, CompositeDecoder};
#[cfg(feature="simdcompression")]
mod compression_simd;
#[cfg(feature="simdcompression")]
pub use self::compression_simd::{BlockEncoder, BlockDecoder};
#[cfg(not(feature="simdcompression"))]
mod compression_nosimd;
#[cfg(not(feature="simdcompression"))]
pub use self::compression_nosimd::{BlockEncoder, BlockDecoder};
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::*;
}
#[cfg(feature="simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::*;
}
pub use self::pack::{BlockEncoder, BlockDecoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
mod vint {
mod compression_vint_nosimd;
pub use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
mod vint {
mod compression_vint_simd;
pub use self::compression_vint_simd::*;
}
pub trait VIntEncoder {
@@ -28,49 +44,14 @@ pub trait VIntDecoder {
impl VIntEncoder for BlockEncoder {
fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
offset = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
self.output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
self.output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&self.output[..byte_written]
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
vint::compress_sorted(input, &mut self.output, offset)
}
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
self.output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
self.output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&self.output[..byte_written]
vint::compress_unsorted(input, &mut self.output)
}
}
}
impl VIntDecoder for BlockDecoder {
@@ -79,52 +60,19 @@ impl VIntDecoder for BlockDecoder {
compressed_data: &'a [u8],
offset: u32,
num_els: usize) -> &'a [u8] {
let mut read_byte = 0;
let mut result = offset;
for i in 0..num_els {
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
self.output[i] = result;
}
self.output_len = num_els;
&compressed_data[read_byte..]
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(
&mut self,
compressed_data: &'a [u8],
num_els: usize) -> &'a [u8] {
let mut read_byte = 0;
for i in 0..num_els {
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
self.output[i] = result;
}
self.output_len = num_els;
&compressed_data[read_byte..]
}
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
@@ -224,7 +172,7 @@ pub mod tests {
#[test]
fn test_encode_vint() {
{
let expected_length = 123;
let expected_length = 154;
let mut encoder = BlockEncoder::new();
let input: Vec<u32> = (0u32..123u32)
.map(|i| 4 + i * 7 / 2)
@@ -232,23 +180,13 @@ pub mod tests {
.collect();
for offset in &[0u32, 1u32, 2u32] {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert_eq!(encoded_data.len(), expected_length);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(0, remaining_data.len());
assert_eq!(input, decoder.output_array());
}
}
{
let mut encoder = BlockEncoder::new();
let input = vec!(3u32, 17u32, 187u32);
let encoded_data = encoder.compress_vint_sorted(&input, 0);
assert_eq!(encoded_data.len(), 4);
assert_eq!(encoded_data[0], 3u8 + 128u8);
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
assert_eq!(encoded_data[3], (1u8 + 128u8));
}
}
@@ -272,4 +210,27 @@ pub mod tests {
});
}
const NUM_INTS_BENCH_VINT: usize = 10;
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

View File

@@ -2,7 +2,7 @@ use common::bitpacker::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use std::cmp;
use std::io::Write;
use super::NUM_DOCS_PER_BLOCK;
use super::super::NUM_DOCS_PER_BLOCK;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;

View File

@@ -1,4 +1,4 @@
use super::NUM_DOCS_PER_BLOCK;
use super::super::NUM_DOCS_PER_BLOCK;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;

View File

@@ -0,0 +1,92 @@
/// Delta-encodes `input` as variable-length integers into `output`.
///
/// Every value is replaced by its difference with the previous value
/// (`offset` plays the role of the "previous value" for the first element).
/// Each difference is then emitted as groups of 7 bits, least significant
/// group first. The high bit (`128`) is set on the final byte of an integer
/// to mark where it ends.
///
/// Returns the prefix of `output` that has been written.
///
/// # Panics
///
/// Panics if `output` is too small to receive the encoded bytes.
/// If a value is smaller than its predecessor, the subtraction underflows
/// (a panic when overflow checks are enabled).
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
    let mut cursor = 0;
    for &value in input {
        let mut delta: u32 = value - offset;
        offset = value;
        // All groups but the last one are written without the stop bit.
        while delta >= 128u32 {
            output[cursor] = (delta & 127u32) as u8;
            cursor += 1;
            delta >>= 7;
        }
        // What is left fits on 7 bits: flag it as the terminating byte.
        output[cursor] = (delta as u8) | 128u8;
        cursor += 1;
    }
    &output[..cursor]
}
/// Encodes `input` as variable-length integers into `output`.
///
/// Unlike the sorted variant, values are encoded as-is (no delta).
/// Each value is emitted as groups of 7 bits, least significant group
/// first, and the high bit (`128`) is set on the final byte of an integer
/// to mark where it ends.
///
/// Returns the prefix of `output` that has been written.
///
/// # Panics
///
/// Panics if `output` is too small to receive the encoded bytes.
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
    let mut cursor = 0;
    for &value in input {
        let mut remaining: u32 = value;
        // All groups but the last one are written without the stop bit.
        while remaining >= 128u32 {
            output[cursor] = (remaining & 127u32) as u8;
            cursor += 1;
            remaining >>= 7;
        }
        // What is left fits on 7 bits: flag it as the terminating byte.
        output[cursor] = (remaining as u8) | 128u8;
        cursor += 1;
    }
    &output[..cursor]
}
/// Decodes `output.len()` delta-encoded variable-length integers.
///
/// Decoding starts from `offset`: every 7-bit group read is added to a
/// running total, and the total is stored in `output` each time a byte
/// carrying the stop bit (`128`) is met.
///
/// Returns the part of `compressed_data` that has not been consumed.
///
/// # Panics
///
/// Panics if `compressed_data` ends before `output` has been filled.
#[inline(always)]
pub fn uncompress_sorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32],
    offset: u32) -> &'a [u8] {
    let mut cursor = 0;
    let mut running = offset;
    for slot in output.iter_mut() {
        let mut shift = 0u32;
        loop {
            let byte = compressed_data[cursor];
            cursor += 1;
            // The low 7 bits are payload, accumulated directly on the running value.
            running += ((byte & 127u8) as u32) << shift;
            // The high bit marks the last byte of the current integer.
            if byte >= 128u8 {
                break;
            }
            shift += 7;
        }
        *slot = running;
    }
    &compressed_data[cursor..]
}
/// Decodes `output.len()` variable-length integers.
///
/// Each integer is rebuilt from its 7-bit groups (least significant group
/// first); the byte carrying the stop bit (`128`) is the last byte of the
/// integer.
///
/// Returns the part of `compressed_data` that has not been consumed.
///
/// # Panics
///
/// Panics if `compressed_data` ends before `output` has been filled.
#[inline(always)]
pub fn uncompress_unsorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32]) -> &'a [u8] {
    let mut cursor = 0;
    for slot in output.iter_mut() {
        let mut value = 0u32;
        let mut shift = 0u32;
        loop {
            let byte = compressed_data[cursor];
            cursor += 1;
            // The low 7 bits are payload.
            value += ((byte & 127u8) as u32) << shift;
            // The high bit marks the last byte of the current integer.
            if byte >= 128u8 {
                break;
            }
            shift += 7;
        }
        *slot = value;
    }
    &compressed_data[cursor..]
}

View File

@@ -0,0 +1,82 @@
/// Raw declarations of the foreign functions called by the safe wrappers
/// defined below in this file.
///
/// The names suggest these bind to the StreamVByte C library; the C header
/// is not visible from here, so the exact contracts should be confirmed
/// against it.
mod streamvbyte {
use libc::size_t;
// NOTE(review): `streamvbyte_decode` declares `num_els` as `usize` while the
// three other functions declare it as `u32`. Confirm against the C prototype
// that this difference is intentional.
extern {
/// Delta encoder. The wrapper in this file treats the returned value
/// as the number of bytes written to `output`.
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32) -> size_t;
/// Delta decoder. The wrapper in this file treats the returned value
/// as the number of bytes consumed from `compressed_data`.
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32) -> size_t;
/// Encoder without delta. The wrapper in this file treats the returned
/// value as the number of bytes written to `output`.
pub fn streamvbyte_encode(
data: *const u32,
num_els: u32,
output: *mut u8) -> size_t;
/// Decoder without delta. The wrapper in this file treats the returned
/// value as the number of bytes consumed from `compressed_data`.
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize) -> size_t;
}
}
/// Compresses `input` into `output` by delegating to the foreign
/// `streamvbyte_delta_encode` routine, with `offset` forwarded as-is.
///
/// Returns the prefix of `output` whose length is the value returned by
/// the foreign call.
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
    let src = input.as_ptr();
    let num_els = input.len() as u32;
    let dst = output.as_mut_ptr();
    // SAFETY: both pointers come from live slices. Nothing here checks that
    // `output` is large enough for what the foreign routine writes —
    // NOTE(review): confirm callers size the buffer for the worst case.
    let written = unsafe { streamvbyte::streamvbyte_delta_encode(src, num_els, dst, offset) };
    &output[..written]
}
/// Compresses `input` into `output` by delegating to the foreign
/// `streamvbyte_encode` routine.
///
/// Returns the prefix of `output` whose length is the value returned by
/// the foreign call.
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
    let src = input.as_ptr();
    let num_els = input.len() as u32;
    let dst = output.as_mut_ptr();
    // SAFETY: both pointers come from live slices. Nothing here checks that
    // `output` is large enough for what the foreign routine writes —
    // NOTE(review): confirm callers size the buffer for the worst case.
    let written = unsafe { streamvbyte::streamvbyte_encode(src, num_els, dst) };
    &output[..written]
}
/// Fills `output` by delegating to the foreign `streamvbyte_delta_decode`
/// routine, asking for `output.len()` integers and forwarding `offset`.
///
/// Returns `compressed_data` minus its first `n` bytes, where `n` is the
/// value returned by the foreign call.
#[inline(always)]
pub fn uncompress_sorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32],
    offset: u32) -> &'a [u8] {
    let src = compressed_data.as_ptr();
    let dst = output.as_mut_ptr();
    let num_els = output.len() as u32;
    // SAFETY: both pointers come from live slices and `num_els` is the
    // length of `output`. Nothing here checks that `compressed_data` holds
    // enough bytes — NOTE(review): confirm the input is trusted.
    let consumed = unsafe { streamvbyte::streamvbyte_delta_decode(src, dst, num_els, offset) };
    &compressed_data[consumed..]
}
/// Fills `output` by delegating to the foreign `streamvbyte_decode`
/// routine, asking for `output.len()` integers.
///
/// Returns `compressed_data` minus its first `n` bytes, where `n` is the
/// value returned by the foreign call.
#[inline(always)]
pub fn uncompress_unsorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32]) -> &'a [u8] {
    let src = compressed_data.as_ptr();
    let dst = output.as_mut_ptr();
    let num_els = output.len();
    // SAFETY: both pointers come from live slices and `num_els` is the
    // length of `output`. Nothing here checks that `compressed_data` holds
    // enough bytes — NOTE(review): confirm the input is trusted.
    let consumed = unsafe { streamvbyte::streamvbyte_decode(src, dst, num_els) };
    &compressed_data[consumed..]
}

View File

@@ -17,7 +17,7 @@ use std::sync::Arc;
use std::fmt;
use schema::Field;
use postings::{SegmentPostings, BlockSegmentPostings, SegmentPostingsOption};
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use fastfield::{U64FastFieldsReader, U64FastFieldReader};
use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
@@ -41,8 +41,8 @@ pub struct SegmentReader {
term_infos: Arc<TermDictionary<TermInfo>>,
postings_data: ReadOnlySource,
store_reader: StoreReader,
fast_fields_reader: Arc<U32FastFieldsReader>,
fieldnorms_reader: Arc<U32FastFieldsReader>,
fast_fields_reader: Arc<U64FastFieldsReader>,
fieldnorms_reader: Arc<U64FastFieldsReader>,
delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
@@ -78,11 +78,11 @@ impl SegmentReader {
}
/// Accessor to a segment's fast field reader given a field.
pub fn get_fast_field_reader(&self, field: Field) -> Option<U32FastFieldReader> {
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
pub fn get_fast_field_reader(&self, field: Field) -> Option<U64FastFieldReader> {
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u32 field
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
@@ -93,8 +93,8 @@ impl SegmentReader {
warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
None
},
&FieldType::U32(ref u32_options) => {
if u32_options.is_fast() {
&FieldType::U64(ref u64_options) => {
if u64_options.is_fast() {
self.fast_fields_reader.get_field(field)
}
else {
@@ -112,7 +112,7 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U32FastFieldReader> {
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_reader.get_field(field)
}
@@ -138,10 +138,10 @@ impl SegmentReader {
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data));
let fast_fields_reader = try!(U64FastFieldsReader::open(fast_field_data));
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data));
let fieldnorms_reader = try!(U64FastFieldsReader::open(fieldnorms_data));
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
@@ -274,7 +274,7 @@ impl SegmentReader {
_ => SegmentPostingsOption::NoFreq,
}
}
FieldType::U32(_) => SegmentPostingsOption::NoFreq
FieldType::U64(_) => SegmentPostingsOption::NoFreq
};
self.read_postings(term, segment_posting_option)
}

View File

@@ -8,15 +8,15 @@
/// They are useful when a field is required for all or most of
/// the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
///
/// Currently only u32 fastfield are supported.
/// Currently, only u64 fast fields are supported.
mod reader;
mod writer;
mod serializer;
pub mod delete;
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
pub use self::writer::{U64FastFieldsWriter, U64FastFieldWriter};
pub use self::reader::{U64FastFieldsReader, U64FastFieldReader};
pub use self::serializer::FastFieldSerializer;
#[cfg(test)]
@@ -37,7 +37,7 @@ mod tests {
lazy_static! {
static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u32_field("field", FAST);
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
static ref FIELD: Field = {
@@ -45,15 +45,15 @@ mod tests {
};
}
fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: Field, value: u32) {
fn add_single_field_doc(fast_field_writers: &mut U64FastFieldsWriter, field: Field, value: u64) {
let mut doc = Document::default();
doc.add_u32(field, value);
doc.add_u64(field, value);
fast_field_writers.add_document(&doc);
}
#[test]
pub fn test_fastfield() {
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
let test_fastfield = U64FastFieldReader::from(vec!(100,200,300));
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
@@ -66,23 +66,23 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u32);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 23 as usize);
assert_eq!(source.len(), 31 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 13u32);
assert_eq!(fast_field_reader.get(1), 14u32);
assert_eq!(fast_field_reader.get(2), 2u32);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
}
}
@@ -93,35 +93,35 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u32);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 48 as usize);
assert_eq!(source.len(), 56 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 4u32);
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
assert_eq!(fast_field_reader.get(2), 3_052u32);
assert_eq!(fast_field_reader.get(3), 9002u32);
assert_eq!(fast_field_reader.get(4), 15_001u32);
assert_eq!(fast_field_reader.get(5), 777u32);
assert_eq!(fast_field_reader.get(6), 1_002u32);
assert_eq!(fast_field_reader.get(7), 1_501u32);
assert_eq!(fast_field_reader.get(8), 215u32);
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
assert_eq!(fast_field_reader.get(3), 9002u64);
assert_eq!(fast_field_reader.get(4), 15_001u64);
assert_eq!(fast_field_reader.get(5), 777u64);
assert_eq!(fast_field_reader.get(6), 1_002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
}
}
@@ -134,30 +134,62 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 21 as usize);
assert_eq!(source.len(), 29 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u32);
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
}
fn generate_permutation() -> Vec<u32> {
#[test]
fn test_intfastfield_large_numbers() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80037 as usize);
}
{
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
}
}
}
fn generate_permutation() -> Vec<u64> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng = XorShiftRng::from_seed(*seed);
let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
rng.shuffle(&mut permutation);
permutation
}
@@ -171,7 +203,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -180,10 +212,11 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let mut a = 0u32;
let mut a = 0u64;
for _ in 0..n {
println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]);
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
a = fast_field_reader.get(a as u32);
}
@@ -195,7 +228,7 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= permutation[i as usize];
}
@@ -208,7 +241,7 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
@@ -224,7 +257,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -233,11 +266,11 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= fast_field_reader.get(i);
}
@@ -254,7 +287,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -263,13 +296,13 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a);
a = fast_field_reader.get(a) as u32;
}
a
});

View File

@@ -1,7 +1,5 @@
use std::io;
use std::collections::HashMap;
use std::ops::Deref;
use directory::ReadOnlySource;
use common::BinarySerializable;
use DocId;
@@ -10,36 +8,36 @@ use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::U32FastFieldsWriter;
use fastfield::U64FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
lazy_static! {
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
let u32_fast_field = U32FastFieldReader::from(Vec::new());
u32_fast_field._data.clone()
static ref U64_FAST_FIELD_EMPTY: ReadOnlySource = {
let u64_fast_field = U64FastFieldReader::from(Vec::new());
u64_fast_field._data.clone()
};
}
pub struct U32FastFieldReader {
pub struct U64FastFieldReader {
_data: ReadOnlySource,
bit_unpacker: BitUnpacker,
min_val: u32,
max_val: u32,
min_val: u64,
max_val: u64,
}
impl U32FastFieldReader {
impl U64FastFieldReader {
pub fn empty() -> U32FastFieldReader {
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone())
pub fn empty() -> U64FastFieldReader {
U64FastFieldReader::open(U64_FAST_FIELD_EMPTY.clone())
}
pub fn min_val(&self,) -> u32 {
pub fn min_val(&self,) -> u64 {
self.min_val
}
pub fn max_val(&self,) -> u32 {
pub fn max_val(&self,) -> u64 {
self.max_val
}
@@ -47,22 +45,22 @@ impl U32FastFieldReader {
///
/// # Panics
/// Panics if the data is corrupted.
pub fn open(data: ReadOnlySource) -> U32FastFieldReader {
let min_val;
let amplitude;
let max_val;
pub fn open(data: ReadOnlySource) -> U64FastFieldReader {
let min_val: u64;
let max_val: u64;
let bit_unpacker: BitUnpacker;
{
let mut cursor = data.as_slice();
min_val = u32::deserialize(&mut cursor).unwrap();
amplitude = u32::deserialize(&mut cursor).unwrap();
let mut cursor: &[u8] = data.as_slice();
min_val = u64::deserialize(&mut cursor).expect("Failed to read the min_val of fast field.");
let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
max_val = min_val + amplitude;
let num_bits = compute_num_bits(amplitude);
bit_unpacker = BitUnpacker::new(cursor, num_bits as usize)
}
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = {
let data_arr = &(data.deref()[8..]);
BitUnpacker::new(data_arr, num_bits as usize)
};
U32FastFieldReader {
U64FastFieldReader {
_data: data,
bit_unpacker: bit_unpacker,
min_val: min_val,
@@ -70,23 +68,23 @@ impl U32FastFieldReader {
}
}
pub fn get(&self, doc: DocId) -> u32 {
pub fn get(&self, doc: DocId) -> u64 {
self.min_val + self.bit_unpacker.get(doc as usize)
}
}
impl From<Vec<u32>> for U32FastFieldReader {
fn from(vals: Vec<u32>) -> U32FastFieldReader {
impl From<Vec<u64>> for U64FastFieldReader {
fn from(vals: Vec<u64>) -> U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("field", FAST);
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = U64FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
@@ -95,24 +93,22 @@ impl From<Vec<u32>> for U32FastFieldReader {
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_readers = U64FastFieldsReader::open(source).unwrap();
fast_field_readers.get_field(field).unwrap()
}
}
pub struct U32FastFieldsReader {
pub struct U64FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
unsafe impl Send for U32FastFieldReader {}
unsafe impl Sync for U32FastFieldReader {}
unsafe impl Send for U32FastFieldsReader {}
unsafe impl Sync for U32FastFieldsReader {}
unsafe impl Send for U64FastFieldsReader {}
unsafe impl Sync for U64FastFieldsReader {}
impl U32FastFieldsReader {
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
impl U64FastFieldsReader {
pub fn open(source: ReadOnlySource) -> io::Result<U64FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
@@ -136,26 +132,26 @@ impl U32FastFieldsReader {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(U32FastFieldsReader {
Ok(U64FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u32 field
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_field(&self, field: Field) -> Option<U32FastFieldReader> {
pub fn get_field(&self, field: Field) -> Option<U64FastFieldReader> {
self.field_offsets
.get(&field)
.map(|&(start, stop)| {
let field_source = self.source.slice(start as usize, stop as usize);
U32FastFieldReader::open(field_source)
U64FastFieldReader::open(field_source)
})
}
}

View File

@@ -14,13 +14,13 @@ use std::io::{self, Write, Seek, SeekFrom};
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
@@ -29,7 +29,7 @@ pub struct FastFieldSerializer {
write: WritePtr,
written_size: usize,
fields: Vec<(Field, u32)>,
min_value: u32,
min_value: u64,
field_open: bool,
bit_packer: BitPacker,
}
@@ -50,8 +50,8 @@ impl FastFieldSerializer {
})
}
/// Start serializing a new u32 fast field
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
@@ -68,14 +68,14 @@ impl FastFieldSerializer {
}
/// Pushes a new value to the currently open u32 fast field.
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
let val_to_write: u32 = val - self.min_value;
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer.write(val_to_write, &mut self.write)?;
Ok(())
}
/// Close the u32 fast field.
/// Close the u64 fast field.
pub fn close_field(&mut self,) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));

View File

@@ -4,32 +4,32 @@ use std::io;
use schema::Value;
use DocId;
pub struct U32FastFieldsWriter {
field_writers: Vec<U32FastFieldWriter>,
pub struct U64FastFieldsWriter {
field_writers: Vec<U64FastFieldWriter>,
}
impl U32FastFieldsWriter {
impl U64FastFieldsWriter {
pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
let u32_fields: Vec<Field> = schema.fields()
pub fn from_schema(schema: &Schema) -> U64FastFieldsWriter {
let u64_fields: Vec<Field> = schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.filter(|&(_, field_entry)| field_entry.is_u64_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
U32FastFieldsWriter::new(u32_fields)
U64FastFieldsWriter::new(u64_fields)
}
pub fn new(fields: Vec<Field>) -> U32FastFieldsWriter {
U32FastFieldsWriter {
pub fn new(fields: Vec<Field>) -> U64FastFieldsWriter {
U64FastFieldsWriter {
field_writers: fields
.into_iter()
.map(U32FastFieldWriter::new)
.map(U64FastFieldWriter::new)
.collect(),
}
}
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> {
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U64FastFieldWriter> {
self.field_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
@@ -60,14 +60,14 @@ impl U32FastFieldsWriter {
}
}
pub struct U32FastFieldWriter {
pub struct U64FastFieldWriter {
field: Field,
vals: Vec<u32>,
vals: Vec<u64>,
}
impl U32FastFieldWriter {
pub fn new(field: Field) -> U32FastFieldWriter {
U32FastFieldWriter {
impl U64FastFieldWriter {
pub fn new(field: Field) -> U64FastFieldWriter {
U64FastFieldWriter {
field: field,
vals: Vec::new(),
}
@@ -81,24 +81,24 @@ impl U32FastFieldWriter {
let target = doc as usize + 1;
debug_assert!(self.vals.len() <= target);
while self.vals.len() < target {
self.add_val(0u32)
self.add_val(0u64)
}
}
pub fn add_val(&mut self, val: u32) {
pub fn add_val(&mut self, val: u64) {
self.vals.push(val);
}
fn extract_val(&self, doc: &Document) -> u32 {
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
match *v {
Value::U32(ref val) => { *val }
_ => { panic!("Expected a u32field, got {:?} ", v) }
Value::U64(ref val) => { *val }
_ => { panic!("Expected a u64field, got {:?} ", v) }
}
},
None => {
0u32
0u64
}
}
}
@@ -112,7 +112,7 @@ impl U32FastFieldWriter {
let zero = 0;
let min = *self.vals.iter().min().unwrap_or(&zero);
let max = *self.vals.iter().max().unwrap_or(&min);
try!(serializer.new_u32_fast_field(self.field, min, max));
try!(serializer.new_u64_fast_field(self.field, min, max));
for &val in &self.vals {
try!(serializer.add_val(val));
}

View File

@@ -6,7 +6,7 @@ use Index;
use Searcher;
use rand::distributions::{IndependentSample, Range};
fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
assert!(searcher.segment_readers().len() < 20);
assert_eq!(searcher.num_docs() as usize, vals.len());
}
@@ -17,19 +17,19 @@ fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u32_field("id", U32_INDEXED);
let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED);
let id_field = schema_builder.add_u64_field("id", U64_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", U64_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let universe = Range::new(0u32, 20u32);
let universe = Range::new(0u64, 20u64);
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
let mut committed_docs: HashSet<u32> = HashSet::new();
let mut uncommitted_docs: HashSet<u32> = HashSet::new();
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
let random_val = universe.ind_sample(&mut rng);
@@ -45,15 +45,15 @@ fn test_indexing() {
else {
if committed_docs.remove(&random_val) ||
uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u32(id_field, random_val);
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
}
else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u32(id_field, random_val);
for i in 1u32..10u32 {
doc.add_u32(multiples_field, random_val * i);
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
}

View File

@@ -283,7 +283,7 @@ mod tests {
let field = Field(1u32);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u32(field, i as u32)
term: Term::from_field_u64(field, i as u64)
}
};

View File

@@ -6,7 +6,7 @@ use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use itertools::Itertools;
use postings::Postings;
use postings::DocSet;
@@ -50,31 +50,31 @@ impl DeltaPositionComputer {
}
fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u32, u32)> {
fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> {
if max_doc == 0 {
None
}
else if !delete_bitset.has_deletes() {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u32_reader.min_val(), u32_reader.max_val()))
Some((u64_reader.min_val(), u64_reader.max_val()))
}
else {
// some deleted documents,
// we need to recompute the max / min
(0..max_doc)
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
.map(|doc_id| u32_reader.get(doc_id))
.map(|doc_id| u64_reader.get(doc_id))
.minmax()
.into_option()
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field)
}
@@ -113,37 +113,37 @@ impl IndexMerger {
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.filter(|&(_, field_entry)| field_entry.is_u64_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
}
// used both to merge field norms and regular u32 fast fields.
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U32FastFieldReader>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
for field in fields {
let mut u32_readers = vec!();
let mut min_val = u32::max_value();
let mut max_val = u32::min_value();
let mut u64_readers = vec!();
let mut min_val = u64::max_value();
let mut max_val = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u32_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) {
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset()));
u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
}
}
None => {
let error_msg = format!("Failed to find a u32_reader for field {:?}", field);
let error_msg = format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
return Err(Error::SchemaError(error_msg))
}
@@ -151,7 +151,7 @@ impl IndexMerger {
}
if u32_readers.is_empty() {
if u64_readers.is_empty() {
// we have actually zero documents.
min_val = 0;
max_val = 0;
@@ -159,11 +159,11 @@ impl IndexMerger {
assert!(min_val <= max_val);
try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val));
for (max_doc, u32_reader, delete_bitset) in u32_readers {
try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val));
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u32_reader.get(doc_id);
let val = u64_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
}
}
@@ -311,8 +311,8 @@ mod tests {
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let index = Index::create_in_ram(schema_builder.build());
{
@@ -322,19 +322,19 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 3);
doc.add_u64(score_field, 3);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c");
doc.add_u32(score_field, 5);
doc.add_u64(score_field, 5);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d");
doc.add_u32(score_field, 7);
doc.add_u64(score_field, 7);
index_writer.add_document(doc);
}
index_writer.commit().expect("committed");
@@ -345,13 +345,13 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 11);
doc.add_u64(score_field, 11);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c g");
doc.add_u32(score_field, 13);
doc.add_u64(score_field, 13);
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed");
@@ -417,7 +417,7 @@ mod tests {
}
}
fn search_term(searcher: &Searcher, term: Term) -> Vec<u32> {
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
let mut collector = FastFieldTestCollector::for_field(Field(1));
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
searcher.search(&term_query, &mut collector).unwrap();
@@ -432,8 +432,8 @@ mod tests {
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();

View File

@@ -6,7 +6,7 @@ use schema::Term;
use core::Segment;
use core::SerializableSegment;
use postings::PostingsWriter;
use fastfield::U32FastFieldsWriter;
use fastfield::U64FastFieldsWriter;
use schema::Field;
use schema::FieldEntry;
use schema::FieldValue;
@@ -30,20 +30,20 @@ pub struct SegmentWriter<'a> {
max_doc: DocId,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
segment_serializer: SegmentSerializer,
fast_field_writers: U32FastFieldsWriter,
fieldnorms_writer: U32FastFieldsWriter,
fast_field_writers: U64FastFieldsWriter,
fieldnorms_writer: U64FastFieldsWriter,
doc_opstamps: Vec<u64>,
}
fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
let u32_fields: Vec<Field> = schema.fields()
fn create_fieldnorms_writer(schema: &Schema) -> U64FastFieldsWriter {
let u64_fields: Vec<Field> = schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
U32FastFieldsWriter::new(u32_fields)
U64FastFieldsWriter::new(u64_fields)
}
@@ -62,7 +62,7 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box
}
}
}
FieldType::U32(_) => {
FieldType::U64(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
@@ -95,7 +95,7 @@ impl<'a> SegmentWriter<'a> {
per_field_postings_writers: per_field_postings_writers,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: U32FastFieldsWriter::from_schema(schema),
fast_field_writers: U64FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
@@ -154,13 +154,13 @@ impl<'a> SegmentWriter<'a> {
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| {
field_norms_writer.add_val(num_tokens as u32)
field_norms_writer.add_val(num_tokens as u64)
});
}
FieldType::U32(ref u32_options) => {
if u32_options.is_indexed() {
FieldType::U64(ref u64_options) => {
if u64_options.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u32(field_value.field(), field_value.value().u32_value());
let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value());
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
}
}
@@ -205,8 +205,8 @@ impl<'a> SegmentWriter<'a> {
// This method is used as a trick to workaround the borrow checker
fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
fast_field_writers: &U32FastFieldsWriter,
fieldnorms_writer: &U32FastFieldsWriter,
fast_field_writers: &U64FastFieldsWriter,
fieldnorms_writer: &U64FastFieldsWriter,
mut serializer: SegmentSerializer,
heap: &'a Heap,) -> Result<()> {
for per_field_postings_writer in per_field_postings_writers {

View File

@@ -438,9 +438,9 @@ mod tests {
#[test]
fn test_indexed_u32() {
fn test_indexed_u64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("text", U32_INDEXED);
let field = schema_builder.add_u64_field("text", U64_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -451,7 +451,7 @@ mod tests {
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u32(field, 1u32);
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);

View File

@@ -68,7 +68,7 @@ mod tests {
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert_eq!(read.len(), 13);
assert!(read.len() <= 16);
}
#[test]
@@ -120,7 +120,7 @@ mod tests {
assert_eq!(fieldnorm_reader.get(0), 8 + 5);
assert_eq!(fieldnorm_reader.get(1), 2);
for i in 2 .. 1000 {
assert_eq!(fieldnorm_reader.get(i), i + 1);
assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
}
}
{

View File

@@ -109,8 +109,8 @@ impl PostingsSerializer {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U32(ref u32_options) => {
if u32_options.is_indexed() {
FieldType::U64(ref u64_options) => {
if u64_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized

View File

@@ -23,7 +23,7 @@ mod tests {
use collector::tests::TestCollector;
use Index;
use schema::*;
use fastfield::{U32FastFieldReader};
use fastfield::{U64FastFieldReader};
use postings::SegmentPostingsOption;
fn abs_diff(left: f32, right: f32) -> f32 {
@@ -111,7 +111,7 @@ mod tests {
let occurs = vec!(Occur::Should, Occur::Should);
let occur_filter = OccurFilter::new(&occurs);
let left_fieldnorms = U32FastFieldReader::from(vec!(100,200,300));
let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300));
let left = VecPostings::from(vec!(1, 2, 3));
let left_scorer = TermScorer {
@@ -120,7 +120,7 @@ mod tests {
postings: left,
};
let right_fieldnorms = U32FastFieldReader::from(vec!(15,25,35));
let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35));
let right = VecPostings::from(vec!(1, 3, 8));
let right_scorer = TermScorer {

View File

@@ -22,10 +22,9 @@ pub enum QueryParserError {
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
FieldDoesNotExist(String),
/// `ExpectedU32(field_name: String, field_value: String)`
/// The query contains a term for a `u32`-field, but the value
/// is not a u32.
ExpectedU32(String, String),
/// The query contains a term for a `u64`-field, but the value
/// is not a u64.
ExpectedU64(String, String),
/// Queries that are only "excluding" are forbidden. (e.g. -title:pop)
AllButQueryForbidden,
/// If no default field is declared, running a query without any
@@ -138,7 +137,7 @@ impl QueryParser {
loop {
if let Some(token) = token_iter.next() {
let text = token.to_string();
// TODO Handle u32
// TODO Handle u64
let term = Term::from_field_text(field, &text);
tokens.push(term);
} else {

View File

@@ -14,7 +14,7 @@ mod tests {
use query::Scorer;
use query::term_query::TermScorer;
use query::Query;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use query::TermQuery;
use Index;
use schema::*;
@@ -55,7 +55,7 @@ mod tests {
#[test]
pub fn test_term_scorer() {
let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4));
let left_fieldnorms = U64FastFieldReader::from(vec!(10, 4));
assert_eq!(left_fieldnorms.get(0), 10);
assert_eq!(left_fieldnorms.get(1), 4);
let left = VecPostings::from(vec!(1));

View File

@@ -1,13 +1,13 @@
use Score;
use DocId;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use postings::DocSet;
use query::Scorer;
use postings::Postings;
pub struct TermScorer<TPostings> where TPostings: Postings {
pub idf: Score,
pub fieldnorm_reader_opt: Option<U32FastFieldReader>,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
pub postings: TPostings,
}

View File

@@ -52,9 +52,9 @@ impl Document {
self.add(FieldValue::new(field, value));
}
/// Add a u32 field
pub fn add_u32(&mut self, field: Field, value: u32) {
self.add(FieldValue::new(field, Value::U32(value)));
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add(FieldValue::new(field, Value::U64(value)));
}
/// Add a field value

View File

@@ -1,5 +1,5 @@
use schema::TextOptions;
use schema::U32Options;
use schema::IntOptions;
use rustc_serialize::Decodable;
use rustc_serialize::Decoder;
@@ -22,7 +22,7 @@ pub struct FieldEntry {
impl FieldEntry {
/// Creates a new u32 field entry in the schema, given
/// Creates a new text field entry in the schema, given
/// a name, and some options.
pub fn new_text(field_name: String, field_type: TextOptions) -> FieldEntry {
FieldEntry {
@@ -31,12 +31,12 @@ impl FieldEntry {
}
}
/// Creates a new u32 field entry in the schema, given
/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_u32(field_name: String, field_type: U32Options) -> FieldEntry {
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
FieldEntry {
name: field_name,
field_type: FieldType::U32(field_type),
field_type: FieldType::U64(field_type),
}
}
@@ -54,14 +54,14 @@ impl FieldEntry {
pub fn is_indexed(&self,) -> bool {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_indexed(),
FieldType::U32(ref options) => options.is_indexed(),
FieldType::U64(ref options) => options.is_indexed(),
}
}
/// Returns true iff the field is a u32 fast field
pub fn is_u32_fast(&self,) -> bool {
/// Returns true iff the field is a u64 fast field
pub fn is_u64_fast(&self,) -> bool {
match self.field_type {
FieldType::U32(ref options) => options.is_fast(),
FieldType::U64(ref options) => options.is_fast(),
_ => false,
}
}
@@ -69,7 +69,7 @@ impl FieldEntry {
/// Returns true iff the field is stored
pub fn is_stored(&self,) -> bool {
match self.field_type {
FieldType::U32(ref options) => {
FieldType::U64(ref options) => {
options.is_stored()
}
FieldType::Str(ref options) => {
@@ -96,9 +96,9 @@ impl Encodable for FieldEntry {
options.encode(s)
}));
}
FieldType::U32(ref options) => {
FieldType::U64(ref options) => {
try!(s.emit_struct_field("type", 1, |s| {
s.emit_str("u32")
s.emit_str("u64")
}));
try!(s.emit_struct_field("options", 2, |s| {
options.encode(s)
@@ -122,9 +122,9 @@ impl Decodable for FieldEntry {
}));
d.read_struct_field("options", 2, |d| {
match field_type.as_ref() {
"u32" => {
let u32_options = try!(U32Options::decode(d));
Ok(FieldEntry::new_u32(name, u32_options))
"u64" => {
let u64_options = try!(IntOptions::decode(d));
Ok(FieldEntry::new_u64(name, u64_options))
}
"text" => {
let text_options = try!(TextOptions::decode(d));

View File

@@ -1,5 +1,5 @@
use schema::TextOptions;
use schema::U32Options;
use schema::IntOptions;
use rustc_serialize::json::Json;
use schema::Value;
@@ -11,20 +11,22 @@ use schema::Value;
pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflows its integer type.
OverflowError(String),
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u32 type)
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
/// Tantivy will try to autocast values.
TypeError(String),
}
/// A `FieldType` describes the type (text, u32) of a field as well as
/// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy.
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub enum FieldType {
/// String field type configuration
Str(TextOptions),
/// U32 field type configuration
U32(U32Options),
/// Unsigned 64-bit integers field type configuration
U64(IntOptions),
// /// Signed 64-bit integers field type configuration
// I64(IntOptions),
}
impl FieldType {
@@ -41,20 +43,15 @@ impl FieldType {
FieldType::Str(_) => {
Ok(Value::Str(field_text.clone()))
}
FieldType::U32(_) => {
Err(ValueParsingError::TypeError(format!("Expected a u32 int, got {:?}", json)))
FieldType::U64(_) => {
Err(ValueParsingError::TypeError(format!("Expected a u64 int, got {:?}", json)))
}
}
}
Json::U64(ref field_val_u64) => {
match *self {
FieldType::U32(_) => {
if *field_val_u64 > (u32::max_value() as u64) {
Err(ValueParsingError::OverflowError(format!("Expected u32, but value {:?} overflows.", field_val_u64)))
}
else {
Ok(Value::U32(*field_val_u64 as u32))
}
FieldType::U64(_) => {
Ok(Value::U64(*field_val_u64 as u64))
}
_ => {
Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", json)))
@@ -62,7 +59,7 @@ impl FieldType {
}
},
_ => {
Err(ValueParsingError::TypeError(format!("Expected a string or a u32, got {:?}", json)))
Err(ValueParsingError::TypeError(format!("Expected a string or a u64, got {:?}", json)))
}
}
}

View File

@@ -1,14 +1,14 @@
use std::ops::BitOr;
/// Define how a U32 field should be handled by tantivy.
/// Define how a u64 field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct U32Options {
pub struct IntOptions {
indexed: bool,
fast: bool,
stored: bool,
}
impl U32Options {
impl IntOptions {
/// Returns true iff the value is stored.
pub fn is_stored(&self,) -> bool {
@@ -26,39 +26,39 @@ impl U32Options {
self.fast
}
/// Set the u32 options as stored.
/// Set the u64 options as stored.
///
/// Only the fields that are set as *stored* are
/// persisted into the Tantivy's store.
pub fn set_stored(mut self,) -> U32Options {
pub fn set_stored(mut self,) -> IntOptions {
self.stored = true;
self
}
/// Set the u32 options as indexed.
/// Set the u64 options as indexed.
///
/// Setting an integer as indexed will generate
/// a posting list for each value taken by the integer.
pub fn set_indexed(mut self,) -> U32Options {
pub fn set_indexed(mut self,) -> IntOptions {
self.indexed = true;
self
}
/// Set the u32 options as a fast field.
/// Set the u64 options as a fast field.
///
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// If more than one value is associated to a fast field, only the last one is
/// kept.
pub fn set_fast(mut self,) -> U32Options {
pub fn set_fast(mut self,) -> IntOptions {
self.fast = true;
self
}
}
impl Default for U32Options {
fn default() -> U32Options {
U32Options {
impl Default for IntOptions {
fn default() -> IntOptions {
IntOptions {
fast: false,
indexed: false,
stored: false,
@@ -67,40 +67,40 @@ impl Default for U32Options {
}
/// Shortcut for a u32 fast field.
/// Shortcut for a u64 fast field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const FAST: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | U64_INDEXED`
pub const FAST: IntOptions = IntOptions {
indexed: false,
stored: false,
fast: true,
};
/// Shortcut for a u32 indexed field.
/// Shortcut for a u64 indexed field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const U32_INDEXED: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | U64_INDEXED`
pub const U64_INDEXED: IntOptions = IntOptions {
indexed: true,
stored: false,
fast: false,
};
/// Shortcut for a u32 stored field.
/// Shortcut for a u64 stored field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const U32_STORED: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | U64_INDEXED`
pub const U64_STORED: IntOptions = IntOptions {
indexed: false,
stored: true,
fast: false,
};
impl BitOr for U32Options {
impl BitOr for IntOptions {
type Output = U32Options;
type Output = IntOptions;
fn bitor(self, other: U32Options) -> U32Options {
let mut res = U32Options::default();
fn bitor(self, other: IntOptions) -> IntOptions {
let mut res = IntOptions::default();
res.indexed = self.indexed | other.indexed;
res.stored = self.stored | other.stored;
res.fast = self.fast | other.fast;

View File

@@ -7,7 +7,7 @@ Tantivy has a very strict schema.
The schema defines information about the fields your index contains, that is, for each field :
* the field name (may only contain letters `[a-zA-Z]`, numbers `[0-9]`, and `_`)
* the type of the field (currently only `text` and `u32` are supported)
* the type of the field (currently only `text` and `u64` are supported)
* how the field should be indexed / stored.
This very last point is critical as it will enable / disable some of the functionality
@@ -64,17 +64,17 @@ let schema = schema_builder.build();
## Setting a u32 field
## Setting a u64 field
### Example
```
use tantivy::schema::*;
let mut schema_builder = SchemaBuilder::default();
let num_stars_options = U32Options::default()
let num_stars_options = IntOptions::default()
.set_stored()
.set_indexed();
schema_builder.add_u32_field("num_stars", num_stars_options);
schema_builder.add_u64_field("num_stars", num_stars_options);
let schema = schema_builder.build();
```
@@ -82,15 +82,15 @@ Just like for Text fields (see above),
setting the field as stored defines whether the field will be
returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called,
and setting the field as indexed means that we will be able to perform queries such as `num_stars:10`.
Note that unlike text fields, u32 can only be indexed in one way for the moment.
Note that unlike text fields, u64 can only be indexed in one way for the moment.
This may change when we start supporting range queries.
The `fast` option on the other hand is specific to u32 fields, and is only relevant
The `fast` option on the other hand is specific to u64 fields, and is only relevant
if you are implementing your own queries. This functionality is somewhat similar to Lucene's
`DocValues`.
u32 that are indexed as fast will be stored in a special data structure that will
make it possible to access the u32 value given the doc id rapidly. This is useful if the value of
u64 that are indexed as fast will be stored in a special data structure that will
make it possible to access the u64 value given the doc id rapidly. This is useful if the value of
the field is required during scoring or collection for instance.
*/
@@ -104,7 +104,7 @@ mod field_entry;
mod field_value;
mod text_options;
mod u32_options;
mod int_options;
mod field;
mod value;
mod named_field_document;
@@ -129,10 +129,10 @@ pub use self::text_options::TEXT;
pub use self::text_options::STRING;
pub use self::text_options::STORED;
pub use self::u32_options::U32Options;
pub use self::u32_options::FAST;
pub use self::u32_options::U32_INDEXED;
pub use self::u32_options::U32_STORED;
pub use self::int_options::IntOptions;
pub use self::int_options::FAST;
pub use self::int_options::U64_INDEXED;
pub use self::int_options::U64_STORED;
use regex::Regex;

View File

@@ -26,8 +26,8 @@ impl Encodable for NamedFieldDocument {
Value::Str(ref text) => {
s.emit_str(text)
},
Value::U32(ref val) => {
s.emit_u32(*val)
Value::U64(ref val) => {
s.emit_u64(*val)
}
}
})

View File

@@ -12,8 +12,6 @@ use std::sync::Arc;
use super::*;
use std::fmt;
const MAX_NUM_FIELDS: usize = 255;
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
@@ -48,7 +46,7 @@ impl SchemaBuilder {
SchemaBuilder::default()
}
/// Adds a new u32 field.
/// Adds a new u64 field.
/// Returns the associated field handle
///
/// # Caution
@@ -58,12 +56,12 @@ impl SchemaBuilder {
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_u32_field(
pub fn add_u64_field(
&mut self,
field_name_str: &str,
field_options: U32Options) -> Field {
field_options: IntOptions) -> Field {
let field_name = String::from(field_name_str);
let field_entry = FieldEntry::new_u32(field_name, field_options);
let field_entry = FieldEntry::new_u64(field_name, field_options);
self.add_field(field_entry)
}
@@ -325,14 +323,15 @@ mod tests {
use schema::*;
use rustc_serialize::json;
use schema::field_type::ValueParsingError;
use schema::schema::DocParsingError::NotJSON;
#[test]
pub fn test_schema_serialization() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u32_field("count", count_options);
schema_builder.add_u64_field("count", count_options);
let schema = schema_builder.build();
let schema_json: String = format!("{}", json::as_pretty_json(&schema));
let expected = r#"[
@@ -354,7 +353,7 @@ mod tests {
},
{
"name": "count",
"type": "u32",
"type": "u64",
"options": {
"indexed": false,
"fast": true,
@@ -371,10 +370,10 @@ mod tests {
#[test]
pub fn test_document_to_json() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u32_field("count", count_options);
schema_builder.add_u64_field("count", count_options);
let schema = schema_builder.build();
let doc_json = r#"{
"title": "my title",
@@ -389,10 +388,10 @@ mod tests {
#[test]
pub fn test_parse_document() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u32_field("count", count_options);
let count_field = schema_builder.add_u64_field("count", count_options);
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
@@ -406,7 +405,7 @@ mod tests {
}"#).unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
assert_eq!(doc.get_first(count_field).unwrap().u32_value(), 4);
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
}
{
let json_err = schema.parse_document(r#"{
@@ -478,10 +477,25 @@ mod tests {
}"#);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(false);
}
_ => {
assert!(true);
}
}
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50000000000000000000
}"#);
match json_err {
Err(NotJSON(_)) => {
assert!(true);
}
_ => {
assert!(false);
assert!(false)
}
}
}

View File

@@ -39,20 +39,20 @@ impl Term {
Field(self.field_id())
}
/// Builds a term given a field, and a u32-value
/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u32 value of 3234,
/// Assuming the term has a field id of 1, and a u64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four byte are dedicated to storing the field id as a u32.
/// The 4 following bytes are encoding the u32 value.
pub fn from_field_u32(field: Field, val: u32) -> Term {
const U32_TERM_LEN: usize = 4 + 4;
let mut buffer = allocate_vec(U32_TERM_LEN);
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes are encoding the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
const U64_TERM_LEN: usize = 4 + 8;
let mut buffer = allocate_vec(U64_TERM_LEN);
// we want BigEndian here to have lexicographic order
// match the natural order of `(field, val)`
BigEndian::write_u32(&mut buffer[0..4], field.0);
BigEndian::write_u32(&mut buffer[4..], val);
BigEndian::write_u64(&mut buffer[4..], val);
Term(buffer)
}
@@ -69,11 +69,11 @@ impl Term {
Term(buffer)
}
/// Assume the term is a u32 field.
/// Assume the term is a u64 field.
///
/// Panics if the term is not a u32 field.
pub fn get_u32(&self) -> u32 {
BigEndian::read_u32(&self.0[4..])
/// Panics if the term is not a u64 field.
pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0[4..])
}
/// Builds a term from its byte representation.
@@ -88,7 +88,7 @@ impl Term {
/// (this does not include the field.)
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u32, its value is encoded according
/// If the term is a u64, its value is encoded according
/// to `byteorder::BigEndian`.
pub fn value(&self) -> &[u8] {
&self.0[4..]
@@ -147,14 +147,18 @@ mod tests {
assert_eq!(&term.as_slice()[4..], "test".as_bytes());
}
{
let term = Term::from_field_u32(count_field, 983u32);
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 2u8]);
assert_eq!(term.as_slice().len(), 8);
assert_eq!(term.as_slice().len(), 4 + 8);
assert_eq!(term.as_slice()[4], 0u8);
assert_eq!(term.as_slice()[5], 0u8);
assert_eq!(term.as_slice()[6], (933u32 / 256u32) as u8);
assert_eq!(term.as_slice()[7], (983u32 % 256u32) as u8);
assert_eq!(term.as_slice()[6], 0u8);
assert_eq!(term.as_slice()[7], 0u8);
assert_eq!(term.as_slice()[8], 0u8);
assert_eq!(term.as_slice()[9], 0u8);
assert_eq!(term.as_slice()[10], (933u64 / 256u64) as u8);
assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8);
}
}

View File

@@ -10,8 +10,8 @@ use std::io::Read;
pub enum Value {
/// The str type is used for any text information.
Str(String),
/// Unsigned 32-bits Integer `u32`
U32(u32),
/// Unsigned 64-bits Integer `u64`
U64(u64),
}
impl Value {
@@ -30,13 +30,13 @@ impl Value {
}
}
/// Returns the u32-value, provided the value is of the `U32` type.
/// Returns the u64-value, provided the value is of the `U64` type.
///
/// # Panics
/// If the value is not of type `U32`
pub fn u32_value(&self) -> u32 {
/// If the value is not of type `U64`
pub fn u64_value(&self) -> u64 {
match *self {
Value::U32(ref value) => {
Value::U64(ref value) => {
*value
}
_ => {
@@ -53,9 +53,9 @@ impl From<String> for Value {
}
impl From<u32> for Value {
fn from(v: u32) -> Value {
Value::U32(v)
impl From<u64> for Value {
fn from(v: u64) -> Value {
Value::U64(v)
}
}
@@ -66,7 +66,7 @@ impl<'a> From<&'a str> for Value {
}
const TEXT_CODE: u8 = 0;
const U32_CODE: u8 = 1;
const U64_CODE: u8 = 1;
impl BinarySerializable for Value {
@@ -77,8 +77,8 @@ impl BinarySerializable for Value {
written_size += try!(TEXT_CODE.serialize(writer));
written_size += try!(text.serialize(writer));
},
Value::U32(ref val) => {
written_size += try!(U32_CODE.serialize(writer));
Value::U64(ref val) => {
written_size += try!(U64_CODE.serialize(writer));
written_size += try!(val.serialize(writer));
},
}
@@ -91,9 +91,9 @@ impl BinarySerializable for Value {
let text = try!(String::deserialize(reader));
Ok(Value::Str(text))
}
U32_CODE => {
let value = try!(u32::deserialize(reader));
Ok(Value::U32(value))
U64_CODE => {
let value = try!(u64::deserialize(reader));
Ok(Value::U64(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code)))