Compare commits

..

20 Commits

Author SHA1 Message Date
Paul Masurel
fbe398dfa2 Added a broken unit test 2019-01-22 08:18:32 +09:00
Paul Masurel
0e8fcd5727 Plastic surgery 2019-01-19 23:13:27 +09:00
Paul Masurel
f745c83bb7 Closes 466. Removing mentions of the chain collector. (#467) 2019-01-16 10:28:19 +09:00
Paul Masurel
ffb16d9103 More efficient indexing (#463)
* Using unrolled u32 VInt and caching Vec s

* cargo fmt

* Exposing a io::Write in the Expull thing

* expull as a writer. clippy + format

* inline the first block

* simplified -if let Some-

* vint reader iterator

* blop
2019-01-13 14:51:18 +09:00
Paul Masurel
98ca703daa More efficient indexing (#462)
* Using unrolled u32 VInt and caching Vec s

* cargo fmt

* Exposing a io::Write in the Expull thing

* expull as a writer. clippy + format

* inline the first block

* simplified -if let Some-

* vint reader iterator
2019-01-13 14:41:56 +09:00
Paul Masurel
b9d25cda5d Using LittleEndian explicitely 2019-01-08 12:41:58 +09:00
Paul Masurel
beb4289ec2 Less unsafe 2019-01-08 00:48:14 +09:00
Andrew Banchich
bdd72e4683 Update README.md (#459)
Fix Elasticsearch spelling
2018-12-27 07:26:49 +09:00
Paul Masurel
45c3cd19be Fixing README: git clone https... 2018-12-26 21:13:33 +09:00
Paul Masurel
b8241c5603 0.8.0 2018-12-26 10:18:34 +09:00
Paul Masurel
a4745151c0 Version to 0.8 2018-12-26 10:11:06 +09:00
Paul Masurel
e2ce326a8c Merge branch 'issue/457' 2018-12-18 10:35:01 +09:00
Paul Masurel
bb21d12a70 Bumping version 2018-12-18 10:14:12 +09:00
Paul Masurel
4565aba62a Added unit test for exponential search 2018-12-18 09:24:31 +09:00
Paul Masurel
545a7ec8dd Closes #457 2018-12-18 09:18:46 +09:00
Paul Masurel
e68775d71c Format and update murmurhash32 version 2018-12-17 19:12:38 +09:00
Paul Masurel
dcc92d287e Facet remove unsafe (#456)
* Removing some unsafe

* Removing some unsafe (2)

* Remove murmurhash
2018-12-17 19:08:48 +09:00
Paul Masurel
b48f81c051 Removing unsafe from bitpacking code (#455) 2018-12-17 19:06:37 +09:00
Paul Masurel
a3042e956b Facet remove unsafe (#454)
* Removing some unsafe

* Removing some unsafe (2)
2018-12-17 09:31:09 +09:00
dependabot[bot]
1fa10f0a0b Update itertools requirement from 0.7 to 0.8 (#453)
Updates the requirements on [itertools](https://github.com/bluss/rust-itertools) to permit the latest version.
- [Release notes](https://github.com/bluss/rust-itertools/releases)
- [Commits](https://github.com/bluss/rust-itertools/commits/0.8.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-12-17 09:28:36 +09:00
38 changed files with 913 additions and 639 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.1
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.0-dev"
version = "0.8.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.7"
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,6 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -21,7 +21,7 @@
**Tantivy** is a **full text search engine library** written in rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
@@ -76,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run :
git clone git@github.com:tantivy-search/tantivy.git
git clone https://github.com/tantivy-search/tantivy.git
cd tantivy
cargo build

View File

@@ -70,8 +70,6 @@ impl Collector for StatsCollector {
// Our standard deviation will be a float.
type Fruit = Option<Stats>;
type SegmentFruit = Self::Fruit;
type Child = StatsSegmentCollector;
fn for_segment(

View File

@@ -58,7 +58,6 @@ pub struct Count;
impl Collector for Count {
type Fruit = usize;
type SegmentFruit = usize;
type Child = SegmentCountCollector;

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -258,8 +258,6 @@ impl FacetCollector {
impl Collector for FacetCollector {
type Fruit = FacetCounts;
type SegmentFruit = FacetCounts;
type Child = FacetSegmentCollector;
fn for_segment(
@@ -371,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
}
FacetCounts { facet_counts }
}
@@ -405,9 +404,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));

View File

@@ -136,10 +136,8 @@ pub trait Collector: Sync {
/// e.g. `usize` for the `Count` collector.
type Fruit: Fruit;
type SegmentFruit: Fruit;
/// Type of the `SegmentCollector` associated to this collector.
type Child: SegmentCollector<Fruit = Self::SegmentFruit>;
type Child: SegmentCollector<Fruit = Self::Fruit>;
/// `set_segment` is called before beginning to enumerate
/// on this segment.
@@ -154,7 +152,7 @@ pub trait Collector: Sync {
/// Combines the fruit associated to the collection of each segments
/// into one fruit.
fn merge_fruits(&self, segment_fruits: Vec<Self::SegmentFruit>) -> Result<Self::Fruit>;
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
}
/// The `SegmentCollector` is the trait in charge of defining the
@@ -183,9 +181,6 @@ where
Right: Collector,
{
type Fruit = (Left::Fruit, Right::Fruit);
type SegmentFruit = (Left::SegmentFruit, Right::SegmentFruit);
type Child = (Left::Child, Right::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -200,7 +195,7 @@ where
fn merge_fruits(
&self,
children: Vec<(Left::SegmentFruit, Right::SegmentFruit)>,
children: Vec<(Left::Fruit, Right::Fruit)>,
) -> Result<(Left::Fruit, Right::Fruit)> {
let mut left_fruits = vec![];
let mut right_fruits = vec![];
@@ -241,7 +236,6 @@ where
Three: Collector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit);
type Child = (One::Child, Two::Child, Three::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -255,7 +249,7 @@ where
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
}
fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![];
let mut two_fruits = vec![];
let mut three_fruits = vec![];
@@ -301,7 +295,6 @@ where
Four: Collector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit, Four::SegmentFruit);
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -319,7 +312,7 @@ where
|| self.3.requires_scoring()
}
fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![];
let mut two_fruits = vec![];
let mut three_fruits = vec![];

View File

@@ -18,7 +18,6 @@ pub struct CollectorWrapper<TCollector: Collector>(TCollector);
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
type Fruit = Box<Fruit>;
type SegmentFruit = Box<Fruit>;
type Child = Box<BoxableSegmentCollector>;
fn for_segment(
@@ -35,10 +34,10 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
}
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
let typed_fruit: Vec<TCollector::SegmentFruit> = children
let typed_fruit: Vec<TCollector::Fruit> = children
.into_iter()
.map(|untyped_fruit| {
Downcast::<TCollector::SegmentFruit>::downcast(untyped_fruit)
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
.map(|boxed_but_typed| *boxed_but_typed)
.map_err(|e| {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
@@ -97,7 +96,10 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// If the type of the collectors is known, you can just group yours collectors
/// in a tuple. See the
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
///
/// ```rust
/// #[macro_use]
@@ -153,7 +155,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>, SegmentFruit = Box<Fruit>> + 'a>>,
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
}
impl<'a> MultiCollector<'a> {
@@ -178,9 +180,7 @@ impl<'a> MultiCollector<'a> {
}
impl<'a> Collector for MultiCollector<'a> {
type Fruit = MultiFruit;
type SegmentFruit = MultiFruit;
type Child = MultiCollectorChild;
fn for_segment(

View File

@@ -40,7 +40,6 @@ impl TestFruit {
impl Collector for TestCollector {
type Fruit = TestFruit;
type SegmentFruit = Self::Fruit;
type Child = TestSegmentCollector;
fn for_segment(
@@ -110,8 +109,6 @@ impl FastFieldTestCollector {
impl Collector for FastFieldTestCollector {
type Fruit = Vec<u64>;
type SegmentFruit = Self::Fruit;
type Child = FastFieldSegmentCollector;
fn for_segment(
@@ -168,7 +165,6 @@ impl BytesFastFieldTestCollector {
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type SegmentFruit = Self::Fruit;
type Child = BytesFastFieldSegmentCollector;
fn for_segment(

View File

@@ -88,7 +88,6 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
type Fruit = Vec<(T, DocAddress)>;
type SegmentFruit = Vec<(T, DocAddress)>;
type Child = TopFieldSegmentCollector<T>;

View File

@@ -89,7 +89,6 @@ impl TopDocs {
impl Collector for TopDocs {
type Fruit = Vec<(Score, DocAddress)>;
type SegmentFruit = Vec<(Score, DocAddress)>;
type Child = TopScoreSegmentCollector;

View File

@@ -1,9 +1,6 @@
use common::serialize::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: Write>(
pub fn write<TWrite: io::Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -28,14 +25,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -43,17 +40,18 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -102,39 +100,10 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
/// Reads a range of values from the fast field.
///
/// The range of values read is from
/// `[start..start + output.len()[`
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0u64;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
}
}
#[cfg(test)]
@@ -172,17 +141,4 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
}

View File

@@ -10,7 +10,7 @@ pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt;
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use byteorder::LittleEndian as Endianness;
use std::io;

View File

@@ -1,4 +1,5 @@
use super::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian};
use std::io;
use std::io::Read;
use std::io::Write;
@@ -9,6 +10,100 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;
const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;
const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
const MASK_4: u64 = MASK_3 << 7;
const MASK_5: u64 = MASK_4 << 7;
let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
match val {
0...STOP_1 => (val | STOP_BIT, 1),
START_2...STOP_2 => (
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
),
START_3...STOP_3 => (
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
),
START_4...STOP_4 => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
),
_ => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
),
}
}
/// Returns the number of bytes covered by a
/// serialized vint `u32`.
///
/// Expects a buffer data that starts
/// by the serialized `vint`, scans at most 5 bytes ahead until
/// it finds the vint final byte.
///
/// # May Panic
/// If the payload does not start by a valid `vint`
fn vint_len(data: &[u8]) -> usize {
for i in 0..5.min(data.len()) {
if data[i] >= STOP_BIT {
return i + 1;
}
}
panic!("Corrupted data. Invalid VInt 32");
}
/// Reads a vint `u32` from a buffer, and
/// consumes its payload data.
///
/// # Panics
///
/// If the buffer does not start by a valid
/// vint payload
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
let vlen = vint_len(*data);
let mut result = 0u32;
let mut shift = 0u64;
for &b in &data[..vlen] {
result |= u32::from(b & 127u8) << shift;
shift += 7;
}
*data = &data[vlen..];
result
}
/// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let (val, num_bytes) = serialize_vint_u32(val);
let mut buffer = [0u8; 8];
LittleEndian::write_u64(&mut buffer, val);
writer.write_all(&buffer[..num_bytes])
}
impl VInt {
pub fn val(&self) -> u64 {
self.0
@@ -24,7 +119,7 @@ impl VInt {
output.extend(&buffer[0..num_bytes]);
}
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
let mut remaining = self.0;
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (remaining % 128u64) as u8;
@@ -64,7 +159,7 @@ impl BinarySerializable for VInt {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer while reading VInt",
))
));
}
}
}
@@ -74,7 +169,9 @@ impl BinarySerializable for VInt {
#[cfg(test)]
mod tests {
use super::serialize_vint_u32;
use super::VInt;
use byteorder::{ByteOrder, LittleEndian};
use common::BinarySerializable;
fn aux_test_vint(val: u64) {
@@ -108,4 +205,28 @@ mod tests {
}
aux_test_vint(10);
}
fn aux_test_serialize_vint_u32(val: u32) {
let mut buffer = [0u8; 10];
let mut buffer2 = [0u8; 10];
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let (vint, len) = serialize_vint_u32(val);
assert_eq!(len, len_vint, "len wrong for val {}", val);
LittleEndian::write_u64(&mut buffer2, vint);
assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
}
#[test]
fn test_vint_u32() {
aux_test_serialize_vint_u32(0);
aux_test_serialize_vint_u32(1);
aux_test_serialize_vint_u32(5);
for i in 1..3 {
let power_of_128 = 1u32 << (7 * i);
aux_test_serialize_vint_u32(power_of_128 - 1u32);
aux_test_serialize_vint_u32(power_of_128);
aux_test_serialize_vint_u32(power_of_128 + 1u32);
}
aux_test_serialize_vint_u32(u32::max_value());
}
}

View File

@@ -64,17 +64,18 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
results[pos] = fruit_res?;
num_items += 1;
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
// this checks ensures that we filled of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}

View File

@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
}
/// Search Index

View File

@@ -23,7 +23,7 @@ fn collect_segment<C: Collector>(
weight: &Weight,
segment_ord: u32,
segment_reader: &SegmentReader,
) -> Result<C::SegmentFruit> {
) -> Result<C::Fruit> {
let mut scorer = weight.scorer(segment_reader)?;
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
if let Some(delete_bitset) = segment_reader.delete_bitset() {

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::TantivyError;
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
@@ -64,7 +64,12 @@ impl ManagedDirectory {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {

View File

@@ -8,9 +8,42 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -33,8 +66,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -55,6 +88,12 @@ pub enum TantivyError {
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,5 +1,6 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -20,6 +21,7 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -55,11 +58,18 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
.ord_to_term(facet_ord as u64, &mut self.buffer);
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet);
facet_reader.facet_from_ord(1, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet);
facet_reader.facet_from_ord(2, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet);
facet_reader.facet_from_ord(3, &mut facet).unwrap();
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet);
facet_reader.facet_from_ord(4, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -79,11 +79,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
// ok: Item is either `u64` or `i64`
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
for (i, out) in output.iter_mut().enumerate() {
*out = self.get(start + i as u32);
}
}

View File

@@ -26,7 +26,6 @@ use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
use std::mem;
use std::mem::swap;
use std::thread;
use std::thread::JoinHandle;
use Result;
@@ -52,17 +51,19 @@ type DocumentReceiver = channel::Receiver<AddOperation>;
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
assert!(per_thread_memory_budget > 1_000);
let table_size_limit: usize = per_thread_memory_budget / 3;
(1..)
if let Some(limit) = (1..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.unwrap_or_else(|| {
panic!(
"Per thread memory is too small: {}",
per_thread_memory_budget
)
})
.min(19) // we cap it at 512K
{
limit.min(19) // we cap it at 2^19 = 512K.
} else {
unreachable!(
"Per thread memory is too small: {}",
per_thread_memory_budget
);
}
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -302,7 +303,7 @@ fn index_documents(
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
let delete_bitset_opt = if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -313,18 +314,17 @@ fn index_documents(
&doc_to_opstamps,
last_docstamp,
)?;
SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
})
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
} else {
// if there are no delete operation in the queue, no need
// to even open the segment.
SegmentEntry::new(segment_meta, delete_cursor, None)
None
};
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
Ok(segment_updater.add_segment(generation, segment_entry))
}
@@ -467,11 +467,10 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
mem::replace(&mut self.document_sender, document_sender);
mem::replace(&mut self.document_receiver, document_receiver)
}
/// Rollback to the last commit
@@ -558,17 +557,13 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
let former_workers_join_handle =
mem::replace(&mut self.workers_join_handle, Vec::new());
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
.join()
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
indexing_worker_result?;
// add a new worker for the next generation.
self.add_indexing_worker()?;

View File

@@ -1255,6 +1255,36 @@ mod tests {
}
}
use schema::INT_INDEXED;
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(index.searcher().num_docs(), 2);
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder();

View File

@@ -111,19 +111,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets {
for fake_str in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =

View File

@@ -1,6 +1,8 @@
use super::stacker::{Addr, MemoryArena, TermHashMap};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::recorder::{
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
use schema::IndexRecordOption;
@@ -213,7 +215,7 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch to the recorder information.
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
@@ -245,8 +247,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if opt_recorder.is_some() {
let mut recorder = opt_recorder.unwrap();
if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(heap);
@@ -255,7 +256,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
recorder.record_position(position, heap);
recorder
} else {
let mut recorder = Rec::new(heap);
let mut recorder = Rec::new();
recorder.new_doc(doc, heap);
recorder.record_position(position, heap);
recorder
@@ -270,10 +271,11 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
termdict_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = unsafe { termdict_heap.read(addr) };
let recorder: Rec = termdict_heap.read(addr);
serializer.new_term(&term_bytes[4..])?;
recorder.serialize(serializer, heap)?;
recorder.serialize(&mut buffer_lender, serializer, heap)?;
serializer.close_term()?;
}
Ok(())

View File

@@ -1,10 +1,51 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use common::{read_u32_vint, write_u32_vint};
use postings::FieldSerializer;
use std::{self, io};
use std::io;
use DocId;
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = std::u32::MAX;
const POSITION_END: u32 = 0;
#[derive(Default)]
pub(crate) struct BufferLender {
buffer_u8: Vec<u8>,
buffer_u32: Vec<u32>,
}
impl BufferLender {
pub fn lend_u8(&mut self) -> &mut Vec<u8> {
self.buffer_u8.clear();
&mut self.buffer_u8
}
pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
self.buffer_u8.clear();
self.buffer_u32.clear();
(&mut self.buffer_u8, &mut self.buffer_u32)
}
}
pub struct VInt32Reader<'a> {
data: &'a [u8],
}
impl<'a> VInt32Reader<'a> {
fn new(data: &'a [u8]) -> VInt32Reader<'a> {
VInt32Reader { data }
}
}
impl<'a> Iterator for VInt32Reader<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.data.is_empty() {
None
} else {
Some(read_u32_vint(&mut self.data))
}
}
}
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
@@ -15,9 +56,9 @@ const POSITION_END: u32 = std::u32::MAX;
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: Copy {
pub(crate) trait Recorder: Copy + 'static {
///
fn new(heap: &mut MemoryArena) -> Self;
fn new() -> Self;
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
@@ -29,7 +70,12 @@ pub trait Recorder: Copy {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()>;
}
/// Only records the doc ids
@@ -40,9 +86,9 @@ pub struct NothingRecorder {
}
impl Recorder for NothingRecorder {
fn new(heap: &mut MemoryArena) -> Self {
fn new() -> Self {
NothingRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(),
}
}
@@ -53,16 +99,23 @@ impl Recorder for NothingRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
for doc in self.stack.iter(heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
for doc in VInt32Reader::new(&buffer[..]) {
serializer.write_doc(doc as u32, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
@@ -77,9 +130,9 @@ pub struct TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn new(heap: &mut MemoryArena) -> Self {
fn new() -> Self {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(),
current_tf: 0u32,
}
@@ -91,7 +144,7 @@ impl Recorder for TermFrequencyRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
@@ -100,24 +153,24 @@ impl Recorder for TermFrequencyRecorder {
fn close_doc(&mut self, heap: &mut MemoryArena) {
debug_assert!(self.current_tf > 0);
self.stack.push(self.current_tf, heap);
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
self.current_tf = 0;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self
.stack
.iter(heap)
.chain(Some(self.current_tf).into_iter());
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
while let Some(doc) = u32_it.next() {
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc as u32, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
}
}
@@ -128,11 +181,10 @@ pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl Recorder for TFAndPositionRecorder {
fn new(heap: &mut MemoryArena) -> Self {
fn new() -> Self {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(),
}
}
@@ -143,33 +195,88 @@ impl Recorder for TFAndPositionRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
}
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
self.stack.push(position, heap);
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
}
fn close_doc(&mut self, heap: &mut MemoryArena) {
self.stack.push(POSITION_END, heap);
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();
for position in &mut positions_iter {
if position == POSITION_END {
break;
} else {
doc_positions.push(position - prev_position);
prev_position = position;
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(heap, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
while let Some(doc) = u32_it.next() {
let mut prev_position_plus_one = 1u32;
buffer_positions.clear();
loop {
match u32_it.next() {
Some(POSITION_END) | None => {
break;
}
Some(position_plus_one) => {
let delta_position = position_plus_one - prev_position_plus_one;
buffer_positions.push(delta_position);
prev_position_plus_one = position_plus_one;
}
}
}
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::write_u32_vint;
use super::BufferLender;
use super::VInt32Reader;
#[test]
fn test_buffer_lender() {
let mut buffer_lender = BufferLender::default();
{
let buf = buffer_lender.lend_u8();
assert!(buf.is_empty());
buf.push(1u8);
}
{
let buf = buffer_lender.lend_u8();
assert!(buf.is_empty());
buf.push(1u8);
}
{
let (_, buf) = buffer_lender.lend_all();
assert!(buf.is_empty());
buf.push(1u32);
}
{
let (_, buf) = buffer_lender.lend_all();
assert!(buf.is_empty());
buf.push(1u32);
}
}
#[test]
fn test_vint_u32() {
let mut buffer = vec![];
let vals = [0, 1, 324_234_234, u32::max_value()];
for &i in &vals {
assert!(write_u32_vint(i, &mut buffer).is_ok());
}
assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
assert_eq!(&res[..], &vals[..]);
}
}

View File

@@ -126,7 +126,6 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -622,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -635,6 +634,7 @@ mod tests {
use schema::Term;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -662,6 +662,16 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -693,7 +703,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -707,14 +717,44 @@ mod tests {
}
}
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for doc in docs {
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -734,7 +774,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -744,7 +784,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -757,7 +797,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(docs.clone());
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

View File

@@ -1,28 +1,37 @@
use super::{Addr, MemoryArena};
use common::is_power_of_2;
use postings::stacker::memory_arena::load;
use postings::stacker::memory_arena::store;
use std::io;
use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: usize = 16;
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
const FIRST_BLOCK: u32 = 4u32;
enum CapacityResult {
Available(u32),
NeedAlloc(u32),
}
#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
fn len_to_capacity(len: u32) -> CapacityResult {
match len {
0...3 => None,
4...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) {
Some(len as usize)
0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
16...MAX_BLOCK_LEN => {
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
let available = cap - len;
if available == 0 {
CapacityResult::NeedAlloc(len)
} else {
None
CapacityResult::Available(available)
}
}
n => {
if n % MAX_BLOCK_LEN == 0 {
Some(MAX_BLOCK_LEN as usize)
let available = n % MAX_BLOCK_LEN;
if available == 0 {
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
} else {
None
CapacityResult::Available(MAX_BLOCK_LEN - available)
}
}
}
@@ -52,82 +61,119 @@ pub fn jump_needed(len: u32) -> Option<usize> {
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
len: u32,
head: Addr,
tail: Addr,
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
}
pub struct ExpUnrolledLinkedListWriter<'a> {
eull: &'a mut ExpUnrolledLinkedList,
heap: &'a mut MemoryArena,
}
fn ensure_capacity<'a>(
eull: &'a mut ExpUnrolledLinkedList,
heap: &'a mut MemoryArena,
) -> &'a mut [u8] {
if eull.len <= FIRST_BLOCK as u32 {
// We are still hitting the inline block.
if eull.len < FIRST_BLOCK as u32 {
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
}
// We need to allocate a new block!
let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
eull.tail = new_block_addr;
return heap.slice_mut(eull.tail, FIRST_BLOCK);
}
let len = match len_to_capacity(eull.len) {
CapacityResult::NeedAlloc(new_block_len) => {
let new_block_addr: Addr =
heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
heap.write_at(eull.tail, new_block_addr);
eull.tail = new_block_addr;
new_block_len
}
CapacityResult::Available(available) => available,
};
heap.slice_mut(eull.tail, len as usize)
}
impl<'a> ExpUnrolledLinkedListWriter<'a> {
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
if buf.is_empty() {
// we need to cut early, because `ensure_capacity`
// allocates if there is no capacity at all right now.
return;
}
while !buf.is_empty() {
let add_len: usize;
{
let output_buf = ensure_capacity(self.eull, self.heap);
add_len = buf.len().min(output_buf.len());
output_buf[..add_len].copy_from_slice(&buf[..add_len]);
}
self.eull.len += add_len as u32;
self.eull.tail = self.eull.tail.offset(add_len as u32);
buf = &buf[add_len..];
}
}
}
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// There is no use case to only write the capacity.
// This is not IO after all, so we write the whole
// buffer even if the contract of `.write` is looser.
self.extend_from_slice(buf);
Ok(buf.len())
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.extend_from_slice(buf);
Ok(())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
pub fn new() -> ExpUnrolledLinkedList {
ExpUnrolledLinkedList {
len: 0u32,
head: addr,
tail: addr,
tail: Addr::null_pointer(),
inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
}
}
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: self.head,
len: self.len,
consumed: 0,
}
#[inline(always)]
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
ExpUnrolledLinkedListWriter { eull: self, heap }
}
/// Appends a new element to the current stack.
///
/// If the current block end is reached, a new block is allocated.
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
self.len += 1;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
let len = self.len as usize;
if len <= FIRST_BLOCK {
output.extend_from_slice(&self.inlined_data[..len]);
return;
}
unsafe {
// logic
heap.write(self.tail, val);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a MemoryArena,
addr: Addr,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
self.consumed += 1;
let addr: Addr = if jump_needed(self.consumed).is_some() {
unsafe {
// logic
self.heap.read(self.addr)
}
} else {
self.addr
};
self.addr = addr.offset(mem::size_of::<u32>() as u32);
Some(unsafe {
// logic
self.heap.read(addr)
})
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
let mut cur = FIRST_BLOCK;
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
loop {
let cap = match len_to_capacity(cur as u32) {
CapacityResult::Available(capacity) => capacity,
CapacityResult::NeedAlloc(capacity) => capacity,
} as usize;
let data = heap.slice(addr, cap);
if cur + cap >= len {
output.extend_from_slice(&data[..(len - cur)]);
return;
}
output.extend_from_slice(data);
cur += cap;
addr = heap.read(addr.offset(cap as u32));
}
}
}
@@ -136,39 +182,126 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
mod tests {
use super::super::MemoryArena;
use super::jump_needed;
use super::len_to_capacity;
use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test]
fn test_stack() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stack.push(1u32, &mut heap);
stack.push(2u32, &mut heap);
stack.push(4u32, &mut heap);
stack.push(8u32, &mut heap);
let mut stack = ExpUnrolledLinkedList::new();
stack.writer(&mut heap).extend_from_slice(&[1u8]);
stack.writer(&mut heap).extend_from_slice(&[2u8]);
stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
stack.writer(&mut heap).extend_from_slice(&[5u8]);
{
let mut it = stack.iter(&heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
}
}
#[test]
fn test_jump_if_needed() {
let mut block_len = 4u32;
let mut i = 0;
while i < 10_000_000 {
assert!(jump_needed(i + block_len - 1).is_none());
assert!(jump_needed(i + block_len + 1).is_none());
assert!(jump_needed(i + block_len).is_some());
let new_block_len = jump_needed(i + block_len).unwrap();
i += block_len;
block_len = new_block_len as u32;
fn test_stack_long() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let source: Vec<u32> = (0..100).collect();
for &el in &source {
assert!(stack
.writer(&mut heap)
.write_u32::<LittleEndian>(el)
.is_ok());
}
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
let mut result = vec![];
let mut remaining = &buffer[..];
while !remaining.is_empty() {
result.push(LittleEndian::read_u32(&remaining[..4]));
remaining = &remaining[4..];
}
assert_eq!(&result[..], &source[..]);
}
#[test]
fn test_stack_interlaced() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let mut stack2 = ExpUnrolledLinkedList::new();
let mut vec1: Vec<u8> = vec![];
let mut vec2: Vec<u8> = vec![];
for i in 0..9 {
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
if i % 2 == 0 {
assert!(stack2
.writer(&mut heap)
.write_u32::<LittleEndian>(i)
.is_ok());
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
}
}
let mut res1 = vec![];
let mut res2 = vec![];
stack.read_to_end(&heap, &mut res1);
stack2.read_to_end(&heap, &mut res2);
assert_eq!(&vec1[..], &res1[..]);
assert_eq!(&vec2[..], &res2[..]);
}
#[test]
fn test_jump_if_needed() {
let mut available = 16u32;
for i in 0..10_000_000 {
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
available = cap;
}
CapacityResult::Available(cap) => {
assert_eq!(
available, cap,
"Failed len={}: Expected {} Got {}",
i, available, cap
);
}
}
available -= 1;
}
}
#[test]
fn test_jump_if_needed_progression() {
let mut v = vec![];
for i in 0.. {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
_ => {}
}
}
assert_eq!(
&v[..],
&[
(16, 16),
(32, 32),
(64, 64),
(128, 128),
(256, 256),
(512, 512),
(1024, 1024),
(2048, 2048),
(4096, 4096),
(8192, 8192)
]
);
}
}
@@ -176,6 +309,7 @@ mod tests {
mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList;
use byteorder::{NativeEndian, WriteBytesExt};
use test::Bencher;
const NUM_STACK: usize = 10_000;
@@ -203,13 +337,13 @@ mod bench {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
let mut stack = ExpUnrolledLinkedList::new();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &mut heap);
let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
}
}
});

View File

@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// page of memory.
///
/// The last 20 bits are an address within this page of memory.
#[derive(Clone, Copy, Debug)]
#[derive(Copy, Clone, Debug)]
pub struct Addr(u32);
impl Addr {
@@ -69,32 +69,16 @@ impl Addr {
}
}
/// Trait required for an object to be `storable`.
///
/// # Warning
///
/// Most of the time you should not implement this trait,
/// and only use the `MemoryArena` with object implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
fn num_bytes(&self) -> usize;
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
assert_eq!(dest.len(), std::mem::size_of::<Item>());
unsafe {
ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
}
}
impl<V> ArenaStorable for V
where
V: Copy,
{
fn num_bytes(&self) -> usize {
mem::size_of::<V>()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
ptr::write_unaligned(dst_ptr, self);
}
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
assert_eq!(data.len(), std::mem::size_of::<Item>());
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
}
/// The `MemoryArena`
@@ -126,47 +110,9 @@ impl MemoryArena {
self.pages.len() * PAGE_SIZE
}
/// Writes a slice at the given address, assuming the
/// memory was allocated beforehands.
///
/// # Panics
///
/// May panic or corrupt the heap if he space was not
/// properly allocated beforehands.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
let bytes = data.as_ref();
self.pages[addr.page_id()]
.get_mut_slice(addr.page_local_addr(), bytes.len())
.copy_from_slice(bytes);
}
/// Returns the `len` bytes starting at `addr`
///
/// # Panics
///
/// Panics if the memory has not been allocated beforehands.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
}
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
}
/// Stores an item's data in the heap
///
/// It allocates the `Item` beforehands.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
let num_bytes = val.num_bytes();
let addr = self.allocate_space(num_bytes);
unsafe {
self.write(addr, val);
};
addr
}
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
store(dest, val);
}
/// Read an item in the heap at the given `address`.
@@ -174,9 +120,21 @@ impl MemoryArena {
/// # Panics
///
/// If the address is erroneous
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
ptr::read_unaligned(ptr as *const Item)
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
load(self.slice(addr, mem::size_of::<Item>()))
}
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
}
pub fn slice_from(&self, addr: Addr) -> &[u8] {
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
}
#[inline(always)]
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
}
/// Allocates `len` bytes and returns the allocated address.
@@ -197,14 +155,10 @@ struct Page {
impl Page {
fn new(page_id: usize) -> Page {
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
unsafe {
data.set_len(PAGE_SIZE);
} // avoid initializing page
Page {
page_id,
len: 0,
data: data.into_boxed_slice(),
data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
}
}
@@ -213,12 +167,16 @@ impl Page {
len + self.len <= PAGE_SIZE
}
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.slice_from(local_addr)[..len]
}
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.data[local_addr..][..len]
fn slice_from(&self, local_addr: usize) -> &[u8] {
&self.data[local_addr..]
}
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
}
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
@@ -230,16 +188,6 @@ impl Page {
None
}
}
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().add(addr)
}
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().add(addr)
}
}
#[cfg(test)]
@@ -254,13 +202,13 @@ mod tests {
let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len());
arena.write_bytes(addr_a, a);
arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
let addr_b = arena.allocate_space(b.len());
arena.write_bytes(addr_b, b);
arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
assert_eq!(arena.read_slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b);
assert_eq!(arena.slice(addr_a, a.len()), a);
assert_eq!(arena.slice(addr_b, b.len()), b);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -283,9 +231,15 @@ mod tests {
b: 221,
c: 12,
};
let addr_a = arena.store(a);
let addr_b = arena.store(b);
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
let num_bytes = std::mem::size_of::<MyTest>();
let addr_a = arena.allocate_space(num_bytes);
arena.write_at(addr_a, a);
let addr_b = arena.allocate_space(num_bytes);
arena.write_at(addr_b, b);
assert_eq!(arena.read::<MyTest>(addr_a), a);
assert_eq!(arena.read::<MyTest>(addr_b), b);
}
}

View File

@@ -1,9 +1,7 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::memory_arena::{Addr, MemoryArena};
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,87 +0,0 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,37 +1,16 @@
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
use byteorder::{ByteOrder, NativeEndian};
use postings::stacker::memory_arena::store;
use std::iter;
use std::mem;
use std::slice;
pub type BucketId = usize;
struct KeyBytesValue<'a, V> {
key: &'a [u8],
value: V,
}
impl<'a, V> KeyBytesValue<'a, V> {
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
KeyBytesValue { key, value }
}
}
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
where
V: ArenaStorable,
{
fn num_bytes(&self) -> usize {
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
arena.write(addr, self.key.len() as u16);
arena.write_bytes(addr.offset(2), self.key);
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
}
}
/// Returns the actual memory size in bytes
/// required to create a table of size $2^num_bits$.
pub fn compute_table_size(num_bits: usize) -> usize {
@@ -111,8 +90,7 @@ impl<'a> Iterator for Iter<'a> {
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) =
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, bucket as BucketId)
})
}
@@ -143,12 +121,22 @@ impl TermHashMap {
self.table.len() < self.occupied.len() * 3
}
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
let key_addr = addr.offset(2u32);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
(key_bytes, val_addr)
#[inline(always)]
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.heap.slice_from(addr);
let key_bytes_len = NativeEndian::read_u16(data) as usize;
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
}
#[inline(always)]
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr);
if stored_key == target_key {
Some(value_addr)
} else {
None
}
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
@@ -199,36 +187,39 @@ impl TermHashMap {
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
where
S: AsRef<[u8]>,
V: Copy,
V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V,
{
if self.is_saturated() {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let val = updater(None);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
let num_bytes =
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
let key_addr = self.heap.allocate_space(num_bytes);
{
let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val);
}
self.set_bucket(hash, key_addr, bucket);
return bucket as BucketId;
} else if kv.hash == hash {
let (key_matches, val_addr) = {
let (stored_key, val_addr): (&[u8], Addr) =
unsafe { self.get_key_value(kv.key_value_addr) };
(stored_key == key_bytes, val_addr)
};
if key_matches {
unsafe {
// logic
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
if let Some(val_addr) =
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
{
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write_at(val_addr, new_v);
return bucket as BucketId;
}
}
@@ -236,24 +227,6 @@ impl TermHashMap {
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
@@ -285,10 +258,7 @@ mod tests {
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = unsafe {
// test
hash_map.heap.read(addr)
};
let val: u32 = hash_map.heap.read(addr);
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);

View File

@@ -6,6 +6,7 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\';
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represent a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(Vec<u8>);
pub struct Facet(String);
impl Facet {
/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
Facet(vec![])
Facet("".to_string())
}
/// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool {
self.encoded_bytes().is_empty()
self.encoded_str().is_empty()
}
/// Returns a binary representation of the facet.
@@ -49,13 +54,19 @@ impl Facet {
/// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over
/// the term ordinals.
pub fn encoded_bytes(&self) -> &[u8] {
pub fn encoded_str(&self) -> &str {
&self.0
}
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation.
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
Facet(encoded_bytes)
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
}
/// Parse a text representation of a facet.
@@ -79,36 +90,37 @@ impl Facet {
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut facet_string: String = String::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push_str(&step.to_string());
}
for step in step_it {
facet_bytes.push(FACET_SEP_BYTE);
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push(FACET_SEP_CHAR);
facet_string.push_str(&step.to_string());
}
Facet(facet_bytes)
Facet(facet_string)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
}
/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_bytes: &[u8] = self.encoded_bytes();
let other_bytes: &[u8] = other.encoded_bytes();
self_bytes.len() < other_bytes.len()
&& other_bytes.starts_with(self_bytes)
&& other_bytes[self_bytes.len()] == 0u8
let self_str = self.encoded_str();
let other_str = other.encoded_str();
self_str.len() < other_str.len()
&& other_str.starts_with(self_str)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
}
}
impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &[u8] {
self.encoded_bytes()
impl Borrow<str> for Facet {
fn borrow(&self) -> &str {
self.encoded_str()
}
}
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle,
}
let path: &str = path_asref.as_ref();
let mut facet_encoded = Vec::new();
assert!(!path.is_empty());
assert!(path.starts_with('/'));
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
for &c in &path_bytes[1..] {
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push(FACET_SEP_BYTE);
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, any_char) => {
(State::Escaped, _escaped_char) => {
state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
}
}
impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
<String as BinarySerializable>::serialize(&self.0, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
}
}
impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
for step in self.0.split(FACET_SEP_CHAR) {
write!(f, "/")?;
let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
write!(f, "{}", escape_slashes(step))?;
}
Ok(())
}

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_bytes();
let bytes = facet.encoded_str().as_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
@@ -68,12 +68,7 @@ impl Term {
term
}
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);

View File

@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
}
assert_eq!(&*term_string, "abcdef");
}

View File

@@ -1,4 +1,4 @@
use byteorder::ByteOrder;
use byteorder::{ByteOrder, LittleEndian};
use common::bitpacker::BitPacker;
use common::compute_num_bits;
use common::Endianness;
@@ -7,7 +7,6 @@ use directory::ReadOnlySource;
use postings::TermInfo;
use std::cmp;
use std::io::{self, Read, Write};
use std::ptr;
use termdict::TermOrdinal;
const BLOCK_LEN: usize = 256;
@@ -88,13 +87,17 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
assert!(num_bits <= 56);
let addr_byte = addr_bits / 8;
let bit_shift = (addr_bits % 8) as u64;
assert!(data.len() >= addr_byte + 7);
let val_unshifted_unmasked: u64 = unsafe {
// ok because the pointer is only accessed using `ptr::read_unaligned`
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let addr = data.as_ptr().add(addr_byte) as *const u64;
// ok thanks to the 7 byte padding
ptr::read_unaligned(addr)
let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
LittleEndian::read_u64(&data[addr_byte..][..8])
} else {
// the buffer is not large enough.
// Let's copy the few remaining bytes to a 8 byte buffer
// padded with 0s.
let mut buf = [0u8; 8];
let data_to_copy = &data[addr_byte..];
let nbytes = data_to_copy.len();
buf[..nbytes].copy_from_slice(data_to_copy);
LittleEndian::read_u64(&buf)
};
let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
let mask = (1u64 << u64::from(num_bits)) - 1;
@@ -246,7 +249,6 @@ impl TermInfoStoreWriter {
self.num_terms.serialize(write)?;
write.write_all(&self.buffer_block_metas)?;
write.write_all(&self.buffer_term_infos)?;
write.write_all(&[0u8; 7])?;
Ok(())
}
}

View File

@@ -1,6 +1,5 @@
use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` process a `Facet` binary representation
/// and emits a token for all of its parent.
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
}
@@ -86,7 +84,6 @@ mod tests {
use super::FacetTokenizer;
use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer};
#[test]
@@ -95,11 +92,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -115,11 +112,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.token_stream(facet.encoded_str()) // ok test
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);