mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
Compare commits
20 Commits
segment_fr
...
bug/merge-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fbe398dfa2 | ||
|
|
0e8fcd5727 | ||
|
|
f745c83bb7 | ||
|
|
ffb16d9103 | ||
|
|
98ca703daa | ||
|
|
b9d25cda5d | ||
|
|
beb4289ec2 | ||
|
|
bdd72e4683 | ||
|
|
45c3cd19be | ||
|
|
b8241c5603 | ||
|
|
a4745151c0 | ||
|
|
e2ce326a8c | ||
|
|
bb21d12a70 | ||
|
|
4565aba62a | ||
|
|
545a7ec8dd | ||
|
|
e68775d71c | ||
|
|
dcc92d287e | ||
|
|
b48f81c051 | ||
|
|
a3042e956b | ||
|
|
1fa10f0a0b |
@@ -1,4 +1,4 @@
|
||||
Tantivy 0.8.1
|
||||
Tantivy 0.8.0
|
||||
=====================
|
||||
*No change in the index format*
|
||||
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.8.0-dev"
|
||||
version = "0.8.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -29,7 +29,7 @@ serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.7"
|
||||
itertools = "0.8"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
bit-set = "0.5"
|
||||
uuid = { version = "0.7", features = ["v4", "serde"] }
|
||||
@@ -49,6 +49,7 @@ failure = "0.1"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.2"
|
||||
scoped-pool = "1.0"
|
||||
murmurhash32 = "0.2"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
@@ -76,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run :
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
git clone https://github.com/tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
|
||||
|
||||
@@ -70,8 +70,6 @@ impl Collector for StatsCollector {
|
||||
// Our standard deviation will be a float.
|
||||
type Fruit = Option<Stats>;
|
||||
|
||||
type SegmentFruit = Self::Fruit;
|
||||
|
||||
type Child = StatsSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
|
||||
@@ -58,7 +58,6 @@ pub struct Count;
|
||||
|
||||
impl Collector for Count {
|
||||
type Fruit = usize;
|
||||
type SegmentFruit = usize;
|
||||
|
||||
type Child = SegmentCountCollector;
|
||||
|
||||
|
||||
@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
) -> SkipResult {
|
||||
loop {
|
||||
match collapse_it.peek() {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
@@ -258,8 +258,6 @@ impl FacetCollector {
|
||||
impl Collector for FacetCollector {
|
||||
type Fruit = FacetCounts;
|
||||
|
||||
type SegmentFruit = FacetCounts;
|
||||
|
||||
type Child = FacetSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
@@ -371,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
|
||||
let mut facet = vec![];
|
||||
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
|
||||
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
|
||||
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
|
||||
// TODO
|
||||
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
@@ -405,9 +404,9 @@ impl FacetCounts {
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
|
||||
facet_after_bytes.push('\u{1}');
|
||||
let facet_after = Facet::from_encoded_string(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
|
||||
@@ -136,10 +136,8 @@ pub trait Collector: Sync {
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
type Fruit: Fruit;
|
||||
|
||||
type SegmentFruit: Fruit;
|
||||
|
||||
/// Type of the `SegmentCollector` associated to this collector.
|
||||
type Child: SegmentCollector<Fruit = Self::SegmentFruit>;
|
||||
type Child: SegmentCollector<Fruit = Self::Fruit>;
|
||||
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
@@ -154,7 +152,7 @@ pub trait Collector: Sync {
|
||||
|
||||
/// Combines the fruit associated to the collection of each segments
|
||||
/// into one fruit.
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Self::SegmentFruit>) -> Result<Self::Fruit>;
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
|
||||
}
|
||||
|
||||
/// The `SegmentCollector` is the trait in charge of defining the
|
||||
@@ -183,9 +181,6 @@ where
|
||||
Right: Collector,
|
||||
{
|
||||
type Fruit = (Left::Fruit, Right::Fruit);
|
||||
|
||||
type SegmentFruit = (Left::SegmentFruit, Right::SegmentFruit);
|
||||
|
||||
type Child = (Left::Child, Right::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
@@ -200,7 +195,7 @@ where
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
children: Vec<(Left::SegmentFruit, Right::SegmentFruit)>,
|
||||
children: Vec<(Left::Fruit, Right::Fruit)>,
|
||||
) -> Result<(Left::Fruit, Right::Fruit)> {
|
||||
let mut left_fruits = vec![];
|
||||
let mut right_fruits = vec![];
|
||||
@@ -241,7 +236,6 @@ where
|
||||
Three: Collector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit);
|
||||
type Child = (One::Child, Two::Child, Three::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
@@ -255,7 +249,7 @@ where
|
||||
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
|
||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||
let mut one_fruits = vec![];
|
||||
let mut two_fruits = vec![];
|
||||
let mut three_fruits = vec![];
|
||||
@@ -301,7 +295,6 @@ where
|
||||
Four: Collector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit, Four::SegmentFruit);
|
||||
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
@@ -319,7 +312,7 @@ where
|
||||
|| self.3.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
|
||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||
let mut one_fruits = vec![];
|
||||
let mut two_fruits = vec![];
|
||||
let mut three_fruits = vec![];
|
||||
|
||||
@@ -18,7 +18,6 @@ pub struct CollectorWrapper<TCollector: Collector>(TCollector);
|
||||
|
||||
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
type Fruit = Box<Fruit>;
|
||||
type SegmentFruit = Box<Fruit>;
|
||||
type Child = Box<BoxableSegmentCollector>;
|
||||
|
||||
fn for_segment(
|
||||
@@ -35,10 +34,10 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
|
||||
let typed_fruit: Vec<TCollector::SegmentFruit> = children
|
||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||
.into_iter()
|
||||
.map(|untyped_fruit| {
|
||||
Downcast::<TCollector::SegmentFruit>::downcast(untyped_fruit)
|
||||
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
|
||||
.map(|boxed_but_typed| *boxed_but_typed)
|
||||
.map_err(|e| {
|
||||
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
|
||||
@@ -97,7 +96,10 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
///
|
||||
/// If the type of the collectors is known, you can just group yours collectors
|
||||
/// in a tuple. See the
|
||||
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
@@ -153,7 +155,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
#[derive(Default)]
|
||||
pub struct MultiCollector<'a> {
|
||||
collector_wrappers:
|
||||
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>, SegmentFruit = Box<Fruit>> + 'a>>,
|
||||
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
@@ -178,9 +180,7 @@ impl<'a> MultiCollector<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
|
||||
type Fruit = MultiFruit;
|
||||
type SegmentFruit = MultiFruit;
|
||||
type Child = MultiCollectorChild;
|
||||
|
||||
fn for_segment(
|
||||
|
||||
@@ -40,7 +40,6 @@ impl TestFruit {
|
||||
|
||||
impl Collector for TestCollector {
|
||||
type Fruit = TestFruit;
|
||||
type SegmentFruit = Self::Fruit;
|
||||
type Child = TestSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
@@ -110,8 +109,6 @@ impl FastFieldTestCollector {
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
type SegmentFruit = Self::Fruit;
|
||||
|
||||
type Child = FastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
@@ -168,7 +165,6 @@ impl BytesFastFieldTestCollector {
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
type SegmentFruit = Self::Fruit;
|
||||
type Child = BytesFastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
|
||||
@@ -88,7 +88,6 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
|
||||
type Fruit = Vec<(T, DocAddress)>;
|
||||
type SegmentFruit = Vec<(T, DocAddress)>;
|
||||
|
||||
type Child = TopFieldSegmentCollector<T>;
|
||||
|
||||
|
||||
@@ -89,7 +89,6 @@ impl TopDocs {
|
||||
|
||||
impl Collector for TopDocs {
|
||||
type Fruit = Vec<(Score, DocAddress)>;
|
||||
type SegmentFruit = Vec<(Score, DocAddress)>;
|
||||
|
||||
type Child = TopScoreSegmentCollector;
|
||||
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use common::serialize::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
|
||||
pub(crate) struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
@@ -18,7 +15,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<TWrite: Write>(
|
||||
pub fn write<TWrite: io::Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
num_bits: u8,
|
||||
@@ -28,14 +25,14 @@ impl BitPacker {
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
self.mini_buffer.serialize(output)?;
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
self.mini_buffer.serialize(output)?;
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
@@ -43,17 +40,18 @@ impl BitPacker {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
let mut arr: [u8; 8] = [0u8; 8];
|
||||
LittleEndian::write_u64(&mut arr, self.mini_buffer);
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
self.flush(output)?;
|
||||
// Padding the write file to simplify reads.
|
||||
output.write_all(&[0u8; 7])?;
|
||||
@@ -102,39 +100,10 @@ where
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
///
|
||||
/// The range of values read is from
|
||||
/// `[start..start + output.len()[`
|
||||
pub fn get_range(&self, start: u32, output: &mut [u64]) {
|
||||
if self.num_bits == 0 {
|
||||
for val in output.iter_mut() {
|
||||
*val = 0u64;
|
||||
}
|
||||
} else {
|
||||
let data: &[u8] = &*self.data;
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let mut addr_in_bits = (start as usize) * num_bits;
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -172,17 +141,4 @@ mod test {
|
||||
test_bitpacker_util(6, 14);
|
||||
test_bitpacker_util(1000, 14);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitpacker_range() {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
|
||||
let buffer_len = 100;
|
||||
let mut buffer = vec![0u64; buffer_len];
|
||||
for start in vec![0, 10, 20, 100, 1_000] {
|
||||
bitunpacker.get_range(start as u32, &mut buffer[..]);
|
||||
for i in 0..buffer_len {
|
||||
assert_eq!(buffer[i], vals[start + i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::VInt;
|
||||
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
@@ -9,6 +10,100 @@ pub struct VInt(pub u64);
|
||||
|
||||
const STOP_BIT: u8 = 128;
|
||||
|
||||
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
|
||||
const START_2: u64 = 1 << 7;
|
||||
const START_3: u64 = 1 << 14;
|
||||
const START_4: u64 = 1 << 21;
|
||||
const START_5: u64 = 1 << 28;
|
||||
|
||||
const STOP_1: u64 = START_2 - 1;
|
||||
const STOP_2: u64 = START_3 - 1;
|
||||
const STOP_3: u64 = START_4 - 1;
|
||||
const STOP_4: u64 = START_5 - 1;
|
||||
|
||||
const MASK_1: u64 = 127;
|
||||
const MASK_2: u64 = MASK_1 << 7;
|
||||
const MASK_3: u64 = MASK_2 << 7;
|
||||
const MASK_4: u64 = MASK_3 << 7;
|
||||
const MASK_5: u64 = MASK_4 << 7;
|
||||
|
||||
let val = u64::from(val);
|
||||
const STOP_BIT: u64 = 128u64;
|
||||
match val {
|
||||
0...STOP_1 => (val | STOP_BIT, 1),
|
||||
START_2...STOP_2 => (
|
||||
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
|
||||
2,
|
||||
),
|
||||
START_3...STOP_3 => (
|
||||
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
|
||||
3,
|
||||
),
|
||||
START_4...STOP_4 => (
|
||||
(val & MASK_1)
|
||||
| ((val & MASK_2) << 1)
|
||||
| ((val & MASK_3) << 2)
|
||||
| ((val & MASK_4) << 3)
|
||||
| (STOP_BIT << (8 * 3)),
|
||||
4,
|
||||
),
|
||||
_ => (
|
||||
(val & MASK_1)
|
||||
| ((val & MASK_2) << 1)
|
||||
| ((val & MASK_3) << 2)
|
||||
| ((val & MASK_4) << 3)
|
||||
| ((val & MASK_5) << 4)
|
||||
| (STOP_BIT << (8 * 4)),
|
||||
5,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of bytes covered by a
|
||||
/// serialized vint `u32`.
|
||||
///
|
||||
/// Expects a buffer data that starts
|
||||
/// by the serialized `vint`, scans at most 5 bytes ahead until
|
||||
/// it finds the vint final byte.
|
||||
///
|
||||
/// # May Panic
|
||||
/// If the payload does not start by a valid `vint`
|
||||
fn vint_len(data: &[u8]) -> usize {
|
||||
for i in 0..5.min(data.len()) {
|
||||
if data[i] >= STOP_BIT {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
panic!("Corrupted data. Invalid VInt 32");
|
||||
}
|
||||
|
||||
/// Reads a vint `u32` from a buffer, and
|
||||
/// consumes its payload data.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the buffer does not start by a valid
|
||||
/// vint payload
|
||||
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
|
||||
let vlen = vint_len(*data);
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u64;
|
||||
for &b in &data[..vlen] {
|
||||
result |= u32::from(b & 127u8) << shift;
|
||||
shift += 7;
|
||||
}
|
||||
*data = &data[vlen..];
|
||||
result
|
||||
}
|
||||
|
||||
/// Write a `u32` as a vint payload.
|
||||
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
|
||||
let (val, num_bytes) = serialize_vint_u32(val);
|
||||
let mut buffer = [0u8; 8];
|
||||
LittleEndian::write_u64(&mut buffer, val);
|
||||
writer.write_all(&buffer[..num_bytes])
|
||||
}
|
||||
|
||||
impl VInt {
|
||||
pub fn val(&self) -> u64 {
|
||||
self.0
|
||||
@@ -24,7 +119,7 @@ impl VInt {
|
||||
output.extend(&buffer[0..num_bytes]);
|
||||
}
|
||||
|
||||
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
||||
pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
||||
let mut remaining = self.0;
|
||||
for (i, b) in buffer.iter_mut().enumerate() {
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
@@ -64,7 +159,7 @@ impl BinarySerializable for VInt {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Reach end of buffer while reading VInt",
|
||||
))
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,7 +169,9 @@ impl BinarySerializable for VInt {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::VInt;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use common::BinarySerializable;
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
@@ -108,4 +205,28 @@ mod tests {
|
||||
}
|
||||
aux_test_vint(10);
|
||||
}
|
||||
|
||||
fn aux_test_serialize_vint_u32(val: u32) {
|
||||
let mut buffer = [0u8; 10];
|
||||
let mut buffer2 = [0u8; 10];
|
||||
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
|
||||
let (vint, len) = serialize_vint_u32(val);
|
||||
assert_eq!(len, len_vint, "len wrong for val {}", val);
|
||||
LittleEndian::write_u64(&mut buffer2, vint);
|
||||
assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vint_u32() {
|
||||
aux_test_serialize_vint_u32(0);
|
||||
aux_test_serialize_vint_u32(1);
|
||||
aux_test_serialize_vint_u32(5);
|
||||
for i in 1..3 {
|
||||
let power_of_128 = 1u32 << (7 * i);
|
||||
aux_test_serialize_vint_u32(power_of_128 - 1u32);
|
||||
aux_test_serialize_vint_u32(power_of_128);
|
||||
aux_test_serialize_vint_u32(power_of_128 + 1u32);
|
||||
}
|
||||
aux_test_serialize_vint_u32(u32::max_value());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,17 +64,18 @@ impl Executor {
|
||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||
// terminate.
|
||||
};
|
||||
let mut results = Vec::with_capacity(num_fruits);
|
||||
unsafe { results.set_len(num_fruits) };
|
||||
let mut num_items = 0;
|
||||
// This is lame, but safe.
|
||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||
for (pos, fruit_res) in fruit_receiver {
|
||||
results[pos] = fruit_res?;
|
||||
num_items += 1;
|
||||
let fruit = fruit_res?;
|
||||
results_with_position.push((pos, fruit));
|
||||
}
|
||||
// this checks ensures that we filled of this
|
||||
// uninitialized memory.
|
||||
assert_eq!(num_items, results.len());
|
||||
Ok(results)
|
||||
results_with_position.sort_by_key(|(pos, _)| *pos);
|
||||
assert_eq!(results_with_position.len(), num_fruits);
|
||||
Ok(results_with_position
|
||||
.into_iter()
|
||||
.map(|(_, fruit)| fruit)
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use error::DataCorruption;
|
||||
use error::TantivyError;
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
serde_json::from_str(&meta_string)
|
||||
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.clone(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})
|
||||
.map_err(From::from)
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
|
||||
@@ -23,7 +23,7 @@ fn collect_segment<C: Collector>(
|
||||
weight: &Weight,
|
||||
segment_ord: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<C::SegmentFruit> {
|
||||
) -> Result<C::Fruit> {
|
||||
let mut scorer = weight.scorer(segment_reader)?;
|
||||
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
|
||||
if let Some(delete_bitset) = segment_reader.delete_bitset() {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::MANAGED_FILEPATH;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use error::TantivyError;
|
||||
use error::DataCorruption;
|
||||
use indexer::LockType;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
@@ -64,7 +64,12 @@ impl ManagedDirectory {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
MANAGED_FILEPATH.clone(),
|
||||
format!("Managed file cannot be deserialized: {:?}. ", e),
|
||||
)
|
||||
})?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
|
||||
43
src/error.rs
43
src/error.rs
@@ -8,9 +8,42 @@ use indexer::LockType;
|
||||
use query;
|
||||
use schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
pub struct DataCorruption {
|
||||
filepath: Option<PathBuf>,
|
||||
comment: String,
|
||||
}
|
||||
|
||||
impl DataCorruption {
|
||||
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
|
||||
DataCorruption {
|
||||
filepath: Some(filepath),
|
||||
comment,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn comment_only(comment: String) -> DataCorruption {
|
||||
DataCorruption {
|
||||
filepath: None,
|
||||
comment,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DataCorruption {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
write!(f, "Data corruption: ")?;
|
||||
if let Some(ref filepath) = &self.filepath {
|
||||
write!(f, "(in file `{:?}`)", filepath)?;
|
||||
}
|
||||
write!(f, ": {}.", self.comment)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// The library's failure based error enum
|
||||
#[derive(Debug, Fail)]
|
||||
pub enum TantivyError {
|
||||
@@ -33,8 +66,8 @@ pub enum TantivyError {
|
||||
#[fail(display = "An IO error occurred: '{}'", _0)]
|
||||
IOError(#[cause] IOError),
|
||||
/// Data corruption.
|
||||
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
|
||||
CorruptedFile(PathBuf),
|
||||
#[fail(display = "{:?}", _0)]
|
||||
DataCorruption(DataCorruption),
|
||||
/// A thread holding the locked panicked and poisoned the lock.
|
||||
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
|
||||
Poisoned,
|
||||
@@ -55,6 +88,12 @@ pub enum TantivyError {
|
||||
SystemError(String),
|
||||
}
|
||||
|
||||
impl From<DataCorruption> for TantivyError {
|
||||
fn from(data_corruption: DataCorruption) -> TantivyError {
|
||||
TantivyError::DataCorruption(data_corruption)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FastFieldNotAvailableError> for TantivyError {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
|
||||
TantivyError::FastFieldError(fastfield_error)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use schema::Facet;
|
||||
use std::str;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
@@ -20,6 +21,7 @@ use DocId;
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionary,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
@@ -37,6 +39,7 @@ impl FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
term_dict,
|
||||
buffer: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,11 +58,18 @@ impl FacetReader {
|
||||
}
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
pub fn facet_from_ord(
|
||||
&mut self,
|
||||
facet_ord: TermOrdinal,
|
||||
output: &mut Facet,
|
||||
) -> Result<(), str::Utf8Error> {
|
||||
let found_term = self
|
||||
.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
.ord_to_term(facet_ord as u64, &mut self.buffer);
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
let facet_str = str::from_utf8(&self.buffer[..])?;
|
||||
output.set_facet_str(facet_str);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return the list of facet ordinals associated to a document.
|
||||
|
||||
@@ -82,20 +82,20 @@ mod tests {
|
||||
|
||||
let mut facet = Facet::root();
|
||||
{
|
||||
facet_reader.facet_from_ord(1, &mut facet);
|
||||
facet_reader.facet_from_ord(1, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(2, &mut facet);
|
||||
facet_reader.facet_from_ord(2, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category/cat1"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(3, &mut facet);
|
||||
facet_reader.facet_from_ord(3, &mut facet).unwrap();
|
||||
assert_eq!(format!("{}", facet), "/category/cat2");
|
||||
assert_eq!(facet, Facet::from("/category/cat2"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(4, &mut facet);
|
||||
facet_reader.facet_from_ord(4, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category/cat3"));
|
||||
}
|
||||
|
||||
|
||||
@@ -79,11 +79,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
// ok: Item is either `u64` or `i64`
|
||||
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
for (i, out) in output.iter_mut().enumerate() {
|
||||
*out = self.get(start + i as u32);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,6 @@ use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Result;
|
||||
@@ -52,17 +51,19 @@ type DocumentReceiver = channel::Receiver<AddOperation>;
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
assert!(per_thread_memory_budget > 1_000);
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
(1..)
|
||||
if let Some(limit) = (1..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
)
|
||||
})
|
||||
.min(19) // we cap it at 512K
|
||||
{
|
||||
limit.min(19) // we cap it at 2^19 = 512K.
|
||||
} else {
|
||||
unreachable!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
@@ -302,7 +303,7 @@ fn index_documents(
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
|
||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
@@ -313,18 +314,17 @@ fn index_documents(
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
SegmentEntry::new(segment_meta, delete_cursor, None)
|
||||
None
|
||||
};
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
}
|
||||
|
||||
@@ -467,11 +467,10 @@ impl IndexWriter {
|
||||
///
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
swap(&mut self.document_sender, &mut document_sender);
|
||||
swap(&mut self.document_receiver, &mut document_receiver);
|
||||
document_receiver
|
||||
mem::replace(&mut self.document_sender, document_sender);
|
||||
mem::replace(&mut self.document_receiver, document_receiver)
|
||||
}
|
||||
|
||||
/// Rollback to the last commit
|
||||
@@ -558,17 +557,13 @@ impl IndexWriter {
|
||||
// and recreate a new one channels.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(
|
||||
&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle,
|
||||
);
|
||||
let former_workers_join_handle =
|
||||
mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = worker_handle
|
||||
.join()
|
||||
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
||||
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
self.add_indexing_worker()?;
|
||||
|
||||
@@ -1255,6 +1255,36 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
use schema::INT_INDEXED;
|
||||
#[test]
|
||||
fn test_bug_merge() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(int_field => 1u64));
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer.add_document(doc!(int_field => 1u64));
|
||||
index_writer.commit().expect("commit failed");
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
index_writer.delete_term(Term::from_field_u64(int_field, 1));
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
// commit has not been called yet. The document should still be
|
||||
// there.
|
||||
assert_eq!(index.searcher().num_docs(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_all_deleted() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
@@ -111,19 +111,18 @@ impl SegmentWriter {
|
||||
}
|
||||
match *field_options.field_type() {
|
||||
FieldType::HierarchicalFacet => {
|
||||
let facets: Vec<&[u8]> = field_values
|
||||
let facets: Vec<&str> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
|
||||
Value::Facet(ref facet) => Some(facet.encoded_str()),
|
||||
_ => {
|
||||
panic!("Expected hierarchical facet");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = Term::for_field(field); // we set the Term
|
||||
for facet_bytes in facets {
|
||||
for fake_str in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
|
||||
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
|
||||
term.set_text(&token.text);
|
||||
let unordered_term_id =
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::recorder::{
|
||||
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
|
||||
};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use schema::IndexRecordOption;
|
||||
@@ -213,7 +215,7 @@ pub trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
@@ -245,8 +247,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
if opt_recorder.is_some() {
|
||||
let mut recorder = opt_recorder.unwrap();
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(heap);
|
||||
@@ -255,7 +256,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new(heap);
|
||||
let mut recorder = Rec::new();
|
||||
recorder.new_doc(doc, heap);
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
@@ -270,10 +271,11 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
||||
let recorder: Rec = termdict_heap.read(addr);
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
recorder.serialize(serializer, heap)?;
|
||||
recorder.serialize(&mut buffer_lender, serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,10 +1,51 @@
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
use postings::FieldSerializer;
|
||||
use std::{self, io};
|
||||
use std::io;
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = std::u32::MAX;
|
||||
const POSITION_END: u32 = 0;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct BufferLender {
|
||||
buffer_u8: Vec<u8>,
|
||||
buffer_u32: Vec<u32>,
|
||||
}
|
||||
|
||||
impl BufferLender {
|
||||
pub fn lend_u8(&mut self) -> &mut Vec<u8> {
|
||||
self.buffer_u8.clear();
|
||||
&mut self.buffer_u8
|
||||
}
|
||||
pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
|
||||
self.buffer_u8.clear();
|
||||
self.buffer_u32.clear();
|
||||
(&mut self.buffer_u8, &mut self.buffer_u32)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct VInt32Reader<'a> {
|
||||
data: &'a [u8],
|
||||
}
|
||||
|
||||
impl<'a> VInt32Reader<'a> {
|
||||
fn new(data: &'a [u8]) -> VInt32Reader<'a> {
|
||||
VInt32Reader { data }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for VInt32Reader<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.data.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(read_u32_vint(&mut self.data))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Recorder is in charge of recording relevant information about
|
||||
/// the presence of a term in a document.
|
||||
@@ -15,9 +56,9 @@ const POSITION_END: u32 = std::u32::MAX;
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: Copy {
|
||||
pub(crate) trait Recorder: Copy + 'static {
|
||||
///
|
||||
fn new(heap: &mut MemoryArena) -> Self;
|
||||
fn new() -> Self;
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
@@ -29,7 +70,12 @@ pub trait Recorder: Copy {
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
@@ -40,9 +86,9 @@ pub struct NothingRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
fn new() -> Self {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
@@ -53,16 +99,23 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
for doc in self.stack.iter(heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
fn serialize(
|
||||
&self,
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(heap, buffer);
|
||||
for doc in VInt32Reader::new(&buffer[..]) {
|
||||
serializer.write_doc(doc as u32, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -77,9 +130,9 @@ pub struct TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
fn new() -> Self {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
}
|
||||
@@ -91,7 +144,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||
@@ -100,24 +153,24 @@ impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self
|
||||
.stack
|
||||
.iter(heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
let term_freq = doc_iter
|
||||
.next()
|
||||
.expect("The IndexWriter recorded a doc without a term freq.");
|
||||
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
|
||||
fn serialize(
|
||||
&self,
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(heap, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
while let Some(doc) = u32_it.next() {
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
serializer.write_doc(doc as u32, term_freq, &EMPTY_ARRAY)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -128,11 +181,10 @@ pub struct TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
fn new() -> Self {
|
||||
TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
@@ -143,33 +195,88 @@ impl Recorder for TFAndPositionRecorder {
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||
self.stack.push(position, heap);
|
||||
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
let mut prev_position = 0;
|
||||
doc_positions.clear();
|
||||
for position in &mut positions_iter {
|
||||
if position == POSITION_END {
|
||||
break;
|
||||
} else {
|
||||
doc_positions.push(position - prev_position);
|
||||
prev_position = position;
|
||||
fn serialize(
|
||||
&self,
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
self.stack.read_to_end(heap, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
while let Some(doc) = u32_it.next() {
|
||||
let mut prev_position_plus_one = 1u32;
|
||||
buffer_positions.clear();
|
||||
loop {
|
||||
match u32_it.next() {
|
||||
Some(POSITION_END) | None => {
|
||||
break;
|
||||
}
|
||||
Some(position_plus_one) => {
|
||||
let delta_position = position_plus_one - prev_position_plus_one;
|
||||
buffer_positions.push(delta_position);
|
||||
prev_position_plus_one = position_plus_one;
|
||||
}
|
||||
}
|
||||
}
|
||||
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
|
||||
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::write_u32_vint;
|
||||
use super::BufferLender;
|
||||
use super::VInt32Reader;
|
||||
|
||||
#[test]
|
||||
fn test_buffer_lender() {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
{
|
||||
let buf = buffer_lender.lend_u8();
|
||||
assert!(buf.is_empty());
|
||||
buf.push(1u8);
|
||||
}
|
||||
{
|
||||
let buf = buffer_lender.lend_u8();
|
||||
assert!(buf.is_empty());
|
||||
buf.push(1u8);
|
||||
}
|
||||
{
|
||||
let (_, buf) = buffer_lender.lend_all();
|
||||
assert!(buf.is_empty());
|
||||
buf.push(1u32);
|
||||
}
|
||||
{
|
||||
let (_, buf) = buffer_lender.lend_all();
|
||||
assert!(buf.is_empty());
|
||||
buf.push(1u32);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vint_u32() {
|
||||
let mut buffer = vec![];
|
||||
let vals = [0, 1, 324_234_234, u32::max_value()];
|
||||
for &i in &vals {
|
||||
assert!(write_u32_vint(i, &mut buffer).is_ok());
|
||||
}
|
||||
assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
|
||||
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
|
||||
assert_eq!(&res[..], &vals[..]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,7 +126,6 @@ impl SegmentPostings {
|
||||
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
||||
let mut start = 0;
|
||||
let end = arr.len();
|
||||
debug_assert!(target >= arr[start]);
|
||||
debug_assert!(target <= arr[end - 1]);
|
||||
let mut jump = 1;
|
||||
loop {
|
||||
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
|
||||
debug_assert!(target >= self.doc());
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
@@ -622,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::exponential_search;
|
||||
use super::search_within_block;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
@@ -635,6 +634,7 @@ mod tests {
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
use DocId;
|
||||
use SkipResult;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -662,6 +662,16 @@ mod tests {
|
||||
.0
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
|
||||
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
@@ -693,7 +703,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
@@ -707,14 +717,44 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
|
||||
#[test]
|
||||
fn test_skip_right_at_new_block() {
|
||||
let mut doc_ids = (0..128).collect::<Vec<u32>>();
|
||||
doc_ids.push(129);
|
||||
doc_ids.push(130);
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 130);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(129), SkipResult::Reached);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 130);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(131), SkipResult::End);
|
||||
}
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let mut last_doc = 0u32;
|
||||
for doc in docs {
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64));
|
||||
}
|
||||
@@ -734,7 +774,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip() {
|
||||
for i in 0..4 {
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
||||
@@ -744,7 +784,7 @@ mod tests {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(4u32),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
@@ -757,7 +797,7 @@ mod tests {
|
||||
for i in 0..1300 {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(docs.clone());
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
|
||||
@@ -1,28 +1,37 @@
|
||||
use super::{Addr, MemoryArena};
|
||||
|
||||
use common::is_power_of_2;
|
||||
use postings::stacker::memory_arena::load;
|
||||
use postings::stacker::memory_arena::store;
|
||||
use std::io;
|
||||
use std::mem;
|
||||
|
||||
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
||||
const FIRST_BLOCK: usize = 16;
|
||||
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
|
||||
|
||||
const FIRST_BLOCK: u32 = 4u32;
|
||||
enum CapacityResult {
|
||||
Available(u32),
|
||||
NeedAlloc(u32),
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(len: u32) -> Option<usize> {
|
||||
fn len_to_capacity(len: u32) -> CapacityResult {
|
||||
match len {
|
||||
0...3 => None,
|
||||
4...MAX_BLOCK_LEN => {
|
||||
if is_power_of_2(len as usize) {
|
||||
Some(len as usize)
|
||||
0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
|
||||
16...MAX_BLOCK_LEN => {
|
||||
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
|
||||
let available = cap - len;
|
||||
if available == 0 {
|
||||
CapacityResult::NeedAlloc(len)
|
||||
} else {
|
||||
None
|
||||
CapacityResult::Available(available)
|
||||
}
|
||||
}
|
||||
n => {
|
||||
if n % MAX_BLOCK_LEN == 0 {
|
||||
Some(MAX_BLOCK_LEN as usize)
|
||||
let available = n % MAX_BLOCK_LEN;
|
||||
if available == 0 {
|
||||
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
|
||||
} else {
|
||||
None
|
||||
CapacityResult::Available(MAX_BLOCK_LEN - available)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,82 +61,119 @@ pub fn jump_needed(len: u32) -> Option<usize> {
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
head: Addr,
|
||||
tail: Addr,
|
||||
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListWriter<'a> {
|
||||
eull: &'a mut ExpUnrolledLinkedList,
|
||||
heap: &'a mut MemoryArena,
|
||||
}
|
||||
|
||||
fn ensure_capacity<'a>(
|
||||
eull: &'a mut ExpUnrolledLinkedList,
|
||||
heap: &'a mut MemoryArena,
|
||||
) -> &'a mut [u8] {
|
||||
if eull.len <= FIRST_BLOCK as u32 {
|
||||
// We are still hitting the inline block.
|
||||
if eull.len < FIRST_BLOCK as u32 {
|
||||
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
|
||||
}
|
||||
// We need to allocate a new block!
|
||||
let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
|
||||
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
|
||||
eull.tail = new_block_addr;
|
||||
return heap.slice_mut(eull.tail, FIRST_BLOCK);
|
||||
}
|
||||
let len = match len_to_capacity(eull.len) {
|
||||
CapacityResult::NeedAlloc(new_block_len) => {
|
||||
let new_block_addr: Addr =
|
||||
heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
|
||||
heap.write_at(eull.tail, new_block_addr);
|
||||
eull.tail = new_block_addr;
|
||||
new_block_len
|
||||
}
|
||||
CapacityResult::Available(available) => available,
|
||||
};
|
||||
heap.slice_mut(eull.tail, len as usize)
|
||||
}
|
||||
|
||||
impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
|
||||
if buf.is_empty() {
|
||||
// we need to cut early, because `ensure_capacity`
|
||||
// allocates if there is no capacity at all right now.
|
||||
return;
|
||||
}
|
||||
while !buf.is_empty() {
|
||||
let add_len: usize;
|
||||
{
|
||||
let output_buf = ensure_capacity(self.eull, self.heap);
|
||||
add_len = buf.len().min(output_buf.len());
|
||||
output_buf[..add_len].copy_from_slice(&buf[..add_len]);
|
||||
}
|
||||
self.eull.len += add_len as u32;
|
||||
self.eull.tail = self.eull.tail.offset(add_len as u32);
|
||||
buf = &buf[add_len..];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
// There is no use case to only write the capacity.
|
||||
// This is not IO after all, so we write the whole
|
||||
// buffer even if the contract of `.write` is looser.
|
||||
self.extend_from_slice(buf);
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.extend_from_slice(buf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
|
||||
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
|
||||
pub fn new() -> ExpUnrolledLinkedList {
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
head: addr,
|
||||
tail: addr,
|
||||
tail: Addr::null_pointer(),
|
||||
inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: self.head,
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
|
||||
ExpUnrolledLinkedListWriter { eull: self, heap }
|
||||
}
|
||||
|
||||
/// Appends a new element to the current stack.
|
||||
///
|
||||
/// If the current block end is reached, a new block is allocated.
|
||||
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
|
||||
self.len += 1;
|
||||
if let Some(new_block_len) = jump_needed(self.len) {
|
||||
// We need to allocate another block.
|
||||
// We also allocate an extra `u32` to store the pointer
|
||||
// to the future next block.
|
||||
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: Addr = heap.allocate_space(new_block_size);
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, new_block_addr)
|
||||
};
|
||||
self.tail = new_block_addr;
|
||||
pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
|
||||
let len = self.len as usize;
|
||||
if len <= FIRST_BLOCK {
|
||||
output.extend_from_slice(&self.inlined_data[..len]);
|
||||
return;
|
||||
}
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, val);
|
||||
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a MemoryArena,
|
||||
addr: Addr,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
self.consumed += 1;
|
||||
let addr: Addr = if jump_needed(self.consumed).is_some() {
|
||||
unsafe {
|
||||
// logic
|
||||
self.heap.read(self.addr)
|
||||
}
|
||||
} else {
|
||||
self.addr
|
||||
};
|
||||
self.addr = addr.offset(mem::size_of::<u32>() as u32);
|
||||
Some(unsafe {
|
||||
// logic
|
||||
self.heap.read(addr)
|
||||
})
|
||||
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
|
||||
let mut cur = FIRST_BLOCK;
|
||||
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
|
||||
loop {
|
||||
let cap = match len_to_capacity(cur as u32) {
|
||||
CapacityResult::Available(capacity) => capacity,
|
||||
CapacityResult::NeedAlloc(capacity) => capacity,
|
||||
} as usize;
|
||||
let data = heap.slice(addr, cap);
|
||||
if cur + cap >= len {
|
||||
output.extend_from_slice(&data[..(len - cur)]);
|
||||
return;
|
||||
}
|
||||
output.extend_from_slice(data);
|
||||
cur += cap;
|
||||
addr = heap.read(addr.offset(cap as u32));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -136,39 +182,126 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
mod tests {
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::jump_needed;
|
||||
use super::len_to_capacity;
|
||||
use super::*;
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
|
||||
#[test]
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
||||
stack.push(1u32, &mut heap);
|
||||
stack.push(2u32, &mut heap);
|
||||
stack.push(4u32, &mut heap);
|
||||
stack.push(8u32, &mut heap);
|
||||
let mut stack = ExpUnrolledLinkedList::new();
|
||||
stack.writer(&mut heap).extend_from_slice(&[1u8]);
|
||||
stack.writer(&mut heap).extend_from_slice(&[2u8]);
|
||||
stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
|
||||
stack.writer(&mut heap).extend_from_slice(&[5u8]);
|
||||
{
|
||||
let mut it = stack.iter(&heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
let mut buffer = Vec::new();
|
||||
stack.read_to_end(&heap, &mut buffer);
|
||||
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed() {
|
||||
let mut block_len = 4u32;
|
||||
let mut i = 0;
|
||||
while i < 10_000_000 {
|
||||
assert!(jump_needed(i + block_len - 1).is_none());
|
||||
assert!(jump_needed(i + block_len + 1).is_none());
|
||||
assert!(jump_needed(i + block_len).is_some());
|
||||
let new_block_len = jump_needed(i + block_len).unwrap();
|
||||
i += block_len;
|
||||
block_len = new_block_len as u32;
|
||||
fn test_stack_long() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new();
|
||||
let source: Vec<u32> = (0..100).collect();
|
||||
for &el in &source {
|
||||
assert!(stack
|
||||
.writer(&mut heap)
|
||||
.write_u32::<LittleEndian>(el)
|
||||
.is_ok());
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
stack.read_to_end(&heap, &mut buffer);
|
||||
let mut result = vec![];
|
||||
let mut remaining = &buffer[..];
|
||||
while !remaining.is_empty() {
|
||||
result.push(LittleEndian::read_u32(&remaining[..4]));
|
||||
remaining = &remaining[4..];
|
||||
}
|
||||
assert_eq!(&result[..], &source[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stack_interlaced() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new();
|
||||
let mut stack2 = ExpUnrolledLinkedList::new();
|
||||
|
||||
let mut vec1: Vec<u8> = vec![];
|
||||
let mut vec2: Vec<u8> = vec![];
|
||||
|
||||
for i in 0..9 {
|
||||
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
|
||||
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
|
||||
if i % 2 == 0 {
|
||||
assert!(stack2
|
||||
.writer(&mut heap)
|
||||
.write_u32::<LittleEndian>(i)
|
||||
.is_ok());
|
||||
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
|
||||
}
|
||||
}
|
||||
let mut res1 = vec![];
|
||||
let mut res2 = vec![];
|
||||
stack.read_to_end(&heap, &mut res1);
|
||||
stack2.read_to_end(&heap, &mut res2);
|
||||
assert_eq!(&vec1[..], &res1[..]);
|
||||
assert_eq!(&vec2[..], &res2[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed() {
|
||||
let mut available = 16u32;
|
||||
for i in 0..10_000_000 {
|
||||
match len_to_capacity(i) {
|
||||
CapacityResult::NeedAlloc(cap) => {
|
||||
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
|
||||
available = cap;
|
||||
}
|
||||
CapacityResult::Available(cap) => {
|
||||
assert_eq!(
|
||||
available, cap,
|
||||
"Failed len={}: Expected {} Got {}",
|
||||
i, available, cap
|
||||
);
|
||||
}
|
||||
}
|
||||
available -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed_progression() {
|
||||
let mut v = vec![];
|
||||
for i in 0.. {
|
||||
if v.len() >= 10 {
|
||||
break;
|
||||
}
|
||||
match len_to_capacity(i) {
|
||||
CapacityResult::NeedAlloc(cap) => {
|
||||
v.push((i, cap));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
assert_eq!(
|
||||
&v[..],
|
||||
&[
|
||||
(16, 16),
|
||||
(32, 32),
|
||||
(64, 64),
|
||||
(128, 128),
|
||||
(256, 256),
|
||||
(512, 512),
|
||||
(1024, 1024),
|
||||
(2048, 2048),
|
||||
(4096, 4096),
|
||||
(8192, 8192)
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,6 +309,7 @@ mod tests {
|
||||
mod bench {
|
||||
use super::super::MemoryArena;
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
@@ -203,13 +337,13 @@ mod bench {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
||||
let mut stack = ExpUnrolledLinkedList::new();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &mut heap);
|
||||
let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
||||
/// page of memory.
|
||||
///
|
||||
/// The last 20 bits are an address within this page of memory.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct Addr(u32);
|
||||
|
||||
impl Addr {
|
||||
@@ -69,32 +69,16 @@ impl Addr {
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait required for an object to be `storable`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Most of the time you should not implement this trait,
|
||||
/// and only use the `MemoryArena` with object implementing `Copy`.
|
||||
///
|
||||
/// `ArenaStorable` is used in `tantivy` to force
|
||||
/// a `Copy` object and a `slice` of data to be stored contiguously.
|
||||
pub trait ArenaStorable {
|
||||
fn num_bytes(&self) -> usize;
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
|
||||
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||
assert_eq!(dest.len(), std::mem::size_of::<Item>());
|
||||
unsafe {
|
||||
ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
|
||||
}
|
||||
}
|
||||
|
||||
impl<V> ArenaStorable for V
|
||||
where
|
||||
V: Copy,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
mem::size_of::<V>()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
|
||||
ptr::write_unaligned(dst_ptr, self);
|
||||
}
|
||||
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
|
||||
assert_eq!(data.len(), std::mem::size_of::<Item>());
|
||||
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
|
||||
}
|
||||
|
||||
/// The `MemoryArena`
|
||||
@@ -126,47 +110,9 @@ impl MemoryArena {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
/// Writes a slice at the given address, assuming the
|
||||
/// memory was allocated beforehands.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or corrupt the heap if he space was not
|
||||
/// properly allocated beforehands.
|
||||
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
|
||||
let bytes = data.as_ref();
|
||||
self.pages[addr.page_id()]
|
||||
.get_mut_slice(addr.page_local_addr(), bytes.len())
|
||||
.copy_from_slice(bytes);
|
||||
}
|
||||
|
||||
/// Returns the `len` bytes starting at `addr`
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the memory has not been allocated beforehands.
|
||||
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
|
||||
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap
|
||||
///
|
||||
/// It allocates the `Item` beforehands.
|
||||
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
|
||||
let num_bytes = val.num_bytes();
|
||||
let addr = self.allocate_space(num_bytes);
|
||||
unsafe {
|
||||
self.write(addr, val);
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
|
||||
val.write_into(self, addr)
|
||||
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
|
||||
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
|
||||
store(dest, val);
|
||||
}
|
||||
|
||||
/// Read an item in the heap at the given `address`.
|
||||
@@ -174,9 +120,21 @@ impl MemoryArena {
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
|
||||
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
|
||||
ptr::read_unaligned(ptr as *const Item)
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||
load(self.slice(addr, mem::size_of::<Item>()))
|
||||
}
|
||||
|
||||
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
pub fn slice_from(&self, addr: Addr) -> &[u8] {
|
||||
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
|
||||
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
/// Allocates `len` bytes and returns the allocated address.
|
||||
@@ -197,14 +155,10 @@ struct Page {
|
||||
|
||||
impl Page {
|
||||
fn new(page_id: usize) -> Page {
|
||||
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
|
||||
unsafe {
|
||||
data.set_len(PAGE_SIZE);
|
||||
} // avoid initializing page
|
||||
Page {
|
||||
page_id,
|
||||
len: 0,
|
||||
data: data.into_boxed_slice(),
|
||||
data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,12 +167,16 @@ impl Page {
|
||||
len + self.len <= PAGE_SIZE
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||
&mut self.data[local_addr..][..len]
|
||||
fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||
&self.slice_from(local_addr)[..len]
|
||||
}
|
||||
|
||||
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||
&self.data[local_addr..][..len]
|
||||
fn slice_from(&self, local_addr: usize) -> &[u8] {
|
||||
&self.data[local_addr..]
|
||||
}
|
||||
|
||||
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||
&mut self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
||||
@@ -230,16 +188,6 @@ impl Page {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
|
||||
self.data.as_ptr().add(addr)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
|
||||
self.data.as_mut_ptr().add(addr)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -254,13 +202,13 @@ mod tests {
|
||||
let b = b"happy tax payer";
|
||||
|
||||
let addr_a = arena.allocate_space(a.len());
|
||||
arena.write_bytes(addr_a, a);
|
||||
arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
|
||||
|
||||
let addr_b = arena.allocate_space(b.len());
|
||||
arena.write_bytes(addr_b, b);
|
||||
arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
|
||||
|
||||
assert_eq!(arena.read_slice(addr_a, a.len()), a);
|
||||
assert_eq!(arena.read_slice(addr_b, b.len()), b);
|
||||
assert_eq!(arena.slice(addr_a, a.len()), a);
|
||||
assert_eq!(arena.slice(addr_b, b.len()), b);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
@@ -283,9 +231,15 @@ mod tests {
|
||||
b: 221,
|
||||
c: 12,
|
||||
};
|
||||
let addr_a = arena.store(a);
|
||||
let addr_b = arena.store(b);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
|
||||
|
||||
let num_bytes = std::mem::size_of::<MyTest>();
|
||||
let addr_a = arena.allocate_space(num_bytes);
|
||||
arena.write_at(addr_a, a);
|
||||
|
||||
let addr_b = arena.allocate_space(num_bytes);
|
||||
arena.write_at(addr_b, b);
|
||||
|
||||
assert_eq!(arena.read::<MyTest>(addr_a), a);
|
||||
assert_eq!(arena.read::<MyTest>(addr_b), b);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
mod expull;
|
||||
mod memory_arena;
|
||||
mod murmurhash2;
|
||||
mod term_hashmap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
|
||||
use self::murmurhash2::murmurhash2;
|
||||
pub use self::memory_arena::{Addr, MemoryArena};
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
use std::ptr;
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::murmurhash2;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
}
|
||||
@@ -1,37 +1,16 @@
|
||||
use super::murmurhash2;
|
||||
use super::{Addr, ArenaStorable, MemoryArena};
|
||||
extern crate murmurhash32;
|
||||
|
||||
use self::murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use postings::stacker::memory_arena::store;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
|
||||
struct KeyBytesValue<'a, V> {
|
||||
key: &'a [u8],
|
||||
value: V,
|
||||
}
|
||||
|
||||
impl<'a, V> KeyBytesValue<'a, V> {
|
||||
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
|
||||
KeyBytesValue { key, value }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
|
||||
where
|
||||
V: ArenaStorable,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
arena.write(addr, self.key.len() as u16);
|
||||
arena.write_bytes(addr.offset(2), self.key);
|
||||
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
@@ -111,8 +90,7 @@ impl<'a> Iterator for Iter<'a> {
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) =
|
||||
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as BucketId)
|
||||
})
|
||||
}
|
||||
@@ -143,12 +121,22 @@ impl TermHashMap {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
|
||||
let key_addr = addr.offset(2u32);
|
||||
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
|
||||
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
|
||||
(key_bytes, val_addr)
|
||||
#[inline(always)]
|
||||
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let data = self.heap.slice_from(addr);
|
||||
let key_bytes_len = NativeEndian::read_u16(data) as usize;
|
||||
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
|
||||
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr);
|
||||
if stored_key == target_key {
|
||||
Some(value_addr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
@@ -199,36 +187,39 @@ impl TermHashMap {
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy,
|
||||
V: Copy + 'static,
|
||||
TMutator: FnMut(Option<V>) -> V,
|
||||
{
|
||||
if self.is_saturated() {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let hash = murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let val = updater(None);
|
||||
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
|
||||
let num_bytes =
|
||||
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||
let key_addr = self.heap.allocate_space(num_bytes);
|
||||
{
|
||||
let data = self.heap.slice_mut(key_addr, num_bytes);
|
||||
NativeEndian::write_u16(data, key_bytes.len() as u16);
|
||||
let stop = 2 + key_bytes.len();
|
||||
data[2..stop].copy_from_slice(key_bytes);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
} else if kv.hash == hash {
|
||||
let (key_matches, val_addr) = {
|
||||
let (stored_key, val_addr): (&[u8], Addr) =
|
||||
unsafe { self.get_key_value(kv.key_value_addr) };
|
||||
(stored_key == key_bytes, val_addr)
|
||||
};
|
||||
if key_matches {
|
||||
unsafe {
|
||||
// logic
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write(val_addr, new_v);
|
||||
};
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||
{
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write_at(val_addr, new_v);
|
||||
return bucket as BucketId;
|
||||
}
|
||||
}
|
||||
@@ -236,24 +227,6 @@ impl TermHashMap {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -285,10 +258,7 @@ mod tests {
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let mut iter_values = hash_map.iter();
|
||||
while let Some((key, addr, _)) = iter_values.next() {
|
||||
let val: u32 = unsafe {
|
||||
// test
|
||||
hash_map.heap.read(addr)
|
||||
};
|
||||
let val: u32 = hash_map.heap.read(addr);
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::borrow::Cow;
|
||||
use std::fmt::{self, Debug, Display, Formatter};
|
||||
use std::io::{self, Read, Write};
|
||||
use std::str;
|
||||
use std::string::FromUtf8Error;
|
||||
|
||||
const SLASH_BYTE: u8 = b'/';
|
||||
const ESCAPE_BYTE: u8 = b'\\';
|
||||
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
|
||||
/// representation of facets.
|
||||
pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
|
||||
/// `char` used as a level separation in the binary
|
||||
/// representation of facets. (It is the null codepoint.)
|
||||
pub const FACET_SEP_CHAR: char = '\u{0}';
|
||||
|
||||
/// A Facet represent a point in a given hierarchy.
|
||||
///
|
||||
/// They are typically represented similarly to a filepath.
|
||||
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
/// its facet. In the example above, `/electronics/tv_and_video/`
|
||||
/// and `/electronics`.
|
||||
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Facet(Vec<u8>);
|
||||
pub struct Facet(String);
|
||||
|
||||
impl Facet {
|
||||
/// Returns a new instance of the "root facet"
|
||||
/// Equivalent to `/`.
|
||||
pub fn root() -> Facet {
|
||||
Facet(vec![])
|
||||
Facet("".to_string())
|
||||
}
|
||||
|
||||
/// Returns true iff the facet is the root facet `/`.
|
||||
pub fn is_root(&self) -> bool {
|
||||
self.encoded_bytes().is_empty()
|
||||
self.encoded_str().is_empty()
|
||||
}
|
||||
|
||||
/// Returns a binary representation of the facet.
|
||||
@@ -49,13 +54,19 @@ impl Facet {
|
||||
/// This representation has the benefit of making it possible to
|
||||
/// express "being a child of a given facet" as a range over
|
||||
/// the term ordinals.
|
||||
pub fn encoded_bytes(&self) -> &[u8] {
|
||||
pub fn encoded_str(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
|
||||
Facet(facet_string)
|
||||
}
|
||||
|
||||
/// Creates a `Facet` from its binary representation.
|
||||
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
|
||||
Facet(encoded_bytes)
|
||||
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
|
||||
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
|
||||
//Ok(Facet(String::from_utf8(encoded_bytes)?))
|
||||
String::from_utf8(encoded_bytes).map(Facet)
|
||||
}
|
||||
|
||||
/// Parse a text representation of a facet.
|
||||
@@ -79,36 +90,37 @@ impl Facet {
|
||||
Path: IntoIterator,
|
||||
Path::Item: ToString,
|
||||
{
|
||||
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
|
||||
let mut facet_string: String = String::with_capacity(100);
|
||||
let mut step_it = path.into_iter();
|
||||
if let Some(step) = step_it.next() {
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
facet_string.push_str(&step.to_string());
|
||||
}
|
||||
for step in step_it {
|
||||
facet_bytes.push(FACET_SEP_BYTE);
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
facet_string.push(FACET_SEP_CHAR);
|
||||
facet_string.push_str(&step.to_string());
|
||||
}
|
||||
Facet(facet_bytes)
|
||||
Facet(facet_string)
|
||||
}
|
||||
|
||||
/// Accessor for the inner buffer of the `Facet`.
|
||||
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
|
||||
self.0.clear();
|
||||
self.0.push_str(facet_str);
|
||||
}
|
||||
|
||||
/// Returns `true` iff other is a subfacet of `self`.
|
||||
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||
let self_bytes: &[u8] = self.encoded_bytes();
|
||||
let other_bytes: &[u8] = other.encoded_bytes();
|
||||
self_bytes.len() < other_bytes.len()
|
||||
&& other_bytes.starts_with(self_bytes)
|
||||
&& other_bytes[self_bytes.len()] == 0u8
|
||||
let self_str = self.encoded_str();
|
||||
let other_str = other.encoded_str();
|
||||
self_str.len() < other_str.len()
|
||||
&& other_str.starts_with(self_str)
|
||||
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
|
||||
}
|
||||
}
|
||||
|
||||
impl Borrow<[u8]> for Facet {
|
||||
fn borrow(&self) -> &[u8] {
|
||||
self.encoded_bytes()
|
||||
impl Borrow<str> for Facet {
|
||||
fn borrow(&self) -> &str {
|
||||
self.encoded_str()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
||||
Idle,
|
||||
}
|
||||
let path: &str = path_asref.as_ref();
|
||||
let mut facet_encoded = Vec::new();
|
||||
assert!(!path.is_empty());
|
||||
assert!(path.starts_with('/'));
|
||||
let mut facet_encoded = String::new();
|
||||
let mut state = State::Idle;
|
||||
let path_bytes = path.as_bytes();
|
||||
for &c in &path_bytes[1..] {
|
||||
let mut last_offset = 1;
|
||||
for i in 1..path_bytes.len() {
|
||||
let c = path_bytes[i];
|
||||
match (state, c) {
|
||||
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
|
||||
(State::Idle, ESCAPE_BYTE) => {
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
last_offset = i + 1;
|
||||
state = State::Escaped
|
||||
}
|
||||
(State::Idle, SLASH_BYTE) => {
|
||||
facet_encoded.push(FACET_SEP_BYTE);
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
facet_encoded.push(FACET_SEP_CHAR);
|
||||
last_offset = i + 1;
|
||||
}
|
||||
(State::Escaped, any_char) => {
|
||||
(State::Escaped, _escaped_char) => {
|
||||
state = State::Idle;
|
||||
facet_encoded.push(any_char);
|
||||
}
|
||||
(State::Idle, other_char) => {
|
||||
facet_encoded.push(other_char);
|
||||
}
|
||||
(State::Idle, _any_char) => {}
|
||||
}
|
||||
}
|
||||
facet_encoded.push_str(&path[last_offset..]);
|
||||
Facet(facet_encoded)
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Facet {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
|
||||
<String as BinarySerializable>::serialize(&self.0, writer)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
|
||||
Ok(Facet(bytes))
|
||||
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Facet {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
|
||||
for step in self.0.split(FACET_SEP_CHAR) {
|
||||
write!(f, "/")?;
|
||||
let step_str = unsafe { str::from_utf8_unchecked(step) };
|
||||
write!(f, "{}", escape_slashes(step_str))?;
|
||||
write!(f, "{}", escape_slashes(step))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ impl Term {
|
||||
|
||||
/// Creates a `Term` given a facet.
|
||||
pub fn from_facet(field: Field, facet: &Facet) -> Term {
|
||||
let bytes = facet.encoded_bytes();
|
||||
let bytes = facet.encoded_str().as_bytes();
|
||||
let buffer = Vec::with_capacity(4 + bytes.len());
|
||||
let mut term = Term(buffer);
|
||||
term.set_field(field);
|
||||
@@ -68,12 +68,7 @@ impl Term {
|
||||
term
|
||||
}
|
||||
|
||||
/// Creates a new Term with an empty buffer,
|
||||
/// but with a given capacity.
|
||||
///
|
||||
/// It is declared unsafe, as the term content
|
||||
/// is not initialized, and a call to `.field()`
|
||||
/// would panic.
|
||||
/// Creates a new Term for a given field.
|
||||
pub(crate) fn for_field(field: Field) -> Term {
|
||||
let mut term = Term(Vec::with_capacity(100));
|
||||
term.set_field(field);
|
||||
|
||||
@@ -167,7 +167,7 @@ mod tests {
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
//let term = Term::from_bytes(term_it.key());
|
||||
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
|
||||
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
|
||||
}
|
||||
assert_eq!(&*term_string, "abcdef");
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use byteorder::ByteOrder;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::Endianness;
|
||||
@@ -7,7 +7,6 @@ use directory::ReadOnlySource;
|
||||
use postings::TermInfo;
|
||||
use std::cmp;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ptr;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
const BLOCK_LEN: usize = 256;
|
||||
@@ -88,13 +87,17 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
|
||||
assert!(num_bits <= 56);
|
||||
let addr_byte = addr_bits / 8;
|
||||
let bit_shift = (addr_bits % 8) as u64;
|
||||
assert!(data.len() >= addr_byte + 7);
|
||||
let val_unshifted_unmasked: u64 = unsafe {
|
||||
// ok because the pointer is only accessed using `ptr::read_unaligned`
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
||||
let addr = data.as_ptr().add(addr_byte) as *const u64;
|
||||
// ok thanks to the 7 byte padding
|
||||
ptr::read_unaligned(addr)
|
||||
let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
|
||||
LittleEndian::read_u64(&data[addr_byte..][..8])
|
||||
} else {
|
||||
// the buffer is not large enough.
|
||||
// Let's copy the few remaining bytes to a 8 byte buffer
|
||||
// padded with 0s.
|
||||
let mut buf = [0u8; 8];
|
||||
let data_to_copy = &data[addr_byte..];
|
||||
let nbytes = data_to_copy.len();
|
||||
buf[..nbytes].copy_from_slice(data_to_copy);
|
||||
LittleEndian::read_u64(&buf)
|
||||
};
|
||||
let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
|
||||
let mask = (1u64 << u64::from(num_bits)) - 1;
|
||||
@@ -246,7 +249,6 @@ impl TermInfoStoreWriter {
|
||||
self.num_terms.serialize(write)?;
|
||||
write.write_all(&self.buffer_block_metas)?;
|
||||
write.write_all(&self.buffer_term_infos)?;
|
||||
write.write_all(&[0u8; 7])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use schema::FACET_SEP_BYTE;
|
||||
use std::str;
|
||||
|
||||
/// The `FacetTokenizer` process a `Facet` binary representation
|
||||
/// and emits a token for all of its parent.
|
||||
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
|
||||
.position(|b| b == FACET_SEP_BYTE)
|
||||
.map(|pos| cursor + 1 + pos)
|
||||
{
|
||||
let facet_part =
|
||||
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
|
||||
let facet_part = &self.text[cursor..next_sep_pos];
|
||||
self.token.text.push_str(facet_part);
|
||||
self.state = State::UpToPosition(next_sep_pos);
|
||||
} else {
|
||||
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
|
||||
let facet_part = &self.text[cursor..];
|
||||
self.token.text.push_str(facet_part);
|
||||
self.state = State::Terminated;
|
||||
}
|
||||
@@ -86,7 +84,6 @@ mod tests {
|
||||
|
||||
use super::FacetTokenizer;
|
||||
use schema::Facet;
|
||||
use std::str;
|
||||
use tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
#[test]
|
||||
@@ -95,11 +92,11 @@ mod tests {
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
|
||||
.token_stream(facet.encoded_str())
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 4);
|
||||
@@ -115,11 +112,11 @@ mod tests {
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
|
||||
.token_stream(facet.encoded_str()) // ok test
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 1);
|
||||
|
||||
Reference in New Issue
Block a user