Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-06 17:22:54 +00:00.

Compare commits: `segment_fr...0.8.0` (11 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | b8241c5603 |  |
|  | a4745151c0 |  |
|  | e2ce326a8c |  |
|  | bb21d12a70 |  |
|  | 4565aba62a |  |
|  | 545a7ec8dd |  |
|  | e68775d71c |  |
|  | dcc92d287e |  |
|  | b48f81c051 |  |
|  | a3042e956b |  |
|  | 1fa10f0a0b |  |
CHANGELOG.md

```diff
@@ -1,4 +1,4 @@
-Tantivy 0.8.1
+Tantivy 0.8.0
 =====================
 *No change in the index format*
 - API Breaking change in the collector API. (@jwolfe, @fulmicoton)
```
Cargo.toml

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.8.0-dev"
+version = "0.8.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
 num_cpus = "1.2"
-itertools = "0.7"
+itertools = "0.8"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 bit-set = "0.5"
 uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,6 +49,7 @@ failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.2"
 scoped-pool = "1.0"
+murmurhash32 = "0.2"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
```
```diff
@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
 ) -> SkipResult {
     loop {
         match collapse_it.peek() {
-            Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
+            Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
                 Ordering::Less => {}
                 Ordering::Greater => {
                     return SkipResult::OverStep;
@@ -369,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
             let mut facet = vec![];
             let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
             facet_dict.ord_to_term(facet_ord as u64, &mut facet);
-            facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
+            // TODO
+            facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
         }
         FacetCounts { facet_counts }
     }
@@ -403,9 +404,9 @@ impl FacetCounts {
         let right_bound = if facet.is_root() {
             Bound::Unbounded
         } else {
-            let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
-            facet_after_bytes.push(1u8);
-            let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
+            let mut facet_after_bytes: String = facet.encoded_str().to_owned();
+            facet_after_bytes.push('\u{1}');
+            let facet_after = Facet::from_encoded_string(facet_after_bytes);
             Bound::Excluded(facet_after)
         };
         let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
```
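The `right_bound` construction in the last hunk works because encoded facets sort byte-wise: every descendant of a facet starts with the parent's encoding followed by the `\u{0}` separator, and `\u{1}` is the smallest code point greater than that separator, so `parent + '\u{1}'` is an exclusive upper bound over all descendants. A minimal sketch of the same trick over a plain `BTreeMap` (standalone names, not tantivy's API):

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

fn main() {
    // Encoded facets: path steps joined by the `\u{0}` separator.
    let mut facet_counts: BTreeMap<String, u64> = BTreeMap::new();
    facet_counts.insert("electronics".to_string(), 10);
    facet_counts.insert("electronics\u{0}tv".to_string(), 4);
    facet_counts.insert("electronics\u{0}video".to_string(), 6);
    facet_counts.insert("toys".to_string(), 3);

    // `\u{1}` sorts just above the `\u{0}` separator, so this half-open
    // range covers the facet and every descendant, and nothing else.
    let left_bound = Bound::Included("electronics".to_string());
    let right_bound = Bound::Excluded("electronics\u{1}".to_string());
    let within: Vec<_> = facet_counts.range((left_bound, right_bound)).collect();
    assert_eq!(within.len(), 3); // the facet itself plus its two children
}
```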
```diff
@@ -1,9 +1,6 @@
-use common::serialize::BinarySerializable;
+use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
 use std::io;
-use std::io::Write;
-use std::mem;
 use std::ops::Deref;
-use std::ptr;
 
 pub(crate) struct BitPacker {
     mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
         }
     }
 
-    pub fn write<TWrite: Write>(
+    pub fn write<TWrite: io::Write>(
         &mut self,
         val: u64,
         num_bits: u8,
@@ -28,14 +25,14 @@
         let num_bits = num_bits as usize;
         if self.mini_buffer_written + num_bits > 64 {
             self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
-            self.mini_buffer.serialize(output)?;
+            output.write_u64::<LittleEndian>(self.mini_buffer)?;
             self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
             self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
         } else {
             self.mini_buffer |= val_u64 << self.mini_buffer_written;
             self.mini_buffer_written += num_bits;
             if self.mini_buffer_written == 64 {
-                self.mini_buffer.serialize(output)?;
+                output.write_u64::<LittleEndian>(self.mini_buffer)?;
                 self.mini_buffer_written = 0;
                 self.mini_buffer = 0u64;
             }
@@ -43,17 +40,18 @@
         Ok(())
     }
 
-    pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
         if self.mini_buffer_written > 0 {
             let num_bytes = (self.mini_buffer_written + 7) / 8;
-            let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
+            let mut arr: [u8; 8] = [0u8; 8];
+            LittleEndian::write_u64(&mut arr, self.mini_buffer);
             output.write_all(&arr[..num_bytes])?;
             self.mini_buffer_written = 0;
         }
         Ok(())
     }
 
-    pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
         self.flush(output)?;
         // Padding the write file to simplify reads.
         output.write_all(&[0u8; 7])?;
@@ -102,9 +100,7 @@ where
         addr + 8 <= data.len(),
         "The fast field field should have been padded with 7 bytes."
     );
-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-    let val_unshifted_unmasked: u64 =
-        u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
+    let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
     let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
     val_shifted & mask
 }
@@ -126,9 +122,7 @@ where
     for output_val in output.iter_mut() {
         let addr = addr_in_bits >> 3;
         let bit_shift = addr_in_bits & 7;
-        #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-        let val_unshifted_unmasked: u64 =
-            unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+        let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
        let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
         *output_val = val_shifted & mask;
         addr_in_bits += num_bits;
```
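The recurring substitution in this file replaces `unsafe` transmutes and unaligned pointer reads with `byteorder`'s explicit little-endian routines. A small sketch (assuming the `byteorder` crate, not tantivy code) showing the write/read pair round-tripping and the shift-and-mask extraction used by the unpacking code:

```rust
extern crate byteorder; // assumed dependency

use byteorder::{ByteOrder, LittleEndian};

fn main() {
    // Serialize the 64-bit mini buffer without `mem::transmute`,
    // mirroring the replacement made in `flush` above.
    let mini_buffer: u64 = 0x0123_4567_89ab_cdef;
    let mut arr = [0u8; 8];
    LittleEndian::write_u64(&mut arr, mini_buffer);

    // `read_u64` tolerates unaligned input, unlike a raw `*const u64`
    // cast, and is endian-explicit on every host.
    assert_eq!(LittleEndian::read_u64(&arr[..]), mini_buffer);

    // Shift-and-mask extraction of `num_bits` starting at `bit_shift`,
    // as in the unpacking code above (values chosen for illustration).
    let num_bits = 5;
    let mask: u64 = (1u64 << num_bits) - 1;
    let bit_shift = 3;
    let val = (LittleEndian::read_u64(&arr[..]) >> bit_shift) & mask;
    assert_eq!(val, (mini_buffer >> bit_shift) & mask);
}
```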
```diff
@@ -64,17 +64,18 @@ impl Executor {
                 // This is important as it makes it possible for the fruit_receiver iteration to
                 // terminate.
                 };
-                let mut results = Vec::with_capacity(num_fruits);
-                unsafe { results.set_len(num_fruits) };
-                let mut num_items = 0;
+                // This is lame, but it does not use unsafe code.
+                let mut results_with_position = Vec::with_capacity(num_fruits);
                 for (pos, fruit_res) in fruit_receiver {
-                    results[pos] = fruit_res?;
-                    num_items += 1;
+                    let fruit = fruit_res?;
+                    results_with_position.push((pos, fruit));
                 }
-                // this checks ensures that we filled of this
-                // uninitialized memory.
-                assert_eq!(num_items, results.len());
-                Ok(results)
+                results_with_position.sort_by_key(|(pos, _)| *pos);
+                assert_eq!(results_with_position.len(), num_fruits);
+                Ok(results_with_position
+                    .into_iter()
+                    .map(|(_, fruit)| fruit)
+                    .collect::<Vec<_>>())
             }
         }
     }
```
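The new executor code trades `Vec::set_len` over uninitialized memory for a position-tagged collection that is sorted afterwards. A self-contained sketch of the same pattern over `std::sync::mpsc` (the hunk does not show which channel type tantivy uses, so this is illustrative only):

```rust
use std::sync::mpsc;
use std::thread;

fn main() {
    let num_fruits = 4;
    let (sender, receiver) = mpsc::channel::<(usize, u64)>();
    for pos in 0..num_fruits {
        let sender = sender.clone();
        thread::spawn(move || {
            // Results may arrive in any order.
            sender.send((pos, (pos as u64) * 10)).unwrap();
        });
    }
    drop(sender); // lets the receiver iteration terminate

    // Collect (position, result) pairs, then restore submission order.
    let mut results_with_position: Vec<(usize, u64)> = receiver.iter().collect();
    results_with_position.sort_by_key(|(pos, _)| *pos);
    assert_eq!(results_with_position.len(), num_fruits);
    let results: Vec<u64> = results_with_position
        .into_iter()
        .map(|(_, fruit)| fruit)
        .collect();
    assert_eq!(results, vec![0, 10, 20, 30]);
}
```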
```diff
@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
 use directory::{Directory, RAMDirectory};
+use error::DataCorruption;
 use error::TantivyError;
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
     let meta_data = directory.atomic_read(&META_FILEPATH)?;
     let meta_string = String::from_utf8_lossy(&meta_data);
     serde_json::from_str(&meta_string)
-        .map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
+        .map_err(|e| {
+            DataCorruption::new(
+                META_FILEPATH.clone(),
+                format!("Meta file cannot be deserialized. {:?}.", e),
+            )
+        })
+        .map_err(From::from)
 }
 
 /// Search Index
```
```diff
@@ -1,7 +1,7 @@
 use core::MANAGED_FILEPATH;
 use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
 use directory::{ReadOnlySource, WritePtr};
-use error::TantivyError;
+use error::DataCorruption;
 use indexer::LockType;
 use serde_json;
 use std::collections::HashSet;
@@ -64,7 +64,12 @@ impl ManagedDirectory {
             Ok(data) => {
                 let managed_files_json = String::from_utf8_lossy(&data);
                 let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
-                    .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
+                    .map_err(|e| {
+                        DataCorruption::new(
+                            MANAGED_FILEPATH.clone(),
+                            format!("Managed file cannot be deserialized: {:?}. ", e),
+                        )
+                    })?;
                 Ok(ManagedDirectory {
                     directory: Box::new(directory),
                     meta_informations: Arc::new(RwLock::new(MetaInformation {
```
src/error.rs
```diff
@@ -8,9 +8,42 @@ use indexer::LockType;
 use query;
 use schema;
 use serde_json;
+use std::fmt;
 use std::path::PathBuf;
 use std::sync::PoisonError;
 
+pub struct DataCorruption {
+    filepath: Option<PathBuf>,
+    comment: String,
+}
+
+impl DataCorruption {
+    pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
+        DataCorruption {
+            filepath: Some(filepath),
+            comment,
+        }
+    }
+
+    pub fn comment_only(comment: String) -> DataCorruption {
+        DataCorruption {
+            filepath: None,
+            comment,
+        }
+    }
+}
+
+impl fmt::Debug for DataCorruption {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(f, "Data corruption: ")?;
+        if let Some(ref filepath) = &self.filepath {
+            write!(f, "(in file `{:?}`)", filepath)?;
+        }
+        write!(f, ": {}.", self.comment)?;
+        Ok(())
+    }
+}
+
 /// The library's failure based error enum
 #[derive(Debug, Fail)]
 pub enum TantivyError {
@@ -33,8 +66,8 @@ pub enum TantivyError {
     #[fail(display = "An IO error occurred: '{}'", _0)]
     IOError(#[cause] IOError),
     /// Data corruption.
-    #[fail(display = "File contains corrupted data: '{:?}'", _0)]
-    CorruptedFile(PathBuf),
+    #[fail(display = "{:?}", _0)]
+    DataCorruption(DataCorruption),
     /// A thread holding the locked panicked and poisoned the lock.
     #[fail(display = "A thread holding the locked panicked and poisoned the lock")]
     Poisoned,
@@ -55,6 +88,12 @@ pub enum TantivyError {
     SystemError(String),
 }
 
+impl From<DataCorruption> for TantivyError {
+    fn from(data_corruption: DataCorruption) -> TantivyError {
+        TantivyError::DataCorruption(data_corruption)
+    }
+}
+
 impl From<FastFieldNotAvailableError> for TantivyError {
     fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
         TantivyError::FastFieldError(fastfield_error)
```
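The `From<DataCorruption> for TantivyError` impl is what lets `load_metas` finish with `.map_err(From::from)` and what lets callers use `?` on a `Result<_, DataCorruption>`. A standalone sketch of the mechanism with simplified stand-in types (not the real definitions):

```rust
use std::path::PathBuf;

// Simplified stand-ins for the `DataCorruption` and `TantivyError`
// defined in the hunks above; not the real definitions.
#[derive(Debug)]
struct DataCorruption {
    filepath: Option<PathBuf>,
    comment: String,
}

#[derive(Debug)]
enum TantivyError {
    DataCorruption(DataCorruption),
}

impl From<DataCorruption> for TantivyError {
    fn from(data_corruption: DataCorruption) -> TantivyError {
        TantivyError::DataCorruption(data_corruption)
    }
}

// `?` applies `From::from` to the error type, so a function returning
// the crate-level error can propagate the narrower corruption error.
fn load(raw: &str) -> Result<u64, TantivyError> {
    let parsed = raw.parse::<u64>().map_err(|e| DataCorruption {
        filepath: Some(PathBuf::from("meta.json")), // hypothetical path
        comment: format!("Meta file cannot be deserialized. {:?}.", e),
    })?;
    Ok(parsed)
}

fn main() {
    assert!(load("42").is_ok());
    assert!(load("not a number").is_err());
}
```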
```diff
@@ -1,5 +1,6 @@
 use super::MultiValueIntFastFieldReader;
 use schema::Facet;
+use std::str;
 use termdict::TermDictionary;
 use termdict::TermOrdinal;
 use DocId;
@@ -20,6 +21,7 @@ use DocId;
 pub struct FacetReader {
     term_ords: MultiValueIntFastFieldReader<u64>,
     term_dict: TermDictionary,
+    buffer: Vec<u8>,
 }
 
 impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
         FacetReader {
             term_ords,
             term_dict,
+            buffer: vec![],
         }
     }
 
@@ -55,11 +58,18 @@ impl FacetReader {
     }
 
     /// Given a term ordinal returns the term associated to it.
-    pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
+    pub fn facet_from_ord(
+        &mut self,
+        facet_ord: TermOrdinal,
+        output: &mut Facet,
+    ) -> Result<(), str::Utf8Error> {
         let found_term = self
             .term_dict
-            .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
+            .ord_to_term(facet_ord as u64, &mut self.buffer);
         assert!(found_term, "Term ordinal {} no found.", facet_ord);
+        let facet_str = str::from_utf8(&self.buffer[..])?;
+        output.set_facet_str(facet_str);
+        Ok(())
     }
 
     /// Return the list of facet ordinals associated to a document.
@@ -82,20 +82,20 @@ mod tests {
 
         let mut facet = Facet::root();
         {
-            facet_reader.facet_from_ord(1, &mut facet);
+            facet_reader.facet_from_ord(1, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category"));
         }
         {
-            facet_reader.facet_from_ord(2, &mut facet);
+            facet_reader.facet_from_ord(2, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category/cat1"));
         }
         {
-            facet_reader.facet_from_ord(3, &mut facet);
-            assert_eq!(format!("{}", facet), "/category/cat2");
+            facet_reader.facet_from_ord(3, &mut facet).unwrap();
+            assert_eq!(facet, Facet::from("/category/cat2"));
         }
         {
-            facet_reader.facet_from_ord(4, &mut facet);
+            facet_reader.facet_from_ord(4, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category/cat3"));
         }
 
 
```
```diff
@@ -111,19 +111,18 @@ impl SegmentWriter {
         }
         match *field_options.field_type() {
             FieldType::HierarchicalFacet => {
-                let facets: Vec<&[u8]> = field_values
+                let facets: Vec<&str> = field_values
                     .iter()
                     .flat_map(|field_value| match *field_value.value() {
-                        Value::Facet(ref facet) => Some(facet.encoded_bytes()),
+                        Value::Facet(ref facet) => Some(facet.encoded_str()),
                         _ => {
                             panic!("Expected hierarchical facet");
                         }
                     })
                     .collect();
                 let mut term = Term::for_field(field); // we set the Term
-                for facet_bytes in facets {
+                for fake_str in facets {
                     let mut unordered_term_id_opt = None;
-                    let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
                     FacetTokenizer.token_stream(fake_str).process(&mut |token| {
                         term.set_text(&token.text);
                         let unordered_term_id =
```
```diff
@@ -126,7 +126,6 @@ impl SegmentPostings {
 fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
     let mut start = 0;
     let end = arr.len();
-    debug_assert!(target >= arr[start]);
     debug_assert!(target <= arr[end - 1]);
     let mut jump = 1;
     loop {
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
         // we're in the right block now, start with an exponential search
         let block_docs = self.block_cursor.docs();
 
-        debug_assert!(target >= self.doc());
         let new_cur = self
             .cur
             .wrapping_add(search_within_block(&block_docs[self.cur..], target));
 
         if need_positions {
             sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
                 .iter()
@@ -622,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
 #[cfg(test)]
 mod tests {
 
+    use super::exponential_search;
     use super::search_within_block;
     use super::BlockSegmentPostings;
     use super::BlockSegmentPostingsSkipResult;
@@ -635,6 +634,7 @@ mod tests {
     use schema::Term;
     use schema::INT_INDEXED;
     use DocId;
+    use SkipResult;
 
     #[test]
     fn test_empty_segment_postings() {
@@ -662,6 +662,16 @@ mod tests {
             .0
     }
 
+    #[test]
+    fn test_exponentiel_search() {
+        assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
+        assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
+        assert_eq!(
+            exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            (3, 7)
+        );
+    }
+
     fn util_test_search_within_block(block: &[u32], target: u32) {
         assert_eq!(
             search_within_block(block, target),
@@ -693,7 +703,7 @@ mod tests {
 
     #[test]
     fn test_block_segment_postings() {
-        let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
+        let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
         let mut offset: u32 = 0u32;
         // checking that the block before calling advance is empty
         assert!(block_segments.docs().is_empty());
@@ -707,14 +717,44 @@
         }
     }
 
-    fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
+    #[test]
+    fn test_skip_right_at_new_block() {
+        let mut doc_ids = (0..128).collect::<Vec<u32>>();
+        doc_ids.push(129);
+        doc_ids.push(130);
+        {
+            let block_segments = build_block_postings(&doc_ids);
+            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+            assert_eq!(docset.skip_next(128), SkipResult::OverStep);
+            assert_eq!(docset.doc(), 129);
+            assert!(docset.advance());
+            assert_eq!(docset.doc(), 130);
+            assert!(!docset.advance());
+        }
+        {
+            let block_segments = build_block_postings(&doc_ids);
+            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+            assert_eq!(docset.skip_next(129), SkipResult::Reached);
+            assert_eq!(docset.doc(), 129);
+            assert!(docset.advance());
+            assert_eq!(docset.doc(), 130);
+            assert!(!docset.advance());
+        }
+        {
+            let block_segments = build_block_postings(&doc_ids);
+            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+            assert_eq!(docset.skip_next(131), SkipResult::End);
+        }
+    }
+
+    fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
         let mut schema_builder = Schema::builder();
         let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
         let mut last_doc = 0u32;
-        for doc in docs {
+        for &doc in docs {
             for _ in last_doc..doc {
                 index_writer.add_document(doc!(int_field=>1u64));
             }
@@ -734,7 +774,7 @@ mod tests {
     #[test]
     fn test_block_segment_postings_skip() {
         for i in 0..4 {
-            let mut block_postings = build_block_postings(vec![3]);
+            let mut block_postings = build_block_postings(&[3]);
             assert_eq!(
                 block_postings.skip_to(i),
                 BlockSegmentPostingsSkipResult::Success(0u32)
@@ -744,7 +784,7 @@
                 BlockSegmentPostingsSkipResult::Terminated
             );
         }
-        let mut block_postings = build_block_postings(vec![3]);
+        let mut block_postings = build_block_postings(&[3]);
         assert_eq!(
             block_postings.skip_to(4u32),
             BlockSegmentPostingsSkipResult::Terminated
@@ -757,7 +797,7 @@
         for i in 0..1300 {
             docs.push((i * i / 100) + i);
         }
-        let mut block_postings = build_block_postings(docs.clone());
+        let mut block_postings = build_block_postings(&docs[..]);
         for i in vec![0, 424, 10000] {
             assert_eq!(
                 block_postings.skip_to(i),
```
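The new `test_exponentiel_search` cases pin down what `exponential_search` returns: a window `(start, end)` bracketing the target, grown by doubling the jump distance. The function body is not shown in the diff; the following is one hypothetical implementation consistent with those asserts (a galloping search):

```rust
// Returns a window `(start, end)` of `arr` bracketing `target`,
// found by doubling the probe distance on each step.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut start = 0;
    let mut jump = 1;
    loop {
        let candidate = start + jump;
        if candidate >= end {
            return (start, end);
        }
        if arr[candidate] >= target {
            return (start, candidate);
        }
        start = candidate;
        jump *= 2;
    }
}

fn main() {
    // The same values asserted by the new test above.
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
    assert_eq!(
        exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        (3, 7)
    );
}
```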
```diff
@@ -1,9 +1,7 @@
 mod expull;
 mod memory_arena;
-mod murmurhash2;
 mod term_hashmap;
 
 pub use self::expull::ExpUnrolledLinkedList;
 pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
-use self::murmurhash2::murmurhash2;
 pub use self::term_hashmap::{compute_table_size, TermHashMap};
```
```diff
@@ -1,87 +0,0 @@
-use std::ptr;
-
-const SEED: u32 = 3_242_157_231u32;
-const M: u32 = 0x5bd1_e995;
-
-#[inline(always)]
-pub fn murmurhash2(key: &[u8]) -> u32 {
-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-    let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
-    let len = key.len() as u32;
-    let mut h: u32 = SEED ^ len;
-
-    let num_blocks = len >> 2;
-    for _ in 0..num_blocks {
-        let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
-        k = k.wrapping_mul(M);
-        k ^= k >> 24;
-        k = k.wrapping_mul(M);
-        h = h.wrapping_mul(M);
-        h ^= k;
-        key_ptr = key_ptr.wrapping_offset(1);
-    }
-
-    // Handle the last few bytes of the input array
-    let remaining: &[u8] = &key[key.len() & !3..];
-    match remaining.len() {
-        3 => {
-            h ^= u32::from(remaining[2]) << 16;
-            h ^= u32::from(remaining[1]) << 8;
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        2 => {
-            h ^= u32::from(remaining[1]) << 8;
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        1 => {
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        _ => {}
-    }
-    h ^= h >> 13;
-    h = h.wrapping_mul(M);
-    h ^ (h >> 15)
-}
-
-#[cfg(test)]
-mod test {
-    use super::murmurhash2;
-    use std::collections::HashSet;
-
-    #[test]
-    fn test_murmur() {
-        let s1 = "abcdef";
-        let s2 = "abcdeg";
-        for i in 0..5 {
-            assert_eq!(
-                murmurhash2(&s1[i..5].as_bytes()),
-                murmurhash2(&s2[i..5].as_bytes())
-            );
-        }
-    }
-
-    #[test]
-    fn test_murmur_against_reference_impl() {
-        assert_eq!(murmurhash2("".as_bytes()), 3632506080);
-        assert_eq!(murmurhash2("a".as_bytes()), 455683869);
-        assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
-        assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
-        assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
-        assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
-        assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
-    }
-
-    #[test]
-    fn test_murmur_collisions() {
-        let mut set: HashSet<u32> = HashSet::default();
-        for i in 0..10_000 {
-            let s = format!("hash{}", i);
-            let hash = murmurhash2(s.as_bytes());
-            set.insert(hash);
-        }
-        assert_eq!(set.len(), 10_000);
-    }
-}
```
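This deletion is not lost functionality: the hash moved into the external `murmurhash32` crate added to Cargo.toml above, and `term_hashmap.rs` below switches to importing it. Usage is a drop-in replacement; the expected value here comes from the reference-implementation test being deleted:

```rust
extern crate murmurhash32;

use murmurhash32::murmurhash2;

fn main() {
    // Matches the reference value asserted in the deleted test above.
    assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
}
```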
```diff
@@ -1,4 +1,7 @@
-use super::murmurhash2;
+extern crate murmurhash32;
+
+use self::murmurhash32::murmurhash2;
+
 use super::{Addr, ArenaStorable, MemoryArena};
 use std::iter;
 use std::mem;
@@ -206,7 +209,7 @@ impl TermHashMap {
             self.resize();
         }
         let key_bytes: &[u8] = key.as_ref();
-        let hash = murmurhash2::murmurhash2(key.as_ref());
+        let hash = murmurhash2(key.as_ref());
         let mut probe = self.probe(hash);
         loop {
             let bucket = probe.next_probe();
```
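The hash feeds a linear-probe loop over the hash map's table. A sketch of how such a hash typically selects a starting bucket in a power-of-two table (illustrative only; tantivy's probing logic is not shown in the hunk):

```rust
extern crate murmurhash32;

use murmurhash32::murmurhash2;

fn main() {
    // For a power-of-two table, a bucket can be derived by masking
    // the hash down to the table size (hypothetical table here).
    let table_size: usize = 1 << 10;
    let hash = murmurhash2(b"some_term");
    let bucket = (hash as usize) & (table_size - 1);
    assert!(bucket < table_size);
    println!("bucket = {}", bucket);
}
```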
```diff
@@ -6,6 +6,7 @@ use std::borrow::Cow;
 use std::fmt::{self, Debug, Display, Formatter};
 use std::io::{self, Read, Write};
 use std::str;
+use std::string::FromUtf8Error;
 
 const SLASH_BYTE: u8 = b'/';
 const ESCAPE_BYTE: u8 = b'\\';
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
 /// representation of facets.
 pub const FACET_SEP_BYTE: u8 = 0u8;
 
+/// `char` used as a level separation in the binary
+/// representation of facets. (It is the null codepoint.)
+pub const FACET_SEP_CHAR: char = '\u{0}';
+
 /// A Facet represent a point in a given hierarchy.
 ///
 /// They are typically represented similarly to a filepath.
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
 /// its facet. In the example above, `/electronics/tv_and_video/`
 /// and `/electronics`.
 #[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
-pub struct Facet(Vec<u8>);
+pub struct Facet(String);
 
 impl Facet {
     /// Returns a new instance of the "root facet"
     /// Equivalent to `/`.
     pub fn root() -> Facet {
-        Facet(vec![])
+        Facet("".to_string())
     }
 
     /// Returns true iff the facet is the root facet `/`.
     pub fn is_root(&self) -> bool {
-        self.encoded_bytes().is_empty()
+        self.encoded_str().is_empty()
     }
 
     /// Returns a binary representation of the facet.
@@ -49,13 +54,19 @@ impl Facet {
     /// This representation has the benefit of making it possible to
     /// express "being a child of a given facet" as a range over
     /// the term ordinals.
-    pub fn encoded_bytes(&self) -> &[u8] {
+    pub fn encoded_str(&self) -> &str {
         &self.0
     }
 
+    pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
+        Facet(facet_string)
+    }
+
     /// Creates a `Facet` from its binary representation.
-    pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
-        Facet(encoded_bytes)
+    pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
+        // facet bytes validation. `0u8` is used a separator but that is still legal utf-8
+        //Ok(Facet(String::from_utf8(encoded_bytes)?))
+        String::from_utf8(encoded_bytes).map(Facet)
     }
 
     /// Parse a text representation of a facet.
@@ -79,36 +90,37 @@ impl Facet {
         Path: IntoIterator,
         Path::Item: ToString,
     {
-        let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
+        let mut facet_string: String = String::with_capacity(100);
         let mut step_it = path.into_iter();
         if let Some(step) = step_it.next() {
-            facet_bytes.extend_from_slice(step.to_string().as_bytes());
+            facet_string.push_str(&step.to_string());
         }
         for step in step_it {
-            facet_bytes.push(FACET_SEP_BYTE);
-            facet_bytes.extend_from_slice(step.to_string().as_bytes());
+            facet_string.push(FACET_SEP_CHAR);
+            facet_string.push_str(&step.to_string());
         }
-        Facet(facet_bytes)
+        Facet(facet_string)
     }
 
-    /// Accessor for the inner buffer of the `Facet`.
-    pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
-        &mut self.0
+    pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
+        self.0.clear();
+        self.0.push_str(facet_str);
     }
 
     /// Returns `true` iff other is a subfacet of `self`.
     pub fn is_prefix_of(&self, other: &Facet) -> bool {
-        let self_bytes: &[u8] = self.encoded_bytes();
-        let other_bytes: &[u8] = other.encoded_bytes();
-        self_bytes.len() < other_bytes.len()
-            && other_bytes.starts_with(self_bytes)
-            && other_bytes[self_bytes.len()] == 0u8
+        let self_str = self.encoded_str();
+        let other_str = other.encoded_str();
+        self_str.len() < other_str.len()
+            && other_str.starts_with(self_str)
+            && other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
     }
 }
 
-impl Borrow<[u8]> for Facet {
-    fn borrow(&self) -> &[u8] {
-        self.encoded_bytes()
+impl Borrow<str> for Facet {
+    fn borrow(&self) -> &str {
+        self.encoded_str()
     }
 }
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
         Idle,
     }
     let path: &str = path_asref.as_ref();
-    let mut facet_encoded = Vec::new();
+    assert!(!path.is_empty());
+    assert!(path.starts_with("/"));
+    let mut facet_encoded = String::new();
     let mut state = State::Idle;
     let path_bytes = path.as_bytes();
-    for &c in &path_bytes[1..] {
+    let mut last_offset = 1;
+    for i in 1..path_bytes.len() {
+        let c = path_bytes[i];
         match (state, c) {
-            (State::Idle, ESCAPE_BYTE) => state = State::Escaped,
+            (State::Idle, ESCAPE_BYTE) => {
+                facet_encoded.push_str(&path[last_offset..i]);
+                last_offset = i + 1;
+                state = State::Escaped
+            }
             (State::Idle, SLASH_BYTE) => {
-                facet_encoded.push(FACET_SEP_BYTE);
+                facet_encoded.push_str(&path[last_offset..i]);
+                facet_encoded.push(FACET_SEP_CHAR);
+                last_offset = i + 1;
             }
-            (State::Escaped, any_char) => {
+            (State::Escaped, _escaped_char) => {
                 state = State::Idle;
-                facet_encoded.push(any_char);
-            }
-            (State::Idle, other_char) => {
-                facet_encoded.push(other_char);
             }
+            (State::Idle, _any_char) => {}
         }
     }
+    facet_encoded.push_str(&path[last_offset..]);
     Facet(facet_encoded)
 }
 }
 
 impl BinarySerializable for Facet {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        <Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
+        <String as BinarySerializable>::serialize(&self.0, writer)
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
-        Ok(Facet(bytes))
+        Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
     }
 }
 
 impl Display for Facet {
     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
+        for step in self.0.split(FACET_SEP_CHAR) {
             write!(f, "/")?;
-            let step_str = unsafe { str::from_utf8_unchecked(step) };
-            write!(f, "{}", escape_slashes(step_str))?;
+            write!(f, "{}", escape_slashes(step))?;
         }
         Ok(())
     }
```
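The switch from `Facet(Vec<u8>)` to `Facet(String)` keeps the same encoding, now guaranteed valid UTF-8: path steps joined by the NUL code point. A standalone sketch of the encoding contract and the `is_prefix_of` logic shown above (model functions, not the real API):

```rust
// Model of the encoding in the hunks above: steps joined by '\u{0}'.
fn encode(steps: &[&str]) -> String {
    steps.join("\u{0}")
}

// Mirrors the new `is_prefix_of` body shown in the diff.
fn is_prefix_of(this: &str, other: &str) -> bool {
    this.len() < other.len()
        && other.starts_with(this)
        && other.as_bytes()[this.len()] == 0u8
}

fn main() {
    let parent = encode(&["electronics"]);
    let child = encode(&["electronics", "tv_and_video"]);
    assert_eq!(child, "electronics\u{0}tv_and_video");
    assert!(is_prefix_of(&parent, &child));

    // A facet that merely shares leading characters is not a subfacet.
    let lookalike = encode(&["electronics_accessories"]);
    assert!(!is_prefix_of(&parent, &lookalike));
}
```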
```diff
@@ -32,7 +32,7 @@ impl Term {
 
     /// Creates a `Term` given a facet.
     pub fn from_facet(field: Field, facet: &Facet) -> Term {
-        let bytes = facet.encoded_bytes();
+        let bytes = facet.encoded_str().as_bytes();
         let buffer = Vec::with_capacity(4 + bytes.len());
         let mut term = Term(buffer);
         term.set_field(field);
@@ -68,12 +68,7 @@ impl Term {
         term
     }
 
-    /// Creates a new Term with an empty buffer,
-    /// but with a given capacity.
-    ///
-    /// It is declared unsafe, as the term content
-    /// is not initialized, and a call to `.field()`
-    /// would panic.
+    /// Creates a new Term for a given field.
     pub(crate) fn for_field(field: Field) -> Term {
         let mut term = Term(Vec::with_capacity(100));
         term.set_field(field);
@@ -167,7 +167,7 @@ mod tests {
         let mut term_string = String::new();
         while term_it.advance() {
             //let term = Term::from_bytes(term_it.key());
-            term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
+            term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
         }
         assert_eq!(&*term_string, "abcdef");
     }
```
```diff
@@ -1,6 +1,5 @@
 use super::{Token, TokenStream, Tokenizer};
 use schema::FACET_SEP_BYTE;
-use std::str;
 
 /// The `FacetTokenizer` process a `Facet` binary representation
 /// and emits a token for all of its parent.
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
                 .position(|b| b == FACET_SEP_BYTE)
                 .map(|pos| cursor + 1 + pos)
             {
-                let facet_part =
-                    unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
+                let facet_part = &self.text[cursor..next_sep_pos];
                 self.token.text.push_str(facet_part);
                 self.state = State::UpToPosition(next_sep_pos);
             } else {
-                let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
+                let facet_part = &self.text[cursor..];
                 self.token.text.push_str(facet_part);
                 self.state = State::Terminated;
             }
@@ -86,7 +84,6 @@ mod tests {
 
     use super::FacetTokenizer;
     use schema::Facet;
-    use std::str;
     use tokenizer::{Token, TokenStream, Tokenizer};
 
     #[test]
@@ -95,11 +92,11 @@ mod tests {
         let mut tokens = vec![];
         {
             let mut add_token = |token: &Token| {
-                let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
+                .token_stream(facet.encoded_str())
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
@@ -115,11 +112,11 @@ mod tests {
         let mut tokens = vec![];
         {
             let mut add_token = |token: &Token| {
-                let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
+                .token_stream(facet.encoded_str()) // ok test
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);
```
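Per the doc comment above, the tokenizer emits one token per ancestor facet plus the facet itself, each token being an encoded prefix ending just before a separator. A standalone model of that output (illustrative; the tests' actual facet values are not shown in the hunk):

```rust
// Standalone model of the stream's output: one token per ancestor,
// each being the encoded prefix that ends just before a separator.
fn facet_tokens(encoded: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for (idx, c) in encoded.char_indices() {
        if c == '\u{0}' {
            tokens.push(encoded[..idx].to_string());
        }
    }
    tokens.push(encoded.to_string()); // finally, the facet itself
    tokens
}

fn main() {
    // "/top/a/b" encodes to "top\u{0}a\u{0}b".
    assert_eq!(
        facet_tokens("top\u{0}a\u{0}b"),
        vec!["top", "top\u{0}a", "top\u{0}a\u{0}b"]
    );
}
```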