Compare commits

...

11 Commits

Author SHA1 Message Date
Paul Masurel
b8241c5603 0.8.0 2018-12-26 10:18:34 +09:00
Paul Masurel
a4745151c0 Version to 0.8 2018-12-26 10:11:06 +09:00
Paul Masurel
e2ce326a8c Merge branch 'issue/457' 2018-12-18 10:35:01 +09:00
Paul Masurel
bb21d12a70 Bumping version 2018-12-18 10:14:12 +09:00
Paul Masurel
4565aba62a Added unit test for exponential search 2018-12-18 09:24:31 +09:00
Paul Masurel
545a7ec8dd Closes #457 2018-12-18 09:18:46 +09:00
Paul Masurel
e68775d71c Format and update murmurhash32 version 2018-12-17 19:12:38 +09:00
Paul Masurel
dcc92d287e Facet remove unsafe (#456)
* Removing some unsafe

* Removing some unsafe (2)

* Remove murmurhash
2018-12-17 19:08:48 +09:00
Paul Masurel
b48f81c051 Removing unsafe from bitpacking code (#455) 2018-12-17 19:06:37 +09:00
Paul Masurel
a3042e956b Facet remove unsafe (#454)
* Removing some unsafe

* Removing some unsafe (2)
2018-12-17 09:31:09 +09:00
dependabot[bot]
1fa10f0a0b Update itertools requirement from 0.7 to 0.8 (#453)
Updates the requirements on [itertools](https://github.com/bluss/rust-itertools) to permit the latest version.
- [Release notes](https://github.com/bluss/rust-itertools/releases)
- [Commits](https://github.com/bluss/rust-itertools/commits/0.8.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-12-17 09:28:36 +09:00
19 changed files with 222 additions and 201 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.1
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.0-dev"
version = "0.8.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.7"
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,6 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -369,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
}
FacetCounts { facet_counts }
}
@@ -403,9 +404,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));

View File

@@ -1,9 +1,6 @@
use common::serialize::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: Write>(
pub fn write<TWrite: io::Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -28,14 +25,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -43,17 +40,18 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -102,9 +100,7 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -126,9 +122,7 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;

View File

@@ -64,17 +64,18 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
results[pos] = fruit_res?;
num_items += 1;
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
// this checks ensures that we filled of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}

View File

@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
}
/// Search Index

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::TantivyError;
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
@@ -64,7 +64,12 @@ impl ManagedDirectory {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {

View File

@@ -8,9 +8,42 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -33,8 +66,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -55,6 +88,12 @@ pub enum TantivyError {
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,5 +1,6 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -20,6 +21,7 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -55,11 +58,18 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
.ord_to_term(facet_ord as u64, &mut self.buffer);
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet);
facet_reader.facet_from_ord(1, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet);
facet_reader.facet_from_ord(2, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet);
facet_reader.facet_from_ord(3, &mut facet).unwrap();
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet);
facet_reader.facet_from_ord(4, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -111,19 +111,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets {
for fake_str in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =

View File

@@ -126,7 +126,6 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -622,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -635,6 +634,7 @@ mod tests {
use schema::Term;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -662,6 +662,16 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -693,7 +703,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -707,14 +717,44 @@ mod tests {
}
}
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for doc in docs {
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -734,7 +774,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -744,7 +784,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -757,7 +797,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(docs.clone());
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

View File

@@ -1,9 +1,7 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,87 +0,0 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,4 +1,7 @@
use super::murmurhash2;
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
@@ -206,7 +209,7 @@ impl TermHashMap {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();

View File

@@ -6,6 +6,7 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\';
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represent a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(Vec<u8>);
pub struct Facet(String);
impl Facet {
/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
Facet(vec![])
Facet("".to_string())
}
/// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool {
self.encoded_bytes().is_empty()
self.encoded_str().is_empty()
}
/// Returns a binary representation of the facet.
@@ -49,13 +54,19 @@ impl Facet {
/// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over
/// the term ordinals.
pub fn encoded_bytes(&self) -> &[u8] {
pub fn encoded_str(&self) -> &str {
&self.0
}
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation.
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
Facet(encoded_bytes)
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
}
/// Parse a text representation of a facet.
@@ -79,36 +90,37 @@ impl Facet {
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut facet_string: String = String::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push_str(&step.to_string());
}
for step in step_it {
facet_bytes.push(FACET_SEP_BYTE);
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push(FACET_SEP_CHAR);
facet_string.push_str(&step.to_string());
}
Facet(facet_bytes)
Facet(facet_string)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
}
/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_bytes: &[u8] = self.encoded_bytes();
let other_bytes: &[u8] = other.encoded_bytes();
self_bytes.len() < other_bytes.len()
&& other_bytes.starts_with(self_bytes)
&& other_bytes[self_bytes.len()] == 0u8
let self_str = self.encoded_str();
let other_str = other.encoded_str();
self_str.len() < other_str.len()
&& other_str.starts_with(self_str)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
}
}
impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &[u8] {
self.encoded_bytes()
impl Borrow<str> for Facet {
fn borrow(&self) -> &str {
self.encoded_str()
}
}
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle,
}
let path: &str = path_asref.as_ref();
let mut facet_encoded = Vec::new();
assert!(!path.is_empty());
assert!(path.starts_with("/"));
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
for &c in &path_bytes[1..] {
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push(FACET_SEP_BYTE);
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, any_char) => {
(State::Escaped, _escaped_char) => {
state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
}
}
impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
<String as BinarySerializable>::serialize(&self.0, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
}
}
impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
for step in self.0.split(FACET_SEP_CHAR) {
write!(f, "/")?;
let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
write!(f, "{}", escape_slashes(step))?;
}
Ok(())
}

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_bytes();
let bytes = facet.encoded_str().as_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
@@ -68,12 +68,7 @@ impl Term {
term
}
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);

View File

@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
}
assert_eq!(&*term_string, "abcdef");
}

View File

@@ -1,6 +1,5 @@
use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` process a `Facet` binary representation
/// and emits a token for all of its parent.
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
}
@@ -86,7 +84,6 @@ mod tests {
use super::FacetTokenizer;
use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer};
#[test]
@@ -95,11 +92,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -115,11 +112,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.token_stream(facet.encoded_str()) // ok test
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);