Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
2267722d01 Added SegmentFruit 2018-12-13 08:58:00 +09:00
31 changed files with 301 additions and 390 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.0 Tantivy 0.8.1
===================== =====================
*No change in the index format* *No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton) - API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.8.2" version = "0.8.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
serde_json = "1.0" serde_json = "1.0"
num_cpus = "1.2" num_cpus = "1.2"
itertools = "0.8" itertools = "0.7"
levenshtein_automata = {version="0.1", features=["fst_automaton"]} levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5" bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] } uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,7 +49,6 @@ failure = "0.1"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = "0.2" fail = "0.2"
scoped-pool = "1.0" scoped-pool = "1.0"
murmurhash32 = "0.2"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.2" winapi = "0.2"

View File

@@ -70,6 +70,8 @@ impl Collector for StatsCollector {
// Our standard deviation will be a float. // Our standard deviation will be a float.
type Fruit = Option<Stats>; type Fruit = Option<Stats>;
type SegmentFruit = Self::Fruit;
type Child = StatsSegmentCollector; type Child = StatsSegmentCollector;
fn for_segment( fn for_segment(

View File

@@ -58,6 +58,7 @@ pub struct Count;
impl Collector for Count { impl Collector for Count {
type Fruit = usize; type Fruit = usize;
type SegmentFruit = usize;
type Child = SegmentCountCollector; type Child = SegmentCountCollector;

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult { ) -> SkipResult {
loop { loop {
match collapse_it.peek() { match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) { Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Ordering::Less => {} Ordering::Less => {}
Ordering::Greater => { Ordering::Greater => {
return SkipResult::OverStep; return SkipResult::OverStep;
@@ -258,6 +258,8 @@ impl FacetCollector {
impl Collector for FacetCollector { impl Collector for FacetCollector {
type Fruit = FacetCounts; type Fruit = FacetCounts;
type SegmentFruit = FacetCounts;
type Child = FacetSegmentCollector; type Child = FacetSegmentCollector;
fn for_segment( fn for_segment(
@@ -369,8 +371,7 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![]; let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord]; let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet); facet_dict.ord_to_term(facet_ord as u64, &mut facet);
// TODO facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
} }
FacetCounts { facet_counts } FacetCounts { facet_counts }
} }
@@ -404,9 +405,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() { let right_bound = if facet.is_root() {
Bound::Unbounded Bound::Unbounded
} else { } else {
let mut facet_after_bytes: String = facet.encoded_str().to_owned(); let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push('\u{1}'); facet_after_bytes.push(1u8);
let facet_after = Facet::from_encoded_string(facet_after_bytes); let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
Bound::Excluded(facet_after) Bound::Excluded(facet_after)
}; };
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound)); let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));

View File

@@ -136,8 +136,10 @@ pub trait Collector: Sync {
/// e.g. `usize` for the `Count` collector. /// e.g. `usize` for the `Count` collector.
type Fruit: Fruit; type Fruit: Fruit;
type SegmentFruit: Fruit;
/// Type of the `SegmentCollector` associated to this collector. /// Type of the `SegmentCollector` associated to this collector.
type Child: SegmentCollector<Fruit = Self::Fruit>; type Child: SegmentCollector<Fruit = Self::SegmentFruit>;
/// `set_segment` is called before beginning to enumerate /// `set_segment` is called before beginning to enumerate
/// on this segment. /// on this segment.
@@ -152,7 +154,7 @@ pub trait Collector: Sync {
/// Combines the fruit associated to the collection of each segments /// Combines the fruit associated to the collection of each segments
/// into one fruit. /// into one fruit.
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>; fn merge_fruits(&self, segment_fruits: Vec<Self::SegmentFruit>) -> Result<Self::Fruit>;
} }
/// The `SegmentCollector` is the trait in charge of defining the /// The `SegmentCollector` is the trait in charge of defining the
@@ -181,6 +183,9 @@ where
Right: Collector, Right: Collector,
{ {
type Fruit = (Left::Fruit, Right::Fruit); type Fruit = (Left::Fruit, Right::Fruit);
type SegmentFruit = (Left::SegmentFruit, Right::SegmentFruit);
type Child = (Left::Child, Right::Child); type Child = (Left::Child, Right::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> { fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -195,7 +200,7 @@ where
fn merge_fruits( fn merge_fruits(
&self, &self,
children: Vec<(Left::Fruit, Right::Fruit)>, children: Vec<(Left::SegmentFruit, Right::SegmentFruit)>,
) -> Result<(Left::Fruit, Right::Fruit)> { ) -> Result<(Left::Fruit, Right::Fruit)> {
let mut left_fruits = vec![]; let mut left_fruits = vec![];
let mut right_fruits = vec![]; let mut right_fruits = vec![];
@@ -236,6 +241,7 @@ where
Three: Collector, Three: Collector,
{ {
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit); type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit);
type Child = (One::Child, Two::Child, Three::Child); type Child = (One::Child, Two::Child, Three::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> { fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -249,7 +255,7 @@ where
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring() self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
} }
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> { fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![]; let mut one_fruits = vec![];
let mut two_fruits = vec![]; let mut two_fruits = vec![];
let mut three_fruits = vec![]; let mut three_fruits = vec![];
@@ -295,6 +301,7 @@ where
Four: Collector, Four: Collector,
{ {
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit); type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
type SegmentFruit = (One::SegmentFruit, Two::SegmentFruit, Three::SegmentFruit, Four::SegmentFruit);
type Child = (One::Child, Two::Child, Three::Child, Four::Child); type Child = (One::Child, Two::Child, Three::Child, Four::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> { fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
@@ -312,7 +319,7 @@ where
|| self.3.requires_scoring() || self.3.requires_scoring()
} }
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> { fn merge_fruits(&self, children: Vec<Self::SegmentFruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![]; let mut one_fruits = vec![];
let mut two_fruits = vec![]; let mut two_fruits = vec![];
let mut three_fruits = vec![]; let mut three_fruits = vec![];

View File

@@ -18,6 +18,7 @@ pub struct CollectorWrapper<TCollector: Collector>(TCollector);
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> { impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
type Fruit = Box<Fruit>; type Fruit = Box<Fruit>;
type SegmentFruit = Box<Fruit>;
type Child = Box<BoxableSegmentCollector>; type Child = Box<BoxableSegmentCollector>;
fn for_segment( fn for_segment(
@@ -34,10 +35,10 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
} }
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> { fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
let typed_fruit: Vec<TCollector::Fruit> = children let typed_fruit: Vec<TCollector::SegmentFruit> = children
.into_iter() .into_iter()
.map(|untyped_fruit| { .map(|untyped_fruit| {
Downcast::<TCollector::Fruit>::downcast(untyped_fruit) Downcast::<TCollector::SegmentFruit>::downcast(untyped_fruit)
.map(|boxed_but_typed| *boxed_but_typed) .map(|boxed_but_typed| *boxed_but_typed)
.map_err(|e| { .map_err(|e| {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e); let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
@@ -152,7 +153,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
#[derive(Default)] #[derive(Default)]
pub struct MultiCollector<'a> { pub struct MultiCollector<'a> {
collector_wrappers: collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>, Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>, SegmentFruit = Box<Fruit>> + 'a>>,
} }
impl<'a> MultiCollector<'a> { impl<'a> MultiCollector<'a> {
@@ -177,7 +178,9 @@ impl<'a> MultiCollector<'a> {
} }
impl<'a> Collector for MultiCollector<'a> { impl<'a> Collector for MultiCollector<'a> {
type Fruit = MultiFruit; type Fruit = MultiFruit;
type SegmentFruit = MultiFruit;
type Child = MultiCollectorChild; type Child = MultiCollectorChild;
fn for_segment( fn for_segment(

View File

@@ -40,6 +40,7 @@ impl TestFruit {
impl Collector for TestCollector { impl Collector for TestCollector {
type Fruit = TestFruit; type Fruit = TestFruit;
type SegmentFruit = Self::Fruit;
type Child = TestSegmentCollector; type Child = TestSegmentCollector;
fn for_segment( fn for_segment(
@@ -109,6 +110,8 @@ impl FastFieldTestCollector {
impl Collector for FastFieldTestCollector { impl Collector for FastFieldTestCollector {
type Fruit = Vec<u64>; type Fruit = Vec<u64>;
type SegmentFruit = Self::Fruit;
type Child = FastFieldSegmentCollector; type Child = FastFieldSegmentCollector;
fn for_segment( fn for_segment(
@@ -165,6 +168,7 @@ impl BytesFastFieldTestCollector {
impl Collector for BytesFastFieldTestCollector { impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>; type Fruit = Vec<u8>;
type SegmentFruit = Self::Fruit;
type Child = BytesFastFieldSegmentCollector; type Child = BytesFastFieldSegmentCollector;
fn for_segment( fn for_segment(

View File

@@ -88,6 +88,7 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> { impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
type Fruit = Vec<(T, DocAddress)>; type Fruit = Vec<(T, DocAddress)>;
type SegmentFruit = Vec<(T, DocAddress)>;
type Child = TopFieldSegmentCollector<T>; type Child = TopFieldSegmentCollector<T>;

View File

@@ -89,6 +89,7 @@ impl TopDocs {
impl Collector for TopDocs { impl Collector for TopDocs {
type Fruit = Vec<(Score, DocAddress)>; type Fruit = Vec<(Score, DocAddress)>;
type SegmentFruit = Vec<(Score, DocAddress)>;
type Child = TopScoreSegmentCollector; type Child = TopScoreSegmentCollector;

View File

@@ -1,6 +1,9 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use common::serialize::BinarySerializable;
use std::io; use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref; use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker { pub(crate) struct BitPacker {
mini_buffer: u64, mini_buffer: u64,
@@ -15,7 +18,7 @@ impl BitPacker {
} }
} }
pub fn write<TWrite: io::Write>( pub fn write<TWrite: Write>(
&mut self, &mut self,
val: u64, val: u64,
num_bits: u8, num_bits: u8,
@@ -25,14 +28,14 @@ impl BitPacker {
let num_bits = num_bits as usize; let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 { if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32); self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?; self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32); self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64; self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else { } else {
self.mini_buffer |= val_u64 << self.mini_buffer_written; self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits; self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 { if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?; self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0; self.mini_buffer_written = 0;
self.mini_buffer = 0u64; self.mini_buffer = 0u64;
} }
@@ -40,18 +43,17 @@ impl BitPacker {
Ok(()) Ok(())
} }
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> { pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 { if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8; let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8]; let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?; output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0; self.mini_buffer_written = 0;
} }
Ok(()) Ok(())
} }
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> { pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?; self.flush(output)?;
// Padding the write file to simplify reads. // Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?; output.write_all(&[0u8; 7])?;
@@ -100,7 +102,9 @@ where
addr + 8 <= data.len(), addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes." "The fast field field should have been padded with 7 bytes."
); );
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]); #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask val_shifted & mask
} }
@@ -122,7 +126,9 @@ where
for output_val in output.iter_mut() { for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3; let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]); #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask; *output_val = val_shifted & mask;
addr_in_bits += num_bits; addr_in_bits += num_bits;

View File

@@ -64,18 +64,17 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to // This is important as it makes it possible for the fruit_receiver iteration to
// terminate. // terminate.
}; };
// This is lame, but it does not use unsafe code. let mut results = Vec::with_capacity(num_fruits);
let mut results_with_position = Vec::with_capacity(num_fruits); unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
for (pos, fruit_res) in fruit_receiver { for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?; results[pos] = fruit_res?;
results_with_position.push((pos, fruit)); num_items += 1;
} }
results_with_position.sort_by_key(|(pos, _)| *pos); // this checks ensures that we filled of this
assert_eq!(results_with_position.len(), num_fruits); // uninitialized memory.
Ok(results_with_position assert_eq!(num_items, results.len());
.into_iter() Ok(results)
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
} }
} }
} }

View File

@@ -13,7 +13,6 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
use directory::MmapDirectory; use directory::MmapDirectory;
use directory::{Directory, RAMDirectory}; use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError; use error::TantivyError;
use indexer::index_writer::open_index_writer; use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN; use indexer::index_writer::HEAP_SIZE_MIN;
@@ -38,13 +37,7 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data); let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string) serde_json::from_str(&meta_string)
.map_err(|e| { .map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
} }
/// Search Index /// Search Index
@@ -150,7 +143,7 @@ impl Index {
/// ///
/// This will overwrite existing meta.json /// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> { fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?; save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema); let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas) Index::create_from_metas(directory, &metas)
} }

View File

@@ -23,7 +23,7 @@ fn collect_segment<C: Collector>(
weight: &Weight, weight: &Weight,
segment_ord: u32, segment_ord: u32,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
) -> Result<C::Fruit> { ) -> Result<C::SegmentFruit> {
let mut scorer = weight.scorer(segment_reader)?; let mut scorer = weight.scorer(segment_reader)?;
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?; let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
if let Some(delete_bitset) = segment_reader.delete_bitset() { if let Some(delete_bitset) = segment_reader.delete_bitset() {

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH; use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr}; use directory::{ReadOnlySource, WritePtr};
use error::DataCorruption; use error::TantivyError;
use indexer::LockType; use indexer::LockType;
use serde_json; use serde_json;
use std::collections::HashSet; use std::collections::HashSet;
@@ -64,12 +64,7 @@ impl ManagedDirectory {
Ok(data) => { Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data); let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json) let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| { .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory { Ok(ManagedDirectory {
directory: Box::new(directory), directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation { meta_informations: Arc::new(RwLock::new(MetaInformation {

View File

@@ -8,42 +8,9 @@ use indexer::LockType;
use query; use query;
use schema; use schema;
use serde_json; use serde_json;
use std::fmt;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::PoisonError; use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum /// The library's failure based error enum
#[derive(Debug, Fail)] #[derive(Debug, Fail)]
pub enum TantivyError { pub enum TantivyError {
@@ -66,8 +33,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)] #[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError), IOError(#[cause] IOError),
/// Data corruption. /// Data corruption.
#[fail(display = "{:?}", _0)] #[fail(display = "File contains corrupted data: '{:?}'", _0)]
DataCorruption(DataCorruption), CorruptedFile(PathBuf),
/// A thread holding the locked panicked and poisoned the lock. /// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")] #[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned, Poisoned,
@@ -88,12 +55,6 @@ pub enum TantivyError {
SystemError(String), SystemError(String),
} }
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError { impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError { fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error) TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,6 +1,5 @@
use super::MultiValueIntFastFieldReader; use super::MultiValueIntFastFieldReader;
use schema::Facet; use schema::Facet;
use std::str;
use termdict::TermDictionary; use termdict::TermDictionary;
use termdict::TermOrdinal; use termdict::TermOrdinal;
use DocId; use DocId;
@@ -21,7 +20,6 @@ use DocId;
pub struct FacetReader { pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>, term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary, term_dict: TermDictionary,
buffer: Vec<u8>,
} }
impl FacetReader { impl FacetReader {
@@ -39,7 +37,6 @@ impl FacetReader {
FacetReader { FacetReader {
term_ords, term_ords,
term_dict, term_dict,
buffer: vec![],
} }
} }
@@ -58,18 +55,11 @@ impl FacetReader {
} }
/// Given a term ordinal returns the term associated to it. /// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord( pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self let found_term = self
.term_dict .term_dict
.ord_to_term(facet_ord as u64, &mut self.buffer); .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord); assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
} }
/// Return the list of facet ordinals associated to a document. /// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root(); let mut facet = Facet::root();
{ {
facet_reader.facet_from_ord(1, &mut facet).unwrap(); facet_reader.facet_from_ord(1, &mut facet);
assert_eq!(facet, Facet::from("/category")); assert_eq!(facet, Facet::from("/category"));
} }
{ {
facet_reader.facet_from_ord(2, &mut facet).unwrap(); facet_reader.facet_from_ord(2, &mut facet);
assert_eq!(facet, Facet::from("/category/cat1")); assert_eq!(facet, Facet::from("/category/cat1"));
} }
{ {
facet_reader.facet_from_ord(3, &mut facet).unwrap(); facet_reader.facet_from_ord(3, &mut facet);
assert_eq!(format!("{}", facet), "/category/cat2"); assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2")); assert_eq!(facet, Facet::from("/category/cat2"));
} }
{ {
facet_reader.facet_from_ord(4, &mut facet).unwrap(); facet_reader.facet_from_ord(4, &mut facet);
assert_eq!(facet, Facet::from("/category/cat3")); assert_eq!(facet, Facet::from("/category/cat3"));
} }

View File

@@ -558,8 +558,11 @@ impl IndexWriter {
// and recreate a new one channels. // and recreate a new one channels.
self.recreate_document_channel(); self.recreate_document_channel();
let former_workers_join_handle = let mut former_workers_join_handle = Vec::new();
mem::replace(&mut self.workers_join_handle, Vec::new()); swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle { for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle let indexing_worker_result = worker_handle
@@ -736,7 +739,7 @@ mod tests {
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c")); index_writer.add_document(doc!(text_field=>"c"));
} }
assert_eq!(index_writer.commit().unwrap(), 3u64); assert_eq!(index_writer.commit().unwrap(), 2u64);
index.load_searchers().unwrap(); index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1); assert_eq!(num_docs_containing("b"), 1);
@@ -799,6 +802,7 @@ mod tests {
{ {
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed"); let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed"); prepared_commit.commit().expect("commit failed");
} }
{ {
@@ -832,6 +836,7 @@ mod tests {
{ {
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed"); let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed"); prepared_commit.abort().expect("commit failed");
} }
{ {

View File

@@ -654,7 +654,6 @@ mod tests {
use schema::IntOptions; use schema::IntOptions;
use schema::Term; use schema::Term;
use schema::TextFieldIndexing; use schema::TextFieldIndexing;
use schema::INT_INDEXED;
use std::io::Cursor; use std::io::Cursor;
use DocAddress; use DocAddress;
use IndexWriter; use IndexWriter;
@@ -984,7 +983,7 @@ mod tests {
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
index.load_searchers().unwrap(); index.load_searchers().unwrap();
let searcher = index.searcher(); let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1030,7 +1029,7 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); index.load_searchers().unwrap();
let searcher = index.searcher(); let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1126,7 +1125,6 @@ mod tests {
{ {
// Test removing all docs // Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g")); index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
@@ -1257,34 +1255,6 @@ mod tests {
} }
} }
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(index.searcher().num_docs(), 2);
}
#[test] #[test]
fn test_merge_multivalued_int_fields_all_deleted() { fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();

View File

@@ -18,6 +18,7 @@ use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes; use indexer::index_writer::advance_deletes;
use indexer::merger::IndexMerger; use indexer::merger::IndexMerger;
use indexer::stamper::Stamper; use indexer::stamper::Stamper;
use indexer::MergeCandidate;
use indexer::SegmentEntry; use indexer::SegmentEntry;
use indexer::SegmentSerializer; use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy}; use indexer::{DefaultMergePolicy, MergePolicy};
@@ -44,15 +45,8 @@ use Result;
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> { pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas( save_metas(vec![], schema, opstamp, None, directory)
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None
},
directory)
} }
/// Save the index meta file. /// Save the index meta file.
@@ -64,17 +58,20 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
fn save_metas( pub fn save_metas(
metas: &IndexMeta, segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
payload: Option<String>,
directory: &mut Directory, directory: &mut Directory,
) -> Result<()> { ) -> Result<()> {
// let metas = IndexMeta { let metas = IndexMeta {
// segments: segment_metas, segments: segment_metas,
// schema, schema,
// opstamp, opstamp,
// payload, payload,
// }; };
let mut buffer = serde_json::to_vec_pretty(metas)?; let mut buffer = serde_json::to_vec_pretty(&metas)?;
writeln!(&mut buffer)?; writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?; directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -89,11 +86,6 @@ fn save_metas(
#[derive(Clone)] #[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>); pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
struct MergeOperation {
pub target_opstamp: u64,
pub segment_ids: Vec<SegmentId>,
}
fn perform_merge( fn perform_merge(
index: &Index, index: &Index,
mut segment_entries: Vec<SegmentEntry>, mut segment_entries: Vec<SegmentEntry>,
@@ -134,13 +126,6 @@ fn perform_merge(
} }
struct InnerSegmentUpdater { struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file everytime we need it in the
// `SegmentUpdater`.
//
// This should be up to date as all update happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool, pool: CpuPool,
index: Index, index: Index,
segment_manager: SegmentManager, segment_manager: SegmentManager,
@@ -164,9 +149,7 @@ impl SegmentUpdater {
.name_prefix("segment_updater") .name_prefix("segment_updater")
.pool_size(1) .pool_size(1)
.create(); .create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater { Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool, pool,
index, index,
segment_manager, segment_manager,
@@ -261,18 +244,14 @@ impl SegmentUpdater {
// //
// Segment 1 from disk 1, Segment 1 from disk 2, etc. // Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32)); commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message
};
save_metas( save_metas(
&index_meta, commited_segment_metas,
index.schema(),
opstamp,
commit_message,
directory.box_clone().borrow_mut(), directory.box_clone().borrow_mut(),
) )
.expect("Could not save metas."); .expect("Could not save metas.");
self.store_meta(&index_meta);
} }
} }
@@ -307,27 +286,16 @@ impl SegmentUpdater {
} }
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> { pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
//let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec(); let segment_ids_vec = segment_ids.to_vec();
let commit_opstamp = self.load_metas().opstamp;
self.run_async(move |segment_updater| { self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp) segment_updater.start_merge_impl(&segment_ids_vec[..])
}) })
.wait()? .wait()?
} }
fn store_meta(&self, index_meta: &IndexMeta) {
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
}
// `segment_ids` is required to be non-empty. // `segment_ids` is required to be non-empty.
fn start_merge_impl( fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
&self,
segment_ids: &[SegmentId],
target_opstamp: u64,
) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty."); assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone(); let segment_updater_clone = self.clone();
@@ -342,6 +310,8 @@ impl SegmentUpdater {
); );
let (merging_future_send, merging_future_recv) = oneshot(); let (merging_future_send, merging_future_recv) = oneshot();
let target_opstamp = self.0.stamper.stamp();
// first we need to apply deletes to our segment. // first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new() let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id)) .name(format!("mergingthread-{}", merging_thread_id))
@@ -403,32 +373,11 @@ impl SegmentUpdater {
// Committed segments cannot be merged with uncommitted_segments. // Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently. // We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy(); let merge_policy = self.get_merge_policy();
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let current_opstamp = self.0.stamper.stamp(); let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
let mut merge_candidates = merge_policy merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
.compute_merge_candidates(&uncommitted_segments) for MergeCandidate(segment_metas) in merge_candidates {
.into_iter() match self.start_merge_impl(&segment_metas) {
.map(|merge_candidate| MergeOperation {
target_opstamp: current_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: commit_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for MergeOperation {
target_opstamp,
segment_ids,
} in merge_candidates
{
match self.start_merge_impl(&segment_ids, target_opstamp) {
Ok(merge_future) => { Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() { if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e); error!("The merge task failed quickly after starting: {:?}", e);
@@ -463,7 +412,12 @@ impl SegmentUpdater {
info!("End merge {:?}", after_merge_segment_entry.meta()); info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() { if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp; let committed_opstamp = segment_updater
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
if delete_operation.opstamp < committed_opstamp { if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index; let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone()); let segment = index.segment(after_merge_segment_entry.meta().clone());
@@ -492,8 +446,8 @@ impl SegmentUpdater {
.end_merge(&before_merge_segment_ids, after_merge_segment_entry); .end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options(); segment_updater.consider_merge_options();
info!("save metas"); info!("save metas");
let previous_metas = segment_updater.load_metas(); let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone()); segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
segment_updater.garbage_collect_files_exec(); segment_updater.garbage_collect_files_exec();
}) })
.wait() .wait()

View File

@@ -111,18 +111,19 @@ impl SegmentWriter {
} }
match *field_options.field_type() { match *field_options.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values let facets: Vec<&[u8]> = field_values
.iter() .iter()
.flat_map(|field_value| match *field_value.value() { .flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()), Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => { _ => {
panic!("Expected hierarchical facet"); panic!("Expected hierarchical facet");
} }
}) })
.collect(); .collect();
let mut term = Term::for_field(field); // we set the Term let mut term = Term::for_field(field); // we set the Term
for fake_str in facets { for facet_bytes in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| { FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text); term.set_text(&token.text);
let unordered_term_id = let unordered_term_id =

View File

@@ -1,68 +1,50 @@
use std::sync::Arc;
use std::sync::atomic::Ordering;
// AtomicU64 have not landed in stable. // AtomicU64 have not landed in stable.
// For the moment let's just use AtomicUsize on // For the moment let's just use AtomicUsize on
// x86/64 bit platform, and a mutex on other platform. // x86/64 bit platform, and a mutex on other platform.
#[cfg(target_arch = "x86_64")]
#[cfg(target = "x86_64")]
mod archicture_impl { mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Default)] #[derive(Clone, Default)]
pub struct AtomicU64Ersatz(AtomicUsize); pub struct Stamper(Arc<AtomicU64>);
impl AtomicU64Ersatz { impl Stamper {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz { pub fn new(first_opstamp: u64) -> Stamper {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize)) Stamper(Arc::new(AtomicU64::new(first_opstamp)))
} }
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 { pub fn stamp(&self) -> u64 {
self.0.fetch_add(val as usize, order) as u64 self.0.fetch_add(1u64, Ordering::SeqCst) as u64
} }
} }
} }
#[cfg(not(target_arch = "x86_64"))] #[cfg(not(target = "x86_64"))]
mod archicture_impl { mod archicture_impl {
use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex};
/// Under other architecture, we rely on a mutex.
use std::sync::RwLock;
#[derive(Default)] #[derive(Clone, Default)]
pub struct AtomicU64Ersatz(RwLock<u64>); pub struct Stamper(Arc<Mutex<u64>>);
impl AtomicU64Ersatz { impl Stamper {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz { pub fn new(first_opstamp: u64) -> Stamper {
AtomicU64Ersatz(RwLock::new(first_opstamp)) Stamper(Arc::new(Mutex::new(first_opstamp)))
} }
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 { pub fn stamp(&self) -> u64 {
let mut lock = self.0.write().unwrap(); let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *lock; let previous_val = *guard;
*lock = previous_val + incr; *guard = previous_val + 1;
previous_val previous_val
} }
} }
} }
use self::archicture_impl::AtomicU64Ersatz; pub use self::archicture_impl::Stamper;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {

View File

@@ -126,6 +126,7 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) { fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0; let mut start = 0;
let end = arr.len(); let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]); debug_assert!(target <= arr[end - 1]);
let mut jump = 1; let mut jump = 1;
loop { loop {
@@ -215,10 +216,11 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search // we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs(); let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self let new_cur = self
.cur .cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target)); .wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions { if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur] sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter() .iter()
@@ -620,7 +622,6 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::exponential_search;
use super::search_within_block; use super::search_within_block;
use super::BlockSegmentPostings; use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult; use super::BlockSegmentPostingsSkipResult;
@@ -634,7 +635,6 @@ mod tests {
use schema::Term; use schema::Term;
use schema::INT_INDEXED; use schema::INT_INDEXED;
use DocId; use DocId;
use SkipResult;
#[test] #[test]
fn test_empty_segment_postings() { fn test_empty_segment_postings() {
@@ -662,16 +662,6 @@ mod tests {
.0 .0
} }
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) { fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!( assert_eq!(
search_within_block(block, target), search_within_block(block, target),
@@ -703,7 +693,7 @@ mod tests {
#[test] #[test]
fn test_block_segment_postings() { fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>()); let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32; let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty // checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty()); assert!(block_segments.docs().is_empty());
@@ -717,44 +707,14 @@ mod tests {
} }
} }
#[test] fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED); let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for &doc in docs { for doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64)); index_writer.add_document(doc!(int_field=>1u64));
} }
@@ -774,7 +734,7 @@ mod tests {
#[test] #[test]
fn test_block_segment_postings_skip() { fn test_block_segment_postings_skip() {
for i in 0..4 { for i in 0..4 {
let mut block_postings = build_block_postings(&[3]); let mut block_postings = build_block_postings(vec![3]);
assert_eq!( assert_eq!(
block_postings.skip_to(i), block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32) BlockSegmentPostingsSkipResult::Success(0u32)
@@ -784,7 +744,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
); );
} }
let mut block_postings = build_block_postings(&[3]); let mut block_postings = build_block_postings(vec![3]);
assert_eq!( assert_eq!(
block_postings.skip_to(4u32), block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
@@ -797,7 +757,7 @@ mod tests {
for i in 0..1300 { for i in 0..1300 {
docs.push((i * i / 100) + i); docs.push((i * i / 100) + i);
} }
let mut block_postings = build_block_postings(&docs[..]); let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] { for i in vec![0, 424, 10000] {
assert_eq!( assert_eq!(
block_postings.skip_to(i), block_postings.skip_to(i),

View File

@@ -1,7 +1,9 @@
mod expull; mod expull;
mod memory_arena; mod memory_arena;
mod murmurhash2;
mod term_hashmap; mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList; pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena}; pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap}; pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -0,0 +1,87 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,7 +1,4 @@
extern crate murmurhash32; use super::murmurhash2;
use self::murmurhash32::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena}; use super::{Addr, ArenaStorable, MemoryArena};
use std::iter; use std::iter;
use std::mem; use std::mem;
@@ -209,7 +206,7 @@ impl TermHashMap {
self.resize(); self.resize();
} }
let key_bytes: &[u8] = key.as_ref(); let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref()); let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash); let mut probe = self.probe(hash);
loop { loop {
let bucket = probe.next_probe(); let bucket = probe.next_probe();

View File

@@ -6,7 +6,6 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter}; use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::str; use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/'; const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\'; const ESCAPE_BYTE: u8 = b'\\';
@@ -15,10 +14,6 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets. /// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8; pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represent a point in a given hierarchy. /// A Facet represent a point in a given hierarchy.
/// ///
/// They are typically represented similarly to a filepath. /// They are typically represented similarly to a filepath.
@@ -31,18 +26,18 @@ pub const FACET_SEP_CHAR: char = '\u{0}';
/// its facet. In the example above, `/electronics/tv_and_video/` /// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`. /// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)] #[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(String); pub struct Facet(Vec<u8>);
impl Facet { impl Facet {
/// Returns a new instance of the "root facet" /// Returns a new instance of the "root facet"
/// Equivalent to `/`. /// Equivalent to `/`.
pub fn root() -> Facet { pub fn root() -> Facet {
Facet("".to_string()) Facet(vec![])
} }
/// Returns true iff the facet is the root facet `/`. /// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool { pub fn is_root(&self) -> bool {
self.encoded_str().is_empty() self.encoded_bytes().is_empty()
} }
/// Returns a binary representation of the facet. /// Returns a binary representation of the facet.
@@ -54,19 +49,13 @@ impl Facet {
/// This representation has the benefit of making it possible to /// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over /// express "being a child of a given facet" as a range over
/// the term ordinals. /// the term ordinals.
pub fn encoded_str(&self) -> &str { pub fn encoded_bytes(&self) -> &[u8] {
&self.0 &self.0
} }
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation. /// Creates a `Facet` from its binary representation.
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> { pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8 Facet(encoded_bytes)
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
} }
/// Parse a text representation of a facet. /// Parse a text representation of a facet.
@@ -90,37 +79,36 @@ impl Facet {
Path: IntoIterator, Path: IntoIterator,
Path::Item: ToString, Path::Item: ToString,
{ {
let mut facet_string: String = String::with_capacity(100); let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut step_it = path.into_iter(); let mut step_it = path.into_iter();
if let Some(step) = step_it.next() { if let Some(step) = step_it.next() {
facet_string.push_str(&step.to_string()); facet_bytes.extend_from_slice(step.to_string().as_bytes());
} }
for step in step_it { for step in step_it {
facet_string.push(FACET_SEP_CHAR); facet_bytes.push(FACET_SEP_BYTE);
facet_string.push_str(&step.to_string()); facet_bytes.extend_from_slice(step.to_string().as_bytes());
} }
Facet(facet_string) Facet(facet_bytes)
} }
/// Accessor for the inner buffer of the `Facet`. /// Accessor for the inner buffer of the `Facet`.
pub(crate) fn set_facet_str(&mut self, facet_str: &str) { pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
self.0.clear(); &mut self.0
self.0.push_str(facet_str);
} }
/// Returns `true` iff other is a subfacet of `self`. /// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool { pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_str = self.encoded_str(); let self_bytes: &[u8] = self.encoded_bytes();
let other_str = other.encoded_str(); let other_bytes: &[u8] = other.encoded_bytes();
self_str.len() < other_str.len() self_bytes.len() < other_bytes.len()
&& other_str.starts_with(self_str) && other_bytes.starts_with(self_bytes)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE && other_bytes[self_bytes.len()] == 0u8
} }
} }
impl Borrow<str> for Facet { impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &str { fn borrow(&self) -> &[u8] {
self.encoded_str() self.encoded_bytes()
} }
} }
@@ -132,51 +120,45 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle, Idle,
} }
let path: &str = path_asref.as_ref(); let path: &str = path_asref.as_ref();
assert!(!path.is_empty()); let mut facet_encoded = Vec::new();
assert!(path.starts_with("/"));
let mut facet_encoded = String::new();
let mut state = State::Idle; let mut state = State::Idle;
let path_bytes = path.as_bytes(); let path_bytes = path.as_bytes();
let mut last_offset = 1; for &c in &path_bytes[1..] {
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) { match (state, c) {
(State::Idle, ESCAPE_BYTE) => { (State::Idle, ESCAPE_BYTE) => state = State::Escaped,
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => { (State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]); facet_encoded.push(FACET_SEP_BYTE);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
} }
(State::Escaped, _escaped_char) => { (State::Escaped, any_char) => {
state = State::Idle; state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
} }
(State::Idle, _any_char) => {}
} }
} }
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded) Facet(facet_encoded)
} }
} }
impl BinarySerializable for Facet { impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<String as BinarySerializable>::serialize(&self.0, writer) <Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
} }
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?)) let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
} }
} }
impl Display for Facet { impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result { fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(FACET_SEP_CHAR) { for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
write!(f, "/")?; write!(f, "/")?;
write!(f, "{}", escape_slashes(step))?; let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
} }
Ok(()) Ok(())
} }

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet. /// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term { pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_str().as_bytes(); let bytes = facet.encoded_bytes();
let buffer = Vec::with_capacity(4 + bytes.len()); let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer); let mut term = Term(buffer);
term.set_field(field); term.set_field(field);
@@ -68,7 +68,12 @@ impl Term {
term term
} }
/// Creates a new Term for a given field. /// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
pub(crate) fn for_field(field: Field) -> Term { pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100)); let mut term = Term(Vec::with_capacity(100));
term.set_field(field); term.set_field(field);

View File

@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new(); let mut term_string = String::new();
while term_it.advance() { while term_it.advance() {
//let term = Term::from_bytes(term_it.key()); //let term = Term::from_bytes(term_it.key());
term_string.push_str(str::from_utf8(term_it.key()).expect("test")); term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
} }
assert_eq!(&*term_string, "abcdef"); assert_eq!(&*term_string, "abcdef");
} }

View File

@@ -1,5 +1,6 @@
use super::{Token, TokenStream, Tokenizer}; use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE; use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` process a `Facet` binary representation /// The `FacetTokenizer` process a `Facet` binary representation
/// and emits a token for all of its parent. /// and emits a token for all of its parent.
@@ -56,11 +57,12 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE) .position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos) .map(|pos| cursor + 1 + pos)
{ {
let facet_part = &self.text[cursor..next_sep_pos]; let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
self.token.text.push_str(facet_part); self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos); self.state = State::UpToPosition(next_sep_pos);
} else { } else {
let facet_part = &self.text[cursor..]; let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
self.token.text.push_str(facet_part); self.token.text.push_str(facet_part);
self.state = State::Terminated; self.state = State::Terminated;
} }
@@ -84,6 +86,7 @@ mod tests {
use super::FacetTokenizer; use super::FacetTokenizer;
use schema::Facet; use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer}; use tokenizer::{Token, TokenStream, Tokenizer};
#[test] #[test]
@@ -92,11 +95,11 @@ mod tests {
let mut tokens = vec![]; let mut tokens = vec![];
{ {
let mut add_token = |token: &Token| { let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet)); tokens.push(format!("{}", facet));
}; };
FacetTokenizer FacetTokenizer
.token_stream(facet.encoded_str()) .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.process(&mut add_token); .process(&mut add_token);
} }
assert_eq!(tokens.len(), 4); assert_eq!(tokens.len(), 4);
@@ -112,11 +115,11 @@ mod tests {
let mut tokens = vec![]; let mut tokens = vec![];
{ {
let mut add_token = |token: &Token| { let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet)); tokens.push(format!("{}", facet));
}; };
FacetTokenizer FacetTokenizer
.token_stream(facet.encoded_str()) // ok test .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.process(&mut add_token); .process(&mut add_token);
} }
assert_eq!(tokens.len(), 1); assert_eq!(tokens.len(), 1);