From dd028841e88e245928be5201ec12f7efbc3acd9a Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sat, 3 Feb 2018 00:14:54 +0900
Subject: [PATCH 1/7] Added documentation / test and change the contract of
 .add_facet()

---
 src/collector/facet_collector.rs | 101 ++++++++++++++++++-------------
 src/schema/facet.rs              |  19 ++++++
 2 files changed, 78 insertions(+), 42 deletions(-)

diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs
index 1fe4210f0..26e14d14f 100644
--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -14,10 +14,8 @@ use std::collections::BTreeSet;
 use termdict::TermMerger;
 use postings::SkipResult;
 use std::{u64, usize};
-use schema::FACET_SEP_BYTE;
 use std::iter::Peekable;
-
 use DocId;
 use Result;
 use Score;
@@ -93,7 +91,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
 /// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
 ///
 /// Once collection is finished, you can harvest its results in the form
-/// of a `FacetCounts` object, and extract your face t counts from it.
+/// of a `FacetCounts` object, and extract your facet counts from it.
 ///
 /// This implementation assumes you are working with a number of facets that
 /// is much hundreds of time lower than your number of documents.
@@ -204,7 +202,6 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
 /// Ok(())
 /// }
 /// ```
-
 pub struct FacetCollector {
     facet_ords: Vec<u64>,
     field: Field,
@@ -218,14 +215,14 @@ pub struct FacetCollector {
     // collapse facet_id -> facet_ord
     current_collapse_facet_ords: Vec<u64>,
 
-    collapse: BTreeSet<Vec<u8>>,
+    facets: BTreeSet<Facet>
 }
 
-fn skip<'a, I: Iterator<Item = &'a Vec<u8>>>(target: &[u8], collapse_it: &mut Peekable<I>) -> SkipResult {
+fn skip<'a, I: Iterator<Item = &'a Facet>>(target: &[u8], collapse_it: &mut Peekable<I>) -> SkipResult {
     loop {
         match collapse_it.peek() {
             Some(facet_bytes) => {
-                match facet_bytes[..].cmp(&target) {
+                match facet_bytes.encoded_bytes().cmp(&target) {
                     Ordering::Less => {}
                     Ordering::Greater => {
                         return SkipResult::OverStep;
@@ -251,51 +248,49 @@ impl FacetCollector {
     /// This function does not check whether the field
     /// is of the proper type.
     pub fn for_field(field: Field) -> FacetCollector {
-        let mut facet_collector = FacetCollector {
+        FacetCollector {
             facet_ords: Vec::with_capacity(255),
-            field: field,
-            ff_reader: None,
             segment_counters: Vec::new(),
-            collapse: BTreeSet::new(),
+            field,
+            ff_reader: None,
+            facets: BTreeSet::new(),
             current_segment_collapse_mapping: Vec::new(),
             current_collapse_facet_ords: Vec::new(),
             current_segment_counts: Vec::new(),
-        };
-        facet_collector.add_facet(Facet::from("/"));
-        facet_collector
+        }
     }
 
+    /// Adds a facet that we want to record counts
+    ///
+    /// Adding facet `Facet::from("/country")` for instance,
+    /// will record the counts of all of the direct children of the facet country
+    /// (e.g. `/country/FR`, `/country/UK`).
+    ///
+    /// Adding two facets within which one is the prefix of the other is forbidden.
+    /// If you need the correct number of unique documents for two such facets,
+    /// just add them in separate `FacetCollector`.
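For context, the contract documented above (the method signature continues below) can be exercised as in this minimal sketch. It is not part of the diff: the `tantivy::collector::FacetCollector` / `tantivy::schema` paths and the `/category` field name are assumptions for illustration.

```rust
use tantivy::collector::FacetCollector;
use tantivy::schema::{Facet, Field};

// Hypothetical helper showing the new `add_facet` contract.
fn category_collector(facet_field: Field) -> FacetCollector {
    let mut collector = FacetCollector::for_field(facet_field);
    // Counts will be reported for the direct children of `/category`,
    // e.g. `/category/fiction` or `/category/biography`.
    collector.add_facet(Facet::from("/category"));
    // Adding `/category/fiction` to this same collector would now panic,
    // because it is a descendant of an already added facet; drill down
    // with a separate `FacetCollector` instead.
    collector
}
```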
pub fn add_facet(&mut self, facet_from: T) where Facet: From { let facet = Facet::from(facet_from); - let facet_bytes: &[u8] = facet.encoded_bytes(); - self.collapse.remove(&facet_bytes[..0]); - for pos in facet_bytes.iter() - .cloned() - .position(|b| b == FACET_SEP_BYTE) { - self.collapse.remove(&facet_bytes[..pos]); - } - self.collapse.insert(facet_bytes.to_owned()); - } - - fn finalize_segment(&mut self) { - if self.ff_reader.is_some() { - self.segment_counters.push( - SegmentFacetCounter { - facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, - facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), - facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), - } + for old_facet in &self.facets { + assert!( + !old_facet.is_prefix_of(&facet), + "Tried to add a facet which is a descendant of an already added facet."); + assert!( + !facet.is_prefix_of(&old_facet), + "Tried to add a facet which is an ancestor of an already added facet." ); } + self.facets.insert(facet); } fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) { self.current_segment_collapse_mapping.clear(); self.current_collapse_facet_ords.clear(); self.current_segment_counts.clear(); - let mut collapse_facet_it = self.collapse.iter().peekable(); + let mut collapse_facet_it = self.facets.iter().peekable(); self.current_collapse_facet_ords.push(0); let mut facet_streamer = facet_reader .facet_dict() @@ -338,6 +333,18 @@ impl FacetCollector { } } + fn finalize_segment(&mut self) { + if self.ff_reader.is_some() { + self.segment_counters.push( + SegmentFacetCounter { + facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, + facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), + facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), + } + ); + } + } + /// Returns the results of the collection. 
/// /// This method does not just return the counters, @@ -392,7 +399,7 @@ impl FacetCollector { } } FacetCounts { - facet_counts: facet_counts + facet_counts } } } @@ -470,10 +477,7 @@ impl FacetCounts { let mut it = self.get(facet); for (ref facet, count) in (&mut it).take(k) { - heap.push(Hit { - count: count, - facet: facet - }); + heap.push(Hit { count, facet }); } let mut lowest_count: u64 = heap.peek() @@ -483,10 +487,7 @@ impl FacetCounts { if count > lowest_count { lowest_count = count; if let Some(mut head) = heap.peek_mut() { - *head = Hit { - count: count, - facet: facet - }; + *head = Hit { count, facet }; } } } @@ -508,6 +509,7 @@ mod tests { use query::AllQuery; use super::{FacetCollector, FacetCounts}; use std::iter; + use schema::Field; use rand::{thread_rng, Rng}; #[test] @@ -561,6 +563,21 @@ mod tests { } } + #[test] + #[should_panic(expected="Tried to add a facet which is a descendant of an already added facet.")] + fn test_misused_facet_collector() { + let mut facet_collector = FacetCollector::for_field(Field(0)); + facet_collector.add_facet(Facet::from("/country")); + facet_collector.add_facet(Facet::from("/country/europe")); + } + + #[test] + fn test_non_used_facet_collector() { + let mut facet_collector = FacetCollector::for_field(Field(0)); + facet_collector.add_facet(Facet::from("/country")); + facet_collector.add_facet(Facet::from("/countryeurope")); + } + #[test] fn test_facet_collector_topk() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/schema/facet.rs b/src/schema/facet.rs index d8f61eda9..9c67fb8bb 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display, Debug, Formatter}; use std::str; use std::io::{self, Read, Write}; use regex::Regex; +use std::borrow::Borrow; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::Cow; use common::BinarySerializable; @@ -93,8 +94,26 @@ impl Facet { pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec { &mut self.0 } + + + /// Returns `true` iff other is a subfacet of `self`. 
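The helper documented above (implemented just below) relies on facets being encoded with a 0-byte separator between path segments. A sketch of the intended semantics, assuming `is_prefix_of` is reachable through the public `Facet` API added in this patch:

```rust
use tantivy::schema::Facet;

fn prefix_semantics() {
    let country = Facet::from("/country");
    // `/country` is an ancestor of `/country/europe` ...
    assert!(country.is_prefix_of(&Facet::from("/country/europe")));
    // ... but a purely lexical prefix that does not stop on a segment
    // boundary is rejected: the byte following "country" is not 0u8.
    assert!(!country.is_prefix_of(&Facet::from("/countryeurope")));
    // A facet is not considered a prefix of itself (strict length check).
    assert!(!country.is_prefix_of(&Facet::from("/country")));
}
```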
+ pub fn is_prefix_of(&self, other: &Facet) -> bool { + let self_bytes: &[u8] = self.encoded_bytes(); + let other_bytes: &[u8] = other.encoded_bytes(); + if self_bytes.len() < other_bytes.len() { + if other_bytes.starts_with(self_bytes) { + return other_bytes[self_bytes.len()] == 0u8; + } + } + false + } } +impl Borrow<[u8]> for Facet { + fn borrow(&self) -> &[u8] { + self.encoded_bytes() + } +} impl<'a, T: ?Sized + AsRef> From<&'a T> for Facet { From df53dc4cebb02baa372f8c7490ad7449f7589ce8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 3 Feb 2018 00:21:05 +0900 Subject: [PATCH 2/7] Format --- src/collector/count_collector.rs | 1 - src/collector/facet_collector.rs | 206 ++++++++---------- src/common/bitpacker.rs | 6 +- src/common/composite_file.rs | 26 +-- src/compression/pack/compression_pack_simd.rs | 4 +- src/core/segment_reader.rs | 22 +- src/datastruct/stacker/hashmap.rs | 40 +--- src/datastruct/stacker/mod.rs | 1 - src/directory/mmap_directory.rs | 79 ++++--- src/directory/read_only_source.rs | 2 +- src/fastfield/facet_reader.rs | 13 +- src/fastfield/mod.rs | 36 ++- src/fastfield/multivalued/mod.rs | 2 +- src/fastfield/multivalued/reader.rs | 21 +- src/fastfield/multivalued/writer.rs | 13 +- src/fastfield/reader.rs | 37 ++-- src/fastfield/serializer.rs | 8 +- src/fastfield/writer.rs | 57 +++-- src/indexer/segment_writer.rs | 67 +++--- src/lib.rs | 4 +- src/postings/mod.rs | 37 +++- src/postings/postings_writer.rs | 86 ++++---- src/query/all_query.rs | 9 +- src/query/mod.rs | 2 +- src/query/phrase_query/mod.rs | 9 +- src/query/phrase_query/phrase_scorer.rs | 9 +- src/query/phrase_query/phrase_weight.rs | 3 +- src/schema/document.rs | 15 +- src/schema/facet.rs | 38 ++-- src/schema/field_entry.rs | 2 +- src/schema/field_type.rs | 8 +- src/schema/int_options.rs | 8 +- src/schema/schema.rs | 20 +- src/schema/text_options.rs | 2 +- src/schema/value.rs | 6 +- src/store/reader.rs | 4 +- src/store/writer.rs | 13 +- src/termdict/fstdict/streamer.rs | 6 +- src/termdict/fstdict/termdict.rs | 18 +- src/termdict/mod.rs | 70 +++--- src/termdict/streamdict/delta_encoder.rs | 24 +- src/tokenizer/alphanum_only.rs | 24 +- src/tokenizer/facet_tokenizer.rs | 39 ++-- src/tokenizer/tokenizer.rs | 1 - 44 files changed, 520 insertions(+), 578 deletions(-) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 67f563f1a..6707e687e 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -20,7 +20,6 @@ impl CountCollector { } } - impl Collector for CountCollector { fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { Ok(()) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 26e14d14f..41dd0bcf5 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -13,7 +13,7 @@ use termdict::TermStreamerBuilder; use std::collections::BTreeSet; use termdict::TermMerger; use postings::SkipResult; -use std::{u64, usize}; +use std::{usize, u64}; use std::iter::Peekable; use DocId; @@ -48,7 +48,6 @@ impl<'a> Ord for Hit<'a> { } } - struct SegmentFacetCounter { pub facet_reader: FacetReader, pub facet_ords: Vec, @@ -59,16 +58,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { if facet_bytes.is_empty() { 0 } else { - facet_bytes - .iter() - .cloned() - .filter(|b| *b == 0u8) - .count() + 1 + facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1 } } - - /// Collector for faceting /// /// The collector collects all facets. 
You need to configure it @@ -215,23 +208,24 @@ pub struct FacetCollector { // collapse facet_id -> facet_ord current_collapse_facet_ords: Vec, - facets: BTreeSet + facets: BTreeSet, } -fn skip<'a, I: Iterator>(target: &[u8], collapse_it: &mut Peekable) -> SkipResult { +fn skip<'a, I: Iterator>( + target: &[u8], + collapse_it: &mut Peekable, +) -> SkipResult { loop { match collapse_it.peek() { - Some(facet_bytes) => { - match facet_bytes.encoded_bytes().cmp(&target) { - Ordering::Less => {} - Ordering::Greater => { - return SkipResult::OverStep; - } - Ordering::Equal => { - return SkipResult::Reached; - } + Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(&target) { + Ordering::Less => {} + Ordering::Greater => { + return SkipResult::OverStep; } - } + Ordering::Equal => { + return SkipResult::Reached; + } + }, None => { return SkipResult::End; } @@ -241,7 +235,6 @@ fn skip<'a, I: Iterator>(target: &[u8], collapse_it: &mut Peekab } impl FacetCollector { - /// Create a facet collector to collect the facets /// from a specific facet `Field`. /// @@ -261,7 +254,6 @@ impl FacetCollector { } } - /// Adds a facet that we want to record counts /// /// Adding facet `Facet::from("/country")` for instance, @@ -272,12 +264,15 @@ impl FacetCollector { /// If you need the correct number of unique documents for two such facets, /// just add them in separate `FacetCollector`. pub fn add_facet(&mut self, facet_from: T) - where Facet: From { + where + Facet: From, + { let facet = Facet::from(facet_from); for old_facet in &self.facets { assert!( !old_facet.is_prefix_of(&facet), - "Tried to add a facet which is a descendant of an already added facet."); + "Tried to add a facet which is a descendant of an already added facet." + ); assert!( !facet.is_prefix_of(&old_facet), "Tried to add a facet which is an ancestor of an already added facet." 
@@ -292,10 +287,7 @@ impl FacetCollector { self.current_segment_counts.clear(); let mut collapse_facet_it = self.facets.iter().peekable(); self.current_collapse_facet_ords.push(0); - let mut facet_streamer = facet_reader - .facet_dict() - .range() - .into_stream(); + let mut facet_streamer = facet_reader.facet_dict().range().into_stream(); if !facet_streamer.advance() { return; } @@ -315,7 +307,8 @@ impl FacetCollector { continue 'outer; } else if depth == collapse_depth + 1 { collapsed_id = self.current_collapse_facet_ords.len(); - self.current_collapse_facet_ords.push(facet_streamer.term_ord()); + self.current_collapse_facet_ords + .push(facet_streamer.term_ord()); self.current_segment_collapse_mapping.push(collapsed_id); } else { self.current_segment_collapse_mapping.push(collapsed_id); @@ -335,13 +328,11 @@ impl FacetCollector { fn finalize_segment(&mut self) { if self.ff_reader.is_some() { - self.segment_counters.push( - SegmentFacetCounter { - facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, - facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), - facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), - } - ); + self.segment_counters.push(SegmentFacetCounter { + facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, + facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), + facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), + }); } } @@ -361,14 +352,9 @@ impl FacetCollector { .map(|segment_counter| &segment_counter.facet_counts[..]) .collect(); - let facet_streams = self.segment_counters .iter() - .map(|seg_counts| seg_counts - .facet_reader - .facet_dict() - .range() - .into_stream()) + .map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream()) .collect::>(); let mut facet_merger = TermMerger::new(facet_streams); @@ -398,51 +384,43 @@ impl FacetCollector { facet_counts.insert(Facet::from_encoded(bytes), count); } } - FacetCounts { - facet_counts - } + FacetCounts { facet_counts } } } - - - impl Collector for FacetCollector { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.finalize_segment(); let facet_reader = reader.facet_reader(self.field)?; self.set_collapse_mapping(&facet_reader); - self.current_segment_counts.resize(self.current_collapse_facet_ords.len(), 0); + self.current_segment_counts + .resize(self.current_collapse_facet_ords.len(), 0); self.ff_reader = Some(UnsafeCell::new(facet_reader)); Ok(()) } fn collect(&mut self, doc: DocId, _: Score) { - let facet_reader: &mut FacetReader = - unsafe { - &mut *self.ff_reader - .as_ref() - .expect("collect() was called before set_segment. This should never happen.") - .get() - }; + let facet_reader: &mut FacetReader = unsafe { + &mut *self.ff_reader + .as_ref() + .expect("collect() was called before set_segment. 
This should never happen.") + .get() + }; facet_reader.facet_ords(doc, &mut self.facet_ords); let mut previous_collapsed_ord: usize = usize::MAX; for &facet_ord in &self.facet_ords { let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize]; - self.current_segment_counts[collapsed_ord] += - if collapsed_ord == previous_collapsed_ord { - 0 - } else { - 1 - }; + self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord + { + 0 + } else { + 1 + }; previous_collapsed_ord = collapsed_ord; } } } - - - /// Intermediary result of the `FacetCollector` that stores /// the facet counts for all the segments. pub struct FacetCounts { @@ -450,20 +428,20 @@ pub struct FacetCounts { } impl FacetCounts { - - pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator - where Facet: From { + pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator + where + Facet: From, + { let facet = Facet::from(facet_from); let left_bound = Bound::Excluded(facet.clone()); - let right_bound = - if facet.is_root() { - Bound::Unbounded - } else { - let mut facet_after_bytes = facet.encoded_bytes().to_owned(); - facet_after_bytes.push(1u8); - let facet_after = Facet::from_encoded(facet_after_bytes); - Bound::Excluded(facet_after) - }; + let right_bound = if facet.is_root() { + Bound::Unbounded + } else { + let mut facet_after_bytes = facet.encoded_bytes().to_owned(); + facet_after_bytes.push(1u8); + let facet_after = Facet::from_encoded(facet_after_bytes); + Bound::Excluded(facet_after) + }; self.facet_counts .range((left_bound, right_bound)) @@ -471,8 +449,9 @@ impl FacetCounts { } pub fn top_k(&self, facet: T, k: usize) -> Vec<(&Facet, u64)> - where Facet: From { - + where + Facet: From, + { let mut heap = BinaryHeap::with_capacity(k); let mut it = self.get(facet); @@ -480,9 +459,7 @@ impl FacetCounts { heap.push(Hit { count, facet }); } - let mut lowest_count: u64 = heap.peek() - .map(|hit| hit.count) - .unwrap_or(u64::MIN); + let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); for (facet, count) in it { if count > lowest_count { lowest_count = count; @@ -496,16 +473,13 @@ impl FacetCounts { .map(|hit| (hit.facet, hit.count)) .collect::>() } - } - - #[cfg(test)] mod tests { use test::Bencher; use core::Index; - use schema::{SchemaBuilder, Document, Facet}; + use schema::{Document, Facet, SchemaBuilder}; use query::AllQuery; use super::{FacetCollector, FacetCounts}; use std::iter; @@ -550,21 +524,23 @@ mod tests { .get("/top1") .map(|(facet, count)| (facet.to_string(), count)) .collect(); - assert_eq!(facets, [ - ("/top1/mid0", 50), - ("/top1/mid1", 50), - ("/top1/mid2", 50), - ("/top1/mid3", 50), - ].iter() - .map(|&(facet_str, count)| { - (String::from(facet_str), count) - }) - .collect::>()); + assert_eq!( + facets, + [ + ("/top1/mid0", 50), + ("/top1/mid1", 50), + ("/top1/mid2", 50), + ("/top1/mid3", 50), + ].iter() + .map(|&(facet_str, count)| (String::from(facet_str), count)) + .collect::>() + ); } } #[test] - #[should_panic(expected="Tried to add a facet which is a descendant of an already added facet.")] + #[should_panic(expected = "Tried to add a facet which is a descendant of \ + an already added facet.")] fn test_misused_facet_collector() { let mut facet_collector = FacetCollector::for_field(Field(0)); facet_collector.add_facet(Facet::from("/country")); @@ -585,18 +561,14 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut docs: Vec = vec![ - ("a", 10), - ("b", 100), - ("c", 7), 
- ("d", 12), - ("e", 21) - ].into_iter() - .flat_map(|(c, count)| { - let facet = Facet::from(&format!("/facet_{}", c)); - let doc = doc!(facet_field => facet); - iter::repeat(doc).take(count) - }).collect(); + let mut docs: Vec = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)] + .into_iter() + .flat_map(|(c, count)| { + let facet = Facet::from(&format!("/facet_{}", c)); + let doc = doc!(facet_field => facet); + iter::repeat(doc).take(count) + }) + .collect(); thread_rng().shuffle(&mut docs[..]); let mut index_writer = index.writer(3_000_000).unwrap(); @@ -620,8 +592,9 @@ mod tests { vec![ (&Facet::from("/facet_b"), 100), (&Facet::from("/facet_e"), 21), - (&Facet::from("/facet_d"), 12) - ]); + (&Facet::from("/facet_d"), 12), + ] + ); } } @@ -632,12 +605,12 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut docs = vec!(); + let mut docs = vec![]; for val in 0..50 { let facet = Facet::from(&format!("/facet_{}", val)); - for _ in 0..val*val { - docs.push(doc!(facet_field=>facet.clone())); - } + for _ in 0..val * val { + docs.push(doc!(facet_field=>facet.clone())); + } } // 40425 docs thread_rng().shuffle(&mut docs[..]); @@ -656,4 +629,3 @@ mod tests { }); } } - diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index cb3b81e72..b78a32746 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -134,7 +134,8 @@ where addr + 8 <= data.len(), "The fast field field should have been padded with 7 bytes." ); - let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = + unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & mask) } else { @@ -165,7 +166,8 @@ where for output_val in output.iter_mut() { let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; - let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = + unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; *output_val = val_shifted & mask; addr_in_bits += num_bits; diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 341dc051c..cceafce49 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -8,7 +8,6 @@ use std::io::{self, Read}; use directory::ReadOnlySource; use common::BinarySerializable; - #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] pub struct FileAddr { field: Field, @@ -19,7 +18,7 @@ impl FileAddr { fn new(field: Field, idx: usize) -> FileAddr { FileAddr { field: field, - idx: idx + idx: idx, } } } @@ -36,7 +35,7 @@ impl BinarySerializable for FileAddr { let idx = VInt::deserialize(reader)?.0 as usize; Ok(FileAddr { field: field, - idx: idx + idx: idx, }) } } @@ -59,7 +58,7 @@ impl CompositeWrite { /// Start writing a new field. pub fn for_field(&mut self, field: Field) -> &mut CountingWriter { - self.for_field_with_idx(field, 0) + self.for_field_with_idx(field, 0) } /// Start writing a new field. @@ -71,7 +70,6 @@ impl CompositeWrite { &mut self.write } - /// Close the composite file. 
/// /// An index of the different field offsets @@ -89,9 +87,7 @@ impl CompositeWrite { let mut prev_offset = 0; for (offset, file_addr) in offset_fields { - VInt((offset - prev_offset) as u64).serialize( - &mut self.write, - )?; + VInt((offset - prev_offset) as u64).serialize(&mut self.write)?; file_addr.serialize(&mut self.write)?; prev_offset = offset; } @@ -103,7 +99,6 @@ impl CompositeWrite { } } - /// A composite file is an abstraction to store a /// file partitioned by field. /// @@ -174,20 +169,20 @@ impl CompositeFile { /// to a given `Field` and stored in a `CompositeFile`. pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option { self.offsets_index - .get(&FileAddr {field: field, idx: idx}) - .map(|&(from, to)| { - self.data.slice(from, to) + .get(&FileAddr { + field: field, + idx: idx, }) + .map(|&(from, to)| self.data.slice(from, to)) } } - #[cfg(test)] mod test { use std::io::Write; - use super::{CompositeWrite, CompositeFile}; - use directory::{RAMDirectory, Directory}; + use super::{CompositeFile, CompositeWrite}; + use directory::{Directory, RAMDirectory}; use schema::Field; use common::VInt; use common::BinarySerializable; @@ -231,7 +226,6 @@ mod test { assert_eq!(payload_4, 2u64); } } - } } diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 2a900e9ed..2db372630 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -25,7 +25,9 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize { } fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { - unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) } + unsafe { + simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) + } } fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 939873d08..a62b0befc 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -108,17 +108,21 @@ impl SegmentReader { pub fn facet_reader(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); if field_entry.field_type() != &FieldType::HierarchicalFacet { - return Err(ErrorKind::InvalidArgument(format!("The field {:?} is not a \ - hierarchical facet.", field_entry)).into()) + return Err(ErrorKind::InvalidArgument(format!( + "The field {:?} is not a \ + hierarchical facet.", + field_entry + )).into()); } let term_ords_reader = self.multi_value_reader(field)?; - let termdict_source = self.termdict_composite - .open_read(field) - .ok_or_else(|| { - ErrorKind::InvalidArgument(format!("The field \"{}\" is a hierarchical \ - but this segment does not seem to have the field term \ - dictionary.", field_entry.name())) - })?; + let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { + ErrorKind::InvalidArgument(format!( + "The field \"{}\" is a hierarchical \ + but this segment does not seem to have the field term \ + dictionary.", + field_entry.name() + )) + })?; let termdict = TermDictionaryImpl::from_source(termdict_source); let facet_reader = FacetReader::new(term_ords_reader, termdict); Ok(facet_reader) diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index ea57258df..18025b657 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -1,7 +1,7 @@ use std::iter; use std::mem; use 
postings::UnorderedTermId; -use super::heap::{Heap, HeapAllocable, BytesRef}; +use super::heap::{BytesRef, Heap, HeapAllocable}; mod murmurhash2 { @@ -53,9 +53,6 @@ mod murmurhash2 { } } - - - /// Split the thread memory budget into /// - the heap size /// - the hash table "table" itself. @@ -63,14 +60,10 @@ mod murmurhash2 { /// Returns (the heap size in bytes, the hash table size in number of bits) pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) { let table_size_limit: usize = per_thread_memory_budget / 3; - let compute_table_size = |num_bits: usize| { - (1 << num_bits) * mem::size_of::() - }; + let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::(); let table_num_bits: usize = (1..) .into_iter() - .take_while(|num_bits: &usize| { - compute_table_size(*num_bits) < table_size_limit - }) + .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) .last() .expect(&format!( "Per thread memory is too small: {}", @@ -81,7 +74,6 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) { (heap_size, table_num_bits) } - /// `KeyValue` is the item stored in the hash table. /// The key is actually a `BytesRef` object stored in an external heap. /// The `value_addr` also points to an address in the heap. @@ -101,7 +93,6 @@ impl KeyValue { } } - /// Customized `HashMap` with string keys /// /// This `HashMap` takes String as keys. Keys are @@ -118,7 +109,6 @@ pub struct TermHashMap<'a> { occupied: Vec, } - struct QuadraticProbing { hash: usize, i: usize, @@ -141,7 +131,6 @@ impl QuadraticProbing { } } - impl<'a> TermHashMap<'a> { pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> { let table_size = 1 << num_bucket_power_of_2; @@ -178,18 +167,17 @@ impl<'a> TermHashMap<'a> { } pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { - self.occupied - .iter() - .cloned() - .map(move |bucket: usize| { - let kv = self.table[bucket]; - let (key, offset) = self.get_key_value(kv.key_value_addr); - (key, offset, bucket as UnorderedTermId) - }) + self.occupied.iter().cloned().map(move |bucket: usize| { + let kv = self.table[bucket]; + let (key, offset) = self.get_key_value(kv.key_value_addr); + (key, offset, bucket as UnorderedTermId) + }) } - - pub fn get_or_create, V: HeapAllocable>(&mut self, key: S) -> (UnorderedTermId, &mut V) { + pub fn get_or_create, V: HeapAllocable>( + &mut self, + key: S, + ) -> (UnorderedTermId, &mut V) { let key_bytes: &[u8] = key.as_ref(); let hash = murmurhash2::murmurhash2(key.as_ref()); let mut probe = self.probe(hash); @@ -212,7 +200,6 @@ impl<'a> TermHashMap<'a> { } } - #[cfg(test)] mod tests { @@ -223,7 +210,6 @@ mod tests { use std::collections::HashSet; use super::split_memory; - struct TestValue { val: u32, _addr: u32, @@ -245,7 +231,6 @@ mod tests { assert_eq!(split_memory(10_000_000), (7902848, 18)); } - #[test] fn test_hash_map() { let heap = Heap::with_capacity(2_000_000); @@ -319,5 +304,4 @@ mod tests { }); } - } diff --git a/src/datastruct/stacker/mod.rs b/src/datastruct/stacker/mod.rs index 1d6cac450..811bfeee7 100644 --- a/src/datastruct/stacker/mod.rs +++ b/src/datastruct/stacker/mod.rs @@ -39,6 +39,5 @@ fn test_unrolled_linked_list() { assert!(!it.next().is_some()); } } - } } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 5dc1166af..bb7a99bf1 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -20,19 +20,17 @@ use std::sync::Arc; use std::sync::RwLock; use 
tempdir::TempDir; - /// Returns None iff the file exists, can be read, but is empty (and hence /// cannot be mmapped). /// fn open_mmap(full_path: &PathBuf) -> result::Result, OpenReadError> { - let file = File::open(&full_path) - .map_err(|e| { - if e.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.clone()) - } else { - OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) - } - })?; + let file = File::open(&full_path).map_err(|e| { + if e.kind() == io::ErrorKind::NotFound { + OpenReadError::FileDoesNotExist(full_path.clone()) + } else { + OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) + } + })?; let meta_data = file.metadata() .map_err(|e| IOError::with_path(full_path.to_owned(), e))?; @@ -44,9 +42,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result, OpenRe } MmapReadOnly::open(&file) .map(Some) - .map_err(|e| { - From::from(IOError::with_path(full_path.to_owned(), e)) - }) + .map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e))) } #[derive(Default, Clone, Debug, Serialize, Deserialize)] @@ -79,7 +75,6 @@ impl Default for MmapCache { } impl MmapCache { - /// Removes a `MmapReadOnly` entry from the mmap cache. fn discard_from_cache(&mut self, full_path: &Path) -> bool { self.cache.remove(full_path).is_some() @@ -95,21 +90,21 @@ impl MmapCache { fn get_mmap(&mut self, full_path: PathBuf) -> Result, OpenReadError> { Ok(match self.cache.entry(full_path.clone()) { - HashMapEntry::Occupied(occupied_entry) => { - let mmap = occupied_entry.get(); - self.counters.hit += 1; - Some(mmap.clone()) - } - HashMapEntry::Vacant(vacant_entry) => { - self.counters.miss += 1; - if let Some(mmap) = open_mmap(&full_path)? { - vacant_entry.insert(mmap.clone()); - Some(mmap) - } else { - None - } - } - }) + HashMapEntry::Occupied(occupied_entry) => { + let mmap = occupied_entry.get(); + self.counters.hit += 1; + Some(mmap.clone()) + } + HashMapEntry::Vacant(vacant_entry) => { + self.counters.miss += 1; + if let Some(mmap) = open_mmap(&full_path)? { + vacant_entry.insert(mmap.clone()); + Some(mmap) + } else { + None + } + } + }) } } @@ -257,9 +252,9 @@ impl Directory for MmapDirectory { })?; Ok(mmap_cache - .get_mmap(full_path)? - .map(ReadOnlySource::Mmap) - .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) + .get_mmap(full_path)? + .map(ReadOnlySource::Mmap) + .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) } fn open_write(&mut self, path: &Path) -> Result { @@ -292,20 +287,19 @@ impl Directory for MmapDirectory { Ok(BufWriter::new(Box::new(writer))) } - /// Any entry associated to the path in the mmap will be /// removed before the file is deleted. fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { debug!("Deleting file {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = self.mmap_cache - .write() - .map_err(|_| { - let msg = format!("Failed to acquired write lock \ - on mmap cache while deleting {:?}", - path); - IOError::with_path(path.to_owned(), make_io_err(msg)) - })?; + let mut mmap_cache = self.mmap_cache.write().map_err(|_| { + let msg = format!( + "Failed to acquired write lock \ + on mmap cache while deleting {:?}", + path + ); + IOError::with_path(path.to_owned(), make_io_err(msg)) + })?; mmap_cache.discard_from_cache(path); // Removing the entry in the MMap cache. 
@@ -415,7 +409,10 @@ mod tests { } for (i, path) in paths.iter().enumerate() { mmap_directory.delete(path).unwrap(); - assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths - i - 1); + assert_eq!( + mmap_directory.get_cache_info().mmapped.len(), + num_paths - i - 1 + ); } } assert_eq!(mmap_directory.get_cache_info().counters.hit, 10); diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 10d9a85d9..161fec11a 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -4,7 +4,7 @@ use super::shared_vec_slice::SharedVecSlice; use common::HasLen; use std::slice; use std::io::{self, Read}; -use stable_deref_trait::{StableDeref, CloneStableDeref}; +use stable_deref_trait::{CloneStableDeref, StableDeref}; /// Read object that represents files in tantivy. /// diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index d7a9d8ce9..ea5a9e25a 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -4,7 +4,6 @@ use termdict::TermOrdinal; use schema::Facet; use termdict::{TermDictionary, TermDictionaryImpl}; - /// The facet reader makes it possible to access the list of /// facets associated to a given document in a specific /// segment. @@ -24,7 +23,6 @@ pub struct FacetReader { } impl FacetReader { - /// Creates a new `FacetReader`. /// /// A facet reader just wraps : @@ -32,8 +30,10 @@ impl FacetReader { /// access the list of facet ords for a given document. /// - a `TermDictionaryImpl` that helps associating a facet to /// an ordinal and vice versa. - pub fn new(term_ords: MultiValueIntFastFieldReader, - term_dict: TermDictionaryImpl) -> FacetReader { + pub fn new( + term_ords: MultiValueIntFastFieldReader, + term_dict: TermDictionaryImpl, + ) -> FacetReader { FacetReader { term_ords: term_ords, term_dict: term_dict, @@ -56,7 +56,8 @@ impl FacetReader { /// Given a term ordinal returns the term associated to it. 
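To make the relationship between the two halves of `FacetReader` concrete (the ordinal fast field on one side, the term dictionary on the other), here is a hedged sketch of reading one document's facets back. It assumes `Searcher` and `Result` are re-exported at the crate root; the segment and doc ids are arbitrary.

```rust
use tantivy::schema::{Facet, Field};
use tantivy::{Result, Searcher};

fn print_facets_of_first_doc(searcher: &Searcher, facet_field: Field) -> Result<()> {
    let segment_reader = searcher.segment_reader(0);
    let mut facet_reader = segment_reader.facet_reader(facet_field)?;

    // Fast field side: the facet ordinals stored for document 0.
    let mut ords: Vec<u64> = Vec::new();
    facet_reader.facet_ords(0u32, &mut ords);

    // Term dictionary side: turn each ordinal back into a `Facet`.
    let mut facet = Facet::root();
    for &ord in &ords {
        facet_reader.facet_from_ord(ord, &mut facet);
        println!("doc 0 carries facet {}", facet);
    }
    Ok(())
}
```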
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) { - let found_term = self.term_dict.ord_to_term(facet_ord as u64, output.inner_buffer_mut()); + let found_term = self.term_dict + .ord_to_term(facet_ord as u64, output.inner_buffer_mut()); assert!(found_term, "Term ordinal {} no found.", facet_ord); } @@ -64,4 +65,4 @@ impl FacetReader { pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec) { self.term_ords.get_vals(doc, output); } -} \ No newline at end of file +} diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 207012a3f..0afc50c7e 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -95,7 +95,9 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64); - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -129,7 +131,9 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64); - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -164,7 +168,9 @@ mod tests { for _ in 0..10_000 { add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -199,7 +205,9 @@ mod tests { 5_000_000_000_000_000_000u64 + i, ); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -238,7 +246,9 @@ mod tests { doc.add_i64(i64_field, i); fast_field_writers.add_document(&doc); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -277,7 +287,9 @@ mod tests { let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); let doc = Document::default(); fast_field_writers.add_document(&doc); - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } @@ -311,7 +323,9 @@ mod tests { for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -366,7 +380,9 @@ mod tests { for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); 
serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); @@ -398,7 +414,9 @@ mod tests { for x in &permutation { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } let source = directory.open_read(&path).unwrap(); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 726682826..0043e7783 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -2,4 +2,4 @@ mod writer; mod reader; pub use self::writer::MultiValueIntFastFieldWriter; -pub use self::reader::MultiValueIntFastFieldReader; \ No newline at end of file +pub use self::reader::MultiValueIntFastFieldReader; diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index bbf5102ac..147cee89a 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -18,8 +18,10 @@ pub struct MultiValueIntFastFieldReader { } impl MultiValueIntFastFieldReader { - - pub(crate) fn open(idx_reader: U64FastFieldReader, vals_reader: U64FastFieldReader) -> MultiValueIntFastFieldReader { + pub(crate) fn open( + idx_reader: U64FastFieldReader, + vals_reader: U64FastFieldReader, + ) -> MultiValueIntFastFieldReader { MultiValueIntFastFieldReader { idx_reader: idx_reader, vals_reader: vals_reader, @@ -38,12 +40,11 @@ impl MultiValueIntFastFieldReader { } } - #[cfg(test)] mod tests { use core::Index; - use schema::{Facet, Document, SchemaBuilder}; + use schema::{Document, Facet, SchemaBuilder}; #[test] fn test_multifastfield_reader() { @@ -51,7 +52,9 @@ mod tests { let facet_field = schema_builder.add_facet_field("facets"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_with_num_threads(1, 30_000_000).expect("Failed to create index writer."); + let mut index_writer = index + .writer_with_num_threads(1, 30_000_000) + .expect("Failed to create index writer."); { let mut doc = Document::new(); doc.add_facet(facet_field, "/category/cat2"); @@ -72,9 +75,7 @@ mod tests { index.load_searchers().expect("Reloading searchers"); let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0); - let mut facet_reader = segment_reader - .facet_reader(facet_field) - .unwrap(); + let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap(); let mut facet = Facet::root(); { @@ -108,7 +109,5 @@ mod tests { facet_reader.facet_ords(2, &mut vals); assert_eq!(&vals[..], &[4]); } - - } -} \ No newline at end of file +} diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 499b8bad4..d988656ce 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -38,10 +38,15 @@ impl MultiValueIntFastFieldWriter { } /// Push the fast fields value to the `FastFieldWriter`. 
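The serializer below writes two parallel columns for a multivalued field: an offset index (idx 0) and the remapped values (idx 1). The following is a purely illustrative sketch of how a reader interprets that layout; the names are not the actual fields of the writer.

```rust
// `doc_index[d]..doc_index[d + 1]` delimits the slice of `vals` that
// belongs to document `d`.
fn vals_for_doc<'a>(doc_index: &[u64], vals: &'a [u64], doc: usize) -> &'a [u64] {
    let start = doc_index[doc] as usize;
    let stop = doc_index[doc + 1] as usize;
    &vals[start..stop]
}

fn demo() {
    // Three documents holding 2, 0 and 1 values respectively.
    let doc_index = [0u64, 2, 2, 3];
    let vals = [7u64, 9, 11];
    assert_eq!(vals_for_doc(&doc_index, &vals, 0), &[7, 9]);
    assert!(vals_for_doc(&doc_index, &vals, 1).is_empty());
    assert_eq!(vals_for_doc(&doc_index, &vals, 2), &[11]);
}
```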
- pub fn serialize(&self, serializer: &mut FastFieldSerializer, mapping: &HashMap) -> io::Result<()> { + pub fn serialize( + &self, + serializer: &mut FastFieldSerializer, + mapping: &HashMap, + ) -> io::Result<()> { { // writing the offset index - let mut doc_index_serializer = serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?; + let mut doc_index_serializer = + serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?; for &offset in &self.doc_index { doc_index_serializer.add_val(offset)?; } @@ -50,13 +55,13 @@ impl MultiValueIntFastFieldWriter { } { // writing the values themselves. - let mut value_serializer = serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?; + let mut value_serializer = + serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?; for val in &self.vals { value_serializer.add_val(*mapping.get(val).expect("Missing term ordinal") as u64)?; } value_serializer.close_field()?; } Ok(()) - } } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 3e0486a1c..b556e39f3 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -5,7 +5,7 @@ use DocId; use schema::SchemaBuilder; use std::path::Path; use schema::FAST; -use directory::{WritePtr, RAMDirectory, Directory}; +use directory::{Directory, RAMDirectory, WritePtr}; use fastfield::{FastFieldSerializer, FastFieldsWriter}; use schema::FieldType; use std::mem; @@ -88,7 +88,7 @@ impl FastFieldReader for U64FastFieldReader { fn is_enabled(field_type: &FieldType) -> bool { match *field_type { FieldType::U64(ref integer_options) => integer_options.is_fast(), - FieldType::HierarchicalFacet => { true }, + FieldType::HierarchicalFacet => true, _ => false, } } @@ -113,7 +113,6 @@ impl FastFieldReader for U64FastFieldReader { u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field."); amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field."); - } let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); @@ -127,7 +126,6 @@ impl FastFieldReader for U64FastFieldReader { } } - impl From> for U64FastFieldReader { fn from(vals: Vec) -> U64FastFieldReader { let mut schema_builder = SchemaBuilder::default(); @@ -136,22 +134,23 @@ impl From> for U64FastFieldReader { let path = Path::new("__dummy__"); let mut directory: RAMDirectory = RAMDirectory::create(); { - let write: WritePtr = directory.open_write(path).expect( - "With a RAMDirectory, this should never fail.", - ); - let mut serializer = FastFieldSerializer::from_write(write).expect( - "With a RAMDirectory, this should never fail.", - ); + let write: WritePtr = directory + .open_write(path) + .expect("With a RAMDirectory, this should never fail."); + let mut serializer = FastFieldSerializer::from_write(write) + .expect("With a RAMDirectory, this should never fail."); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); { - let fast_field_writer = fast_field_writers.get_field_writer(field).expect( - "With a RAMDirectory, this should never fail.", - ); + let fast_field_writer = fast_field_writers + .get_field_writer(field) + .expect("With a RAMDirectory, this should never fail."); for val in vals { fast_field_writer.add_val(val); } } - fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap(); + fast_field_writers + .serialize(&mut serializer, HashMap::new()) + .unwrap(); serializer.close().unwrap(); } @@ -159,9 +158,9 @@ impl From> for 
U64FastFieldReader { let composite_file = CompositeFile::open(&source).expect("Failed to read the composite file"); - let field_source = composite_file.open_read(field).expect( - "File component not found", - ); + let field_source = composite_file + .open_read(field) + .expect("File component not found"); U64FastFieldReader::open(field_source) } } @@ -222,7 +221,9 @@ impl FastFieldReader for I64FastFieldReader { /// # Panics /// Panics if the data is corrupted. fn open(data: ReadOnlySource) -> I64FastFieldReader { - I64FastFieldReader { underlying: U64FastFieldReader::open(data) } + I64FastFieldReader { + underlying: U64FastFieldReader::open(data), + } } fn is_enabled(field_type: &FieldType) -> bool { diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index afce4f053..bde080e0e 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -35,7 +35,9 @@ impl FastFieldSerializer { pub fn from_write(write: WritePtr) -> io::Result { // just making room for the pointer to header. let composite_write = CompositeWrite::wrap(write); - Ok(FastFieldSerializer { composite_write: composite_write }) + Ok(FastFieldSerializer { + composite_write: composite_write, + }) } /// Start serializing a new u64 fast field @@ -54,12 +56,12 @@ impl FastFieldSerializer { field: Field, min_value: u64, max_value: u64, - idx: usize) -> io::Result>> { + idx: usize, + ) -> io::Result>> { let field_write = self.composite_write.for_field_with_idx(field, idx); FastSingleFieldSerializer::open(field_write, min_value, max_value) } - /// Closes the serializer /// /// After this call the data must be persistently save on disk. diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 780ea9bf2..4ea73e702 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,4 +1,4 @@ -use schema::{Schema, Field, Document, Cardinality}; +use schema::{Cardinality, Document, Field, Schema}; use fastfield::FastFieldSerializer; use std::io; use schema::Value; @@ -25,12 +25,11 @@ impl FastFieldsWriter { for (field_id, field_entry) in schema.fields().iter().enumerate() { let field = Field(field_id as u32); - let default_value = - if let FieldType::I64(_) = *field_entry.field_type() { - common::i64_to_u64(0i64) - } else { - 0u64 - }; + let default_value = if let FieldType::I64(_) = *field_entry.field_type() { + common::i64_to_u64(0i64) + } else { + 0u64 + }; match *field_entry.field_type() { FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => { match int_options.get_fastfield_cardinality() { @@ -50,7 +49,7 @@ impl FastFieldsWriter { let fast_field_writer = MultiValueIntFastFieldWriter::new(field); multi_values_writers.push(fast_field_writer); } - _ => {}, + _ => {} } } FastFieldsWriter { @@ -64,7 +63,7 @@ impl FastFieldsWriter { pub(crate) fn new(fields: Vec) -> FastFieldsWriter { FastFieldsWriter { single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(), - multi_values_writers: vec!(), + multi_values_writers: vec![], } } @@ -73,23 +72,22 @@ impl FastFieldsWriter { // TODO optimize self.single_value_writers .iter_mut() - .find(|field_writer| { - field_writer.field() == field - }) + .find(|field_writer| field_writer.field() == field) } /// Returns the fast field multi-value writer for the given field. /// /// Returns None if the field does not exist, or is not /// configured as a multivalued fastfield in the schema. 
- pub(crate) fn get_multivalue_writer(&mut self, field: Field) -> Option<&mut MultiValueIntFastFieldWriter> { + pub(crate) fn get_multivalue_writer( + &mut self, + field: Field, + ) -> Option<&mut MultiValueIntFastFieldWriter> { // TODO optimize // TODO expose for users self.multi_values_writers .iter_mut() - .find(|multivalue_writer| { - multivalue_writer.field() == field - }) + .find(|multivalue_writer| multivalue_writer.field() == field) } /// Indexes all of the fastfields of a new document. @@ -104,9 +102,11 @@ impl FastFieldsWriter { /// Serializes all of the `FastFieldWriter`s by pushing them in /// order to the fast field serializer. - pub fn serialize(&self, - serializer: &mut FastFieldSerializer, - mapping: HashMap>) -> io::Result<()> { + pub fn serialize( + &self, + serializer: &mut FastFieldSerializer, + mapping: HashMap>, + ) -> io::Result<()> { for field_writer in &self.single_value_writers { field_writer.serialize(serializer)?; } @@ -201,9 +201,9 @@ impl IntFastFieldWriter { /// associated to the document with the `DocId` n. /// (Well, `n-1` actually because of 0-indexing) pub fn add_val(&mut self, val: u64) { - VInt(val).serialize(&mut self.vals).expect( - "unable to serialize VInt to Vec", - ); + VInt(val) + .serialize(&mut self.vals) + .expect("unable to serialize VInt to Vec"); if val > self.val_max { self.val_max = val; @@ -215,7 +215,6 @@ impl IntFastFieldWriter { self.val_count += 1; } - /// Extract the value associated to the fast field for /// this document. /// @@ -228,13 +227,11 @@ impl IntFastFieldWriter { /// only the first one is taken in account. fn extract_val(&self, doc: &Document) -> u64 { match doc.get_first(self.field) { - Some(v) => { - match *v { - Value::U64(ref val) => *val, - Value::I64(ref val) => common::i64_to_u64(*val), - _ => panic!("Expected a u64field, got {:?} ", v), - } - } + Some(v) => match *v { + Value::U64(ref val) => *val, + Value::I64(ref val) => common::i64_to_u64(*val), + _ => panic!("Expected a u64field, got {:?} ", v), + }, None => self.val_if_missing, } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index cb4fdc16e..f3b318356 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -17,7 +17,7 @@ use super::operation::AddOperation; use postings::MultiFieldPostingsWriter; use tokenizer::BoxedTokenizer; use tokenizer::FacetTokenizer; -use tokenizer::{Tokenizer, TokenStream}; +use tokenizer::{TokenStream, Tokenizer}; use schema::Value; /// A `SegmentWriter` is in charge of creating segment index from a @@ -126,11 +126,7 @@ impl<'a> SegmentWriter<'a> { /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. 
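The user-facing path referred to above goes through `IndexWriter::add_document`, which eventually reaches this `SegmentWriter`. A minimal sketch mirroring the tests earlier in this patch; the field name and memory budget are arbitrary assumptions.

```rust
use tantivy::schema::{Document, SchemaBuilder};
use tantivy::{Index, Result};

fn index_a_few_facets() -> Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let facet_field = schema_builder.add_facet_field("facets");
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(30_000_000)?;
    let mut doc = Document::new();
    // Each facet value lands in the multivalued fast field that
    // `SegmentWriter::add_document` fills below.
    doc.add_facet(facet_field, "/category/fiction");
    doc.add_facet(facet_field, "/lang/en");
    index_writer.add_document(doc);
    index_writer.commit()?;
    Ok(())
}
```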
- pub fn add_document( - &mut self, - add_operation: AddOperation, - schema: &Schema, - ) -> io::Result<()> { + pub fn add_document(&mut self, add_operation: AddOperation, schema: &Schema) -> io::Result<()> { let doc_id = self.max_doc; let mut doc = add_operation.document; self.doc_opstamps.push(add_operation.opstamp); @@ -144,17 +140,16 @@ impl<'a> SegmentWriter<'a> { } match *field_options.field_type() { FieldType::HierarchicalFacet => { - let facets: Vec<&[u8]> = field_values.iter() - .flat_map(|field_value| { - match field_value.value() { - &Value::Facet(ref facet) => Some(facet.encoded_bytes()), - _ => { - panic!("Expected hierarchical facet"); - } + let facets: Vec<&[u8]> = field_values + .iter() + .flat_map(|field_value| match field_value.value() { + &Value::Facet(ref facet) => Some(facet.encoded_bytes()), + _ => { + panic!("Expected hierarchical facet"); } }) .collect(); - let mut term = unsafe {Term::with_capacity(100)}; + let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); for facet_bytes in facets { let mut unordered_term_id_opt = None; @@ -163,7 +158,8 @@ impl<'a> SegmentWriter<'a> { .token_stream(&fake_str) .process(&mut |ref token| { term.set_text(&token.text); - let unordered_term_id = self.multifield_postings.subscribe(doc_id, &term); + let unordered_term_id = + self.multifield_postings.subscribe(doc_id, &term); unordered_term_id_opt = Some(unordered_term_id); }); @@ -176,25 +172,26 @@ impl<'a> SegmentWriter<'a> { } } FieldType::Str(_) => { - let num_tokens = - if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] { - let texts: Vec<&str> = field_values - .iter() - .flat_map(|field_value| match *field_value.value() { - Value::Str(ref text) => Some(text.as_str()), - _ => None, - }) - .collect(); - if texts.is_empty() { - 0 - } else { - let mut token_stream = tokenizer.token_stream_texts(&texts[..]); - self.multifield_postings - .index_text(doc_id, field, &mut token_stream) - } - } else { + let num_tokens = if let Some(ref mut tokenizer) = + self.tokenizers[field.0 as usize] + { + let texts: Vec<&str> = field_values + .iter() + .flat_map(|field_value| match *field_value.value() { + Value::Str(ref text) => Some(text.as_str()), + _ => None, + }) + .collect(); + if texts.is_empty() { 0 - }; + } else { + let mut token_stream = tokenizer.token_stream_texts(&texts[..]); + self.multifield_postings + .index_text(doc_id, field, &mut token_stream) + } + } else { + 0 + }; self.fieldnorms_writer .get_field_writer(field) .map(|field_norms_writer| { @@ -226,9 +223,7 @@ impl<'a> SegmentWriter<'a> { } } self.fieldnorms_writer.fill_val_up_to(doc_id); - doc.filter_fields(|field| { - schema.get_field_entry(field).is_stored() - }); + doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); let doc_writer = self.segment_serializer.get_store_writer(); doc_writer.store(&doc)?; self.max_doc += 1; diff --git a/src/lib.rs b/src/lib.rs index 46f537067..ca6c0dfc1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,8 +34,6 @@ extern crate log; #[macro_use] extern crate error_chain; -extern crate regex; -extern crate tempfile; extern crate atomicwrites; extern crate bit_set; extern crate byteorder; @@ -49,11 +47,13 @@ extern crate itertools; extern crate lz4; extern crate num_cpus; extern crate owning_ref; +extern crate regex; extern crate rust_stemmers; extern crate serde; extern crate serde_json; extern crate stable_deref_trait; extern crate tempdir; +extern crate tempfile; extern crate time; extern crate uuid; diff --git a/src/postings/mod.rs 
b/src/postings/mod.rs index 972d38aae..fd86266f4 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -94,39 +94,52 @@ mod tests { index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); - let query = TermQuery::new(Term::from_field_text(title, "abc"), IndexRecordOption::WithFreqsAndPositions); + let query = TermQuery::new( + Term::from_field_text(title, "abc"), + IndexRecordOption::WithFreqsAndPositions, + ); let weight = query.specialized_weight(&*searcher); { - let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap(); + let mut scorer = weight + .specialized_scorer(searcher.segment_reader(0u32)) + .unwrap(); scorer.advance(); - assert_eq!(&[0,1,2], scorer.postings().positions()); + assert_eq!(&[0, 1, 2], scorer.postings().positions()); scorer.advance(); - assert_eq!(&[0,5], scorer.postings().positions()); + assert_eq!(&[0, 5], scorer.postings().positions()); } { - let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap(); + let mut scorer = weight + .specialized_scorer(searcher.segment_reader(0u32)) + .unwrap(); scorer.advance(); scorer.advance(); - assert_eq!(&[0,5], scorer.postings().positions()); + assert_eq!(&[0, 5], scorer.postings().positions()); } { - let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap(); + let mut scorer = weight + .specialized_scorer(searcher.segment_reader(0u32)) + .unwrap(); assert_eq!(scorer.skip_next(1), SkipResult::Reached); assert_eq!(scorer.doc(), 1); - assert_eq!(&[0,5], scorer.postings().positions()); + assert_eq!(&[0, 5], scorer.postings().positions()); } { - let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap(); + let mut scorer = weight + .specialized_scorer(searcher.segment_reader(0u32)) + .unwrap(); assert_eq!(scorer.skip_next(1002), SkipResult::Reached); assert_eq!(scorer.doc(), 1002); - assert_eq!(&[0,5], scorer.postings().positions()); + assert_eq!(&[0, 5], scorer.postings().positions()); } { - let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap(); + let mut scorer = weight + .specialized_scorer(searcher.segment_reader(0u32)) + .unwrap(); assert_eq!(scorer.skip_next(100), SkipResult::Reached); assert_eq!(scorer.skip_next(1002), SkipResult::Reached); assert_eq!(scorer.doc(), 1002); - assert_eq!(&[0,5], scorer.postings().positions()); + assert_eq!(&[0, 5], scorer.postings().positions()); } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index b63e0d527..d5061ebd2 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,15 +1,15 @@ use DocId; use schema::Term; -use postings::{InvertedIndexSerializer, FieldSerializer}; +use postings::{FieldSerializer, InvertedIndexSerializer}; use std::io; use std::collections::HashMap; use postings::Recorder; use Result; -use schema::{Schema, Field}; +use schema::{Field, Schema}; use std::marker::PhantomData; use std::ops::DerefMut; -use datastruct::stacker::{TermHashMap, Heap}; -use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; +use datastruct::stacker::{Heap, TermHashMap}; +use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder}; use schema::FieldEntry; use schema::FieldType; use tokenizer::Token; @@ -17,39 +17,31 @@ use tokenizer::TokenStream; use schema::IndexRecordOption; use postings::UnorderedTermId; - fn posting_from_field_entry<'a>( field_entry: &FieldEntry, heap: &'a Heap, ) -> Box { match 
*field_entry.field_type() { - FieldType::Str(ref text_options) => { - text_options + FieldType::Str(ref text_options) => text_options .get_indexing_options() - .map(|indexing_options| { - match indexing_options.index_option() { - IndexRecordOption::Basic => { - SpecializedPostingsWriter::::new_boxed(heap) - } - IndexRecordOption::WithFreqs => { - SpecializedPostingsWriter::::new_boxed(heap) - } - IndexRecordOption::WithFreqsAndPositions => { - SpecializedPostingsWriter::::new_boxed(heap) - } + .map(|indexing_options| match indexing_options.index_option() { + IndexRecordOption::Basic => { + SpecializedPostingsWriter::::new_boxed(heap) + } + IndexRecordOption::WithFreqs => { + SpecializedPostingsWriter::::new_boxed(heap) + } + IndexRecordOption::WithFreqsAndPositions => { + SpecializedPostingsWriter::::new_boxed(heap) } }) - .unwrap_or_else(|| { - SpecializedPostingsWriter::::new_boxed(heap) - }) + .unwrap_or_else(|| SpecializedPostingsWriter::::new_boxed(heap)), + FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => { + SpecializedPostingsWriter::::new_boxed(heap) } - FieldType::U64(_) | - FieldType::I64(_) | - FieldType::HierarchicalFacet => SpecializedPostingsWriter::::new_boxed(heap), } } - pub struct MultiFieldPostingsWriter<'a> { heap: &'a Heap, term_index: TermHashMap<'a>, @@ -88,7 +80,10 @@ impl<'a> MultiFieldPostingsWriter<'a> { /// It pushes all term, one field at a time, towards the /// postings serializer. #[allow(needless_range_loop)] - pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result>> { + pub fn serialize( + &self, + serializer: &mut InvertedIndexSerializer, + ) -> Result>> { let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect(); term_offsets.sort_by_key(|&(k, _, _)| k); @@ -99,8 +94,8 @@ impl<'a> MultiFieldPostingsWriter<'a> { .map(|(key, _, _)| Term::wrap(key).field()) .enumerate(); - - let mut unordered_term_mappings: HashMap> = HashMap::new(); + let mut unordered_term_mappings: HashMap> = + HashMap::new(); let mut prev_field = Field(u32::max_value()); for (offset, field) in term_offsets_it { @@ -120,8 +115,9 @@ impl<'a> MultiFieldPostingsWriter<'a> { let mut mapping = HashMap::new(); for (term_ord, term_unord_id) in term_offsets[start..stop] .iter() - .map(|&(_,_,bucket)| bucket) - .enumerate() { + .map(|&(_, _, bucket)| bucket) + .enumerate() + { mapping.insert(term_unord_id, term_ord); } unordered_term_mappings.insert(field, mapping); @@ -144,7 +140,6 @@ impl<'a> MultiFieldPostingsWriter<'a> { } } - /// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. /// @@ -168,20 +163,22 @@ pub trait PostingsWriter { /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. - fn serialize(&self, - term_addrs: &[(&[u8], u32, UnorderedTermId)], - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + term_addrs: &[(&[u8], u32, UnorderedTermId)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; /// Tokenize a text and subscribe all of its token. 
- fn index_text<'a>(&mut self, - term_index: &mut TermHashMap, - doc_id: DocId, - field: Field, - token_stream: &mut TokenStream, - heap: &Heap) - -> u32 { + fn index_text<'a>( + &mut self, + term_index: &mut TermHashMap, + doc_id: DocId, + field: Field, + token_stream: &mut TokenStream, + heap: &Heap, + ) -> u32 { let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); let mut sink = |token: &Token| { @@ -215,7 +212,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { } impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> { - fn subscribe( &mut self, term_index: &mut TermHashMap, @@ -237,8 +233,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' term_ord } - - fn serialize( &self, term_addrs: &[(&[u8], u32, UnorderedTermId)], diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 632693cef..da36bf38b 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -9,7 +9,6 @@ use DocId; use std::any::Any; use core::Searcher; - /// Query that matches all of the documents. /// /// All of the document get the score 1f32. @@ -34,12 +33,11 @@ impl Weight for AllWeight { Ok(box AllScorer { started: false, doc: 0u32, - max_doc: reader.max_doc() + max_doc: reader.max_doc(), }) } } - /// Scorer associated to the `AllQuery` query. pub struct AllScorer { started: bool, @@ -51,8 +49,7 @@ impl DocSet for AllScorer { fn advance(&mut self) -> bool { if self.started { self.doc += 1u32; - } - else { + } else { self.started = true; } self.doc < self.max_doc @@ -71,4 +68,4 @@ impl Scorer for AllScorer { fn score(&self) -> Score { 1f32 } -} \ No newline at end of file +} diff --git a/src/query/mod.rs b/src/query/mod.rs index 7177871b5..7303baebb 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -24,4 +24,4 @@ pub use self::scorer::EmptyScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; -pub use self::all_query::{AllQuery, AllWeight, AllScorer}; \ No newline at end of file +pub use self::all_query::{AllQuery, AllScorer, AllWeight}; diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 8e78394cb..b1aaaa6ca 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -6,7 +6,6 @@ pub use self::phrase_query::PhraseQuery; pub use self::phrase_weight::PhraseWeight; pub use self::phrase_scorer::PhraseScorer; - #[cfg(test)] mod tests { @@ -75,8 +74,6 @@ mod tests { assert_eq!(test_query(vec!["g", "a"]), empty_vec); } - - #[test] // motivated by #234 pub fn test_phrase_query_docfreq_order() { let mut schema_builder = SchemaBuilder::default(); @@ -90,11 +87,13 @@ mod tests { let doc = doc!(text_field=>"b"); index_writer.add_document(doc); } - { // 1 + { + // 1 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } - { // 2 + { + // 2 let doc = doc!(text_field=>"b a"); index_writer.add_document(doc); } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 4b7a26095..f6775cb60 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -1,6 +1,6 @@ use query::Scorer; use DocId; -use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings}; +use postings::{DocSet, IntersectionDocSet, Postings, SegmentPostings, SkipResult}; struct PostingsWithOffset { offset: u32, @@ -11,7 +11,7 @@ impl PostingsWithOffset { pub fn new(segment_postings: SegmentPostings, offset: u32) -> 
PostingsWithOffset { PostingsWithOffset { offset, - segment_postings + segment_postings, } } } @@ -49,7 +49,6 @@ pub struct PhraseScorer { } impl PhraseScorer { - pub fn new(term_postings: Vec) -> PhraseScorer { let postings_with_offsets: Vec<_> = term_postings .into_iter() @@ -57,12 +56,11 @@ impl PhraseScorer { .map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32)) .collect(); PhraseScorer { - intersection_docset: IntersectionDocSet::from(postings_with_offsets) + intersection_docset: IntersectionDocSet::from(postings_with_offsets), } } fn phrase_match(&self) -> bool { - // TODO maybe we could avoid decoding positions lazily for all terms // when there is > 2 terms. // @@ -74,7 +72,6 @@ impl PhraseScorer { positions_arr[docset.offset as usize] = docset.positions(); } - let num_postings = positions_arr.len() as u32; let mut ord = 1u32; diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 5c73bd2a0..b70ba3747 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -23,7 +23,8 @@ impl Weight for PhraseWeight { for term in &self.phrase_terms { if let Some(postings) = reader .inverted_index(term.field()) - .read_postings(term, IndexRecordOption::WithFreqsAndPositions) { + .read_postings(term, IndexRecordOption::WithFreqsAndPositions) + { term_postings_list.push(postings); } else { return Ok(box EmptyScorer); diff --git a/src/schema/document.rs b/src/schema/document.rs index 7c15d9c16..82a07f184 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -21,9 +21,7 @@ pub struct Document { impl From> for Document { fn from(field_values: Vec) -> Self { - Document { - field_values - } + Document { field_values } } } @@ -38,7 +36,6 @@ impl PartialEq for Document { } } - impl Eq for Document {} impl Document { @@ -59,14 +56,16 @@ impl Document { /// Retain only the field that are matching the /// predicate given in argument. - pub fn filter_fieldsbool>(&mut self, predicate: P) { + pub fn filter_fields bool>(&mut self, predicate: P) { self.field_values .retain(|field_value| predicate(field_value.field())); } /// Adding a facet to the document. pub fn add_facet(&mut self, field: Field, path: F) - where Facet: From { + where + Facet: From, + { let facet = Facet::from(path); let value = Value::Facet(facet); self.add(FieldValue::new(field, value)); @@ -144,9 +143,7 @@ impl BinarySerializable for Document { fn deserialize(reader: &mut R) -> io::Result { let num_field_values = VInt::deserialize(reader)?.val() as usize; let field_values = (0..num_field_values) - .map(|_| { - FieldValue::deserialize(reader) - }) + .map(|_| FieldValue::deserialize(reader)) .collect::>>()?; Ok(Document::from(field_values)) } diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 9c67fb8bb..15e2dca65 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -1,4 +1,4 @@ -use std::fmt::{self, Display, Debug, Formatter}; +use std::fmt::{self, Debug, Display, Formatter}; use std::str; use std::io::{self, Read, Write}; use regex::Regex; @@ -7,7 +7,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::Cow; use common::BinarySerializable; - const SLASH_BYTE: u8 = '/' as u8; const ESCAPE_BYTE: u8 = '\\' as u8; @@ -30,7 +29,6 @@ pub const FACET_SEP_BYTE: u8 = 0u8; pub struct Facet(Vec); impl Facet { - /// Returns a new instance of the "root facet" /// Equivalent to `/`. 
pub fn root() -> Facet { @@ -66,7 +64,9 @@ impl Facet { /// contains a `/` or a `\`, it should be escaped /// using an anti-slash `/`. pub fn from_text<'a, T>(path: &'a T) -> Facet - where T: ?Sized + AsRef { + where + T: ?Sized + AsRef, + { From::from(path) } @@ -75,9 +75,10 @@ impl Facet { /// /// The steps are expected to be unescaped. pub fn from_path(path: Path) -> Facet - where - Path: IntoIterator, - Path::Item: ToString { + where + Path: IntoIterator, + Path::Item: ToString, + { let mut facet_bytes: Vec = Vec::with_capacity(100); let mut step_it = path.into_iter(); if let Some(step) = step_it.next() { @@ -95,7 +96,6 @@ impl Facet { &mut self.0 } - /// Returns `true` iff other is a subfacet of `self`. pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_bytes: &[u8] = self.encoded_bytes(); @@ -116,7 +116,6 @@ impl Borrow<[u8]> for Facet { } impl<'a, T: ?Sized + AsRef> From<&'a T> for Facet { - fn from(path_asref: &'a T) -> Facet { #[derive(Copy, Clone)] enum State { @@ -129,9 +128,7 @@ impl<'a, T: ?Sized + AsRef> From<&'a T> for Facet { let path_bytes = path.as_bytes(); for &c in &path_bytes[1..] { match (state, c) { - (State::Idle, ESCAPE_BYTE) => { - state = State::Escaped - } + (State::Idle, ESCAPE_BYTE) => state = State::Escaped, (State::Idle, SLASH_BYTE) => { facet_encoded.push(FACET_SEP_BYTE); } @@ -179,16 +176,19 @@ fn escape_slashes(s: &str) -> Cow { impl Serialize for Facet { fn serialize(&self, serializer: S) -> Result - where S: Serializer { + where + S: Serializer, + { serializer.serialize_str(&self.to_string()) } } impl<'de> Deserialize<'de> for Facet { - fn deserialize(deserializer: D) -> Result where - D: Deserializer<'de> { - <&'de str as Deserialize<'de>>::deserialize(deserializer) - .map(Facet::from) + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + <&'de str as Deserialize<'de>>::deserialize(deserializer).map(Facet::from) } } @@ -199,7 +199,6 @@ impl Debug for Facet { } } - #[cfg(test)] mod tests { @@ -226,7 +225,6 @@ mod tests { } } - #[test] fn test_facet_debug() { let v = ["first", "second", "third"]; @@ -234,4 +232,4 @@ mod tests { assert_eq!(format!("{:?}", facet), "Facet(/first/second/third)"); } -} \ No newline at end of file +} diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 5d5d49273..f120375bd 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -71,7 +71,7 @@ impl FieldEntry { match self.field_type { FieldType::Str(ref options) => options.get_indexing_options().is_some(), FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(), - FieldType::HierarchicalFacet => true + FieldType::HierarchicalFacet => true, } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 3a39490b3..3069a3b37 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -39,7 +39,7 @@ impl FieldType { FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => { int_options.is_indexed() } - FieldType::HierarchicalFacet => true + FieldType::HierarchicalFacet => true, } } @@ -59,7 +59,7 @@ impl FieldType { None } } - FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic) + FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic), } } @@ -75,9 +75,7 @@ impl FieldType { FieldType::U64(_) | FieldType::I64(_) => Err(ValueParsingError::TypeError( format!("Expected an integer, got {:?}", json), )), - FieldType::HierarchicalFacet => { - Ok(Value::Facet(Facet::from(field_text))) - } + FieldType::HierarchicalFacet => 
Ok(Value::Facet(Facet::from(field_text))), }, JsonValue::Number(ref field_val_num) => match *self { FieldType::I64(_) => { diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index f47d79319..cd1fd8a22 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -1,6 +1,5 @@ use std::ops::BitOr; - /// Express whether a field is single-value or multi-valued. #[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum Cardinality { @@ -10,15 +9,14 @@ pub enum Cardinality { /// The document can have any number of values associated to the document. /// This is more memory and CPU expensive than the SingleValue solution. #[serde(rename = "multi")] - MultiValues + MultiValues, } /// Define how an int field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntOptions { indexed: bool, - #[serde(skip_serializing_if="Option::is_none")] - fast: Option, + #[serde(skip_serializing_if = "Option::is_none")] fast: Option, stored: bool, } @@ -86,7 +84,6 @@ impl Default for IntOptions { } } - /// Shortcut for a u64 fast field. /// /// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED` @@ -114,7 +111,6 @@ pub const INT_STORED: IntOptions = IntOptions { fast: None, }; - impl BitOr for IntOptions { type Output = IntOptions; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 864992790..b93944b8e 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -334,8 +334,12 @@ mod tests { #[test] pub fn test_schema_serialization() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue); - let popularity_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue); + let count_options = IntOptions::default() + .set_stored() + .set_fast(Cardinality::SingleValue); + let popularity_options = IntOptions::default() + .set_stored() + .set_fast(Cardinality::SingleValue); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); schema_builder.add_u64_field("count", count_options); @@ -399,7 +403,9 @@ mod tests { #[test] pub fn test_document_to_json() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue); + let count_options = IntOptions::default() + .set_stored() + .set_fast(Cardinality::SingleValue); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); schema_builder.add_u64_field("count", count_options); @@ -418,8 +424,12 @@ mod tests { #[test] pub fn test_parse_document() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue); - let popularity_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue); + let count_options = IntOptions::default() + .set_stored() + .set_fast(Cardinality::SingleValue); + let popularity_options = IntOptions::default() + .set_stored() + .set_fast(Cardinality::SingleValue); let title_field = schema_builder.add_text_field("title", TEXT); let author_field = schema_builder.add_text_field("author", STRING); let count_field = schema_builder.add_u64_field("count", count_options); diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 18202847e..401ed65a9 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -44,7 +44,7 @@ impl Default for 
TextOptions { /// Configuration defining indexing for a text field. /// It wraps: -/// +/// /// * record (See [`IndexRecordOption`](./enum.IndexRecordOption.html)) /// * tokenizer #[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)] diff --git a/src/schema/value.rs b/src/schema/value.rs index eaf66f101..90e573ade 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -26,7 +26,7 @@ impl Serialize for Value { Value::Str(ref v) => serializer.serialize_str(v), Value::U64(u) => serializer.serialize_u64(u), Value::I64(u) => serializer.serialize_i64(u), - Value::Facet(ref facet) => facet.serialize(serializer) + Value::Facet(ref facet) => facet.serialize(serializer), } } } @@ -178,9 +178,7 @@ mod binary_serialize { let value = i64::deserialize(reader)?; Ok(Value::I64(value)) } - HIERARCHICAL_FACET_CODE => { - Ok(Value::Facet(Facet::deserialize(reader)?)) - } + HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)), _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code), diff --git a/src/store/reader.rs b/src/store/reader.rs index 4ed8cdab2..7f4343f8f 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -39,9 +39,7 @@ impl StoreReader { } fn block_offset(&self, doc_id: DocId) -> (DocId, u64) { - self.block_index() - .seek(doc_id + 1) - .unwrap_or((0u32, 0u64)) + self.block_index().seek(doc_id + 1).unwrap_or((0u32, 0u64)) } pub(crate) fn block_data(&self) -> &[u8] { diff --git a/src/store/writer.rs b/src/store/writer.rs index 1742a3d00..402158278 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,6 +1,6 @@ use directory::WritePtr; use DocId; -use common::{VInt, BinarySerializable}; +use common::{BinarySerializable, VInt}; use std::io::{self, Write}; use super::StoreReader; use lz4; @@ -66,10 +66,8 @@ impl StoreWriter { pub fn stack(&mut self, store_reader: &StoreReader) -> io::Result<()> { if !self.current_block.is_empty() { self.write_and_compress_block()?; - self.offset_index_writer.insert( - self.doc, - &(self.writer.written_bytes() as u64), - )?; + self.offset_index_writer + .insert(self.doc, &(self.writer.written_bytes() as u64))?; } let doc_offset = self.doc; let start_offset = self.writer.written_bytes() as u64; @@ -81,9 +79,8 @@ impl StoreWriter { // its start doc id and its start file offset. 
for (next_doc_id, block_addr) in store_reader.block_index() { self.doc = doc_offset + next_doc_id; - self.offset_index_writer.insert( - self.doc, - &(start_offset + block_addr))?; + self.offset_index_writer + .insert(self.doc, &(start_offset + block_addr))?; } Ok(()) } diff --git a/src/termdict/fstdict/streamer.rs b/src/termdict/fstdict/streamer.rs index 9af6d42e2..4ebc3630c 100644 --- a/src/termdict/fstdict/streamer.rs +++ b/src/termdict/fstdict/streamer.rs @@ -1,8 +1,8 @@ use fst::{IntoStreamer, Streamer}; -use fst::map::{StreamBuilder, Stream}; +use fst::map::{Stream, StreamBuilder}; use postings::TermInfo; use super::TermDictionaryImpl; -use termdict::{TermOrdinal, TermDictionary, TermStreamerBuilder, TermStreamer}; +use termdict::{TermDictionary, TermOrdinal, TermStreamer, TermStreamerBuilder}; /// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html) pub struct TermStreamerBuilderImpl<'a> { @@ -53,7 +53,6 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> { } } - /// See [`TermStreamer`](./trait.TermStreamer.html) pub struct TermStreamerImpl<'a> { fst_map: &'a TermDictionaryImpl, @@ -88,4 +87,3 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> { &self.current_value } } - diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 9ae69d914..96c2be3f9 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -6,7 +6,7 @@ use common::BinarySerializable; use schema::FieldType; use postings::TermInfo; use termdict::{TermDictionary, TermDictionaryBuilder, TermOrdinal}; -use super::{TermStreamerImpl, TermStreamerBuilderImpl}; +use super::{TermStreamerBuilderImpl, TermStreamerImpl}; fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -95,7 +95,6 @@ pub struct TermDictionaryImpl { values_mmap: ReadOnlySource, } - impl<'a> TermDictionary<'a> for TermDictionaryImpl { type Streamer = TermStreamerImpl<'a>; @@ -105,9 +104,8 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { let total_len = source.len(); let length_offset = total_len - 4; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = u32::deserialize(&mut split_len_buffer).expect( - "Deserializing 4 bytes should always work", - ) as usize; + let footer_size = u32::deserialize(&mut split_len_buffer) + .expect("Deserializing 4 bytes should always work") as usize; let split_len = length_offset - footer_size; let fst_source = source.slice(0, split_len); let values_source = source.slice(split_len, length_offset); @@ -128,16 +126,14 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { let mut node = fst.root(); while ord != 0 || !node.is_final() { if let Some(transition) = node.transitions() - .take_while(|transition| { - transition.out.value() <= ord - }) - .last() { + .take_while(|transition| transition.out.value() <= ord) + .last() + { ord -= transition.out.value(); bytes.push(transition.inp); let new_node_addr = transition.addr; node = fst.node(new_node_addr); - } - else { + } else { return false; } } diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index ef07aae86..5f84d933a 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -48,34 +48,30 @@ followed by a streaming through at most `1024` elements in the term `stream`. */ -use schema::{Term, Field, FieldType}; +use schema::{Field, FieldType, Term}; use directory::ReadOnlySource; use postings::TermInfo; - /// Position of the term in the sorted list of terms. 
pub type TermOrdinal = u64; - pub use self::merger::TermMerger; #[cfg(not(feature = "streamdict"))] mod fstdict; #[cfg(not(feature = "streamdict"))] -pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, - TermStreamerBuilderImpl}; - +pub use self::fstdict::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerBuilderImpl, + TermStreamerImpl}; #[cfg(feature = "streamdict")] mod streamdict; #[cfg(feature = "streamdict")] -pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl, - TermStreamerBuilderImpl}; +pub use self::streamdict::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerBuilderImpl, + TermStreamerImpl}; mod merger; use std::io; - /// Dictionary associating sorted `&[u8]` to values pub trait TermDictionary<'a> where @@ -154,7 +150,6 @@ where fn finish(self) -> io::Result; } - /// `TermStreamer` acts as a cursor over a range of terms of a segment. /// Terms are guaranteed to be sorted. pub trait TermStreamer: Sized { @@ -202,7 +197,6 @@ pub trait TermStreamer: Sized { } } - /// `TermStreamerBuilder` is an helper object used to define /// a range of terms that should be streamed. pub trait TermStreamerBuilder { @@ -226,13 +220,12 @@ pub trait TermStreamerBuilder { fn into_stream(self) -> Self::Streamer; } - #[cfg(test)] mod tests { - use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl}; - use directory::{RAMDirectory, Directory, ReadOnlySource}; + use super::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerImpl}; + use directory::{Directory, RAMDirectory, ReadOnlySource}; use std::path::PathBuf; - use schema::{FieldType, SchemaBuilder, Document, TEXT}; + use schema::{Document, FieldType, SchemaBuilder, TEXT}; use core::Index; use std::str; use termdict::TermStreamer; @@ -243,17 +236,15 @@ mod tests { const BLOCK_SIZE: usize = 1_500; - fn make_term_info(val: u64) -> TermInfo { TermInfo { doc_freq: val as u32, - positions_offset: val * 2u64, + positions_offset: val * 2u64, postings_offset: val * 3u64, positions_inner_offset: 5u8, } } - #[test] fn test_term_ordinals() { const COUNTRIES: [&'static str; 7] = [ @@ -263,15 +254,15 @@ mod tests { "Slovenia", "Spain", "Sweden", - "Switzerland" + "Switzerland", ]; let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(write, field_type).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) @@ -283,7 +274,7 @@ mod tests { let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source); for (term_ord, term) in COUNTRIES.iter().enumerate() { assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); - let mut bytes = vec!(); + let mut bytes = vec![]; assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes)); assert_eq!(bytes, term.as_bytes()); } @@ -296,8 +287,8 @@ mod tests { { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(write, field_type).unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u64)) .unwrap(); @@ -377,7 +368,6 @@ 
mod tests { assert_eq!(&*term_string, "abcdef"); } - #[test] fn test_term_dictionary_stream() { let ids: Vec<_> = (0u32..10_000u32) @@ -385,8 +375,8 @@ mod tests { .collect(); let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -411,13 +401,12 @@ mod tests { term_dictionary.get(key.as_bytes()); } - #[test] fn test_stream_high_range_prefix_suffix() { let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); // term requires more than 16bits term_dictionary_builder .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) @@ -451,8 +440,8 @@ mod tests { .collect(); let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -520,14 +509,15 @@ mod tests { fn test_empty_string() { let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); + term_dictionary_builder + .insert(&[], &make_term_info(1 as u64)) .unwrap(); term_dictionary_builder - .insert(&[], &make_term_info(1 as u64)).unwrap(); - term_dictionary_builder - .insert(&[1u8], &make_term_info(2 as u64)).unwrap(); - term_dictionary_builder - .finish().unwrap() + .insert(&[1u8], &make_term_info(2 as u64)) + .unwrap(); + term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source); @@ -543,8 +533,8 @@ mod tests { fn test_stream_range_boundaries() { let field_type = FieldType::Str(TEXT); let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type) - .unwrap(); + let mut term_dictionary_builder = + TermDictionaryBuilderImpl::new(vec![], field_type).unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; term_dictionary_builder diff --git a/src/termdict/streamdict/delta_encoder.rs b/src/termdict/streamdict/delta_encoder.rs index cf47105f5..b7e64f3cf 100644 --- a/src/termdict/streamdict/delta_encoder.rs +++ b/src/termdict/streamdict/delta_encoder.rs @@ -49,7 +49,6 @@ impl TermDeltaDecoder { } } - // code // first bit represents whether the prefix / suffix len can be encoded // on the same byte. 
(the next one) @@ -57,18 +56,17 @@ impl TermDeltaDecoder { #[inline(always)] pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] { - let (prefix_len, suffix_len): (usize, usize) = - if (code & 1u8) == 1u8 { - let b = cursor[0]; - cursor = &cursor[1..]; - let prefix_len = (b & 15u8) as usize; - let suffix_len = (b >> 4u8) as usize; - (prefix_len, suffix_len) - } else { - let prefix_len = u32::deserialize(&mut cursor).unwrap(); - let suffix_len = u32::deserialize(&mut cursor).unwrap(); - (prefix_len as usize, suffix_len as usize) - }; + let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 { + let b = cursor[0]; + cursor = &cursor[1..]; + let prefix_len = (b & 15u8) as usize; + let suffix_len = (b >> 4u8) as usize; + (prefix_len, suffix_len) + } else { + let prefix_len = u32::deserialize(&mut cursor).unwrap(); + let suffix_len = u32::deserialize(&mut cursor).unwrap(); + (prefix_len as usize, suffix_len as usize) + }; unsafe { self.term.set_len(prefix_len) }; self.term.extend_from_slice(&(*cursor)[..suffix_len]); &cursor[suffix_len..] diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index c3bcab050..b5c70178c 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -6,32 +6,28 @@ use super::{Token, TokenFilter, TokenStream}; pub struct AlphaNumOnlyFilter; pub struct AlphaNumOnlyFilterStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { tail: TailTokenStream, } - impl AlphaNumOnlyFilterStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { fn predicate(&self, token: &Token) -> bool { token.text.chars().all(|c| c.is_ascii_alphanumeric()) } - fn wrap( - tail: TailTokenStream, - ) -> AlphaNumOnlyFilterStream { - AlphaNumOnlyFilterStream { - tail - } + fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream { + AlphaNumOnlyFilterStream { tail } } } - impl TokenFilter for AlphaNumOnlyFilter - where - TailTokenStream: TokenStream, +where + TailTokenStream: TokenStream, { type ResultTokenStream = AlphaNumOnlyFilterStream; @@ -41,8 +37,8 @@ impl TokenFilter for AlphaNumOnlyFilter } impl TokenStream for AlphaNumOnlyFilterStream - where - TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { fn token(&self) -> &Token { self.tail.token() diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index b6138ec7a..982c35f7b 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -1,8 +1,7 @@ -use super::{Token, Tokenizer, TokenStream}; +use super::{Token, TokenStream, Tokenizer}; use std::str; use schema::FACET_SEP_BYTE; - /// The `FacetTokenizer` process a `Facet` binary representation /// and emits a token for all of its parent. /// @@ -39,27 +38,27 @@ impl<'a> Tokenizer<'a> for FacetTokenizer { } } - impl<'a> TokenStream for FacetTokenStream<'a> { fn advance(&mut self) -> bool { match self.state { State::RootFacetNotEmitted => { - self.state = - if self.text.is_empty() { - State::Terminated - } else { - State::UpToPosition(0) - }; + self.state = if self.text.is_empty() { + State::Terminated + } else { + State::UpToPosition(0) + }; true } State::UpToPosition(cursor) => { let bytes: &[u8] = self.text.as_bytes(); - if let Some(next_sep_pos) = bytes[cursor+1..] + if let Some(next_sep_pos) = bytes[cursor + 1..] 
.iter() .cloned() .position(|b| b == FACET_SEP_BYTE) - .map(|pos| cursor + 1 + pos) { - let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) }; + .map(|pos| cursor + 1 + pos) + { + let facet_part = + unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) }; self.token.text.push_str(facet_part); self.state = State::UpToPosition(next_sep_pos); } else { @@ -69,9 +68,7 @@ impl<'a> TokenStream for FacetTokenStream<'a> { } true } - State::Terminated => { - false - } + State::Terminated => false, } } @@ -87,7 +84,7 @@ impl<'a> TokenStream for FacetTokenStream<'a> { #[cfg(test)] mod tests { - use tokenizer::{TokenStream, Token, Tokenizer}; + use tokenizer::{Token, TokenStream, Tokenizer}; use super::FacetTokenizer; use schema::Facet; @@ -101,7 +98,9 @@ mod tests { tokens.push(format!("{}", facet)); }; FacetTokenizer - .token_stream(unsafe { ::std::str::from_utf8_unchecked(facet.encoded_bytes()) }) + .token_stream(unsafe { + ::std::str::from_utf8_unchecked(facet.encoded_bytes()) + }) .process(&mut add_token); } assert_eq!(tokens.len(), 4); @@ -121,10 +120,12 @@ mod tests { tokens.push(format!("{}", facet)); }; FacetTokenizer - .token_stream(unsafe { ::std::str::from_utf8_unchecked(facet.encoded_bytes()) }) + .token_stream(unsafe { + ::std::str::from_utf8_unchecked(facet.encoded_bytes()) + }) .process(&mut add_token); } assert_eq!(tokens.len(), 1); assert_eq!(tokens[0], "/"); } -} \ No newline at end of file +} diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 2909bcdd1..bbd2d0d41 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -253,7 +253,6 @@ where } } - /// Trait for the pluggable components of `Tokenizer`s. pub trait TokenFilter: Clone { /// The resulting `TokenStream` type. From 920f086e1d2f3df13c0adba4c33193b4aa6d1074 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 3 Feb 2018 11:46:01 +0900 Subject: [PATCH 3/7] Clippy --- src/collector/facet_collector.rs | 11 +++++++---- src/datastruct/stacker/hashmap.rs | 14 +++++++------- src/directory/mmap_directory.rs | 14 +++++++------- src/fastfield/mod.rs | 18 +++++++++--------- src/fastfield/reader.rs | 2 +- src/fastfield/writer.rs | 2 +- src/indexer/index_writer.rs | 9 +++------ src/indexer/segment_updater.rs | 2 +- src/indexer/segment_writer.rs | 21 +++++++++++---------- src/postings/postings_writer.rs | 2 +- src/schema/facet.rs | 7 ++++--- src/store/writer.rs | 2 +- src/termdict/fstdict/termdict.rs | 4 ++-- 13 files changed, 55 insertions(+), 53 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 41dd0bcf5..6bc2a2246 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -217,7 +217,7 @@ fn skip<'a, I: Iterator>( ) -> SkipResult { loop { match collapse_it.peek() { - Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(&target) { + Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) { Ordering::Less => {} Ordering::Greater => { return SkipResult::OverStep; @@ -274,7 +274,7 @@ impl FacetCollector { "Tried to add a facet which is a descendant of an already added facet." ); assert!( - !facet.is_prefix_of(&old_facet), + !facet.is_prefix_of(old_facet), "Tried to add a facet which is an ancestor of an already added facet." 
); } @@ -305,7 +305,8 @@ impl FacetCollector { let depth = facet_depth(facet_streamer.key()); if depth <= collapse_depth { continue 'outer; - } else if depth == collapse_depth + 1 { + } + if depth == collapse_depth + 1 { collapsed_id = self.current_collapse_facet_ords.len(); self.current_collapse_facet_ords .push(facet_streamer.term_ord()); @@ -428,6 +429,8 @@ pub struct FacetCounts { } impl FacetCounts { + + #[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator where Facet: From, @@ -455,7 +458,7 @@ impl FacetCounts { let mut heap = BinaryHeap::with_capacity(k); let mut it = self.get(facet); - for (ref facet, count) in (&mut it).take(k) { + for (facet, count) in (&mut it).take(k) { heap.push(Hit { count, facet }); } diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 18025b657..6e804889b 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -10,7 +10,7 @@ mod murmurhash2 { #[inline(always)] pub fn murmurhash2(key: &[u8]) -> u32 { let mut key_ptr: *const u32 = key.as_ptr() as *const u32; - let m: u32 = 0x5bd1e995; + let m: u32 = 0x5bd1_e995; let r = 24; let len = key.len() as u32; @@ -31,18 +31,18 @@ mod murmurhash2 { let key_ptr_u8: *const u8 = key_ptr as *const u8; match remaining { 3 => { - h ^= unsafe { *key_ptr_u8.wrapping_offset(2) as u32 } << 16; - h ^= unsafe { *key_ptr_u8.wrapping_offset(1) as u32 } << 8; - h ^= unsafe { *key_ptr_u8 as u32 }; + h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16; + h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8; + h ^= unsafe { u32::from(*key_ptr_u8) }; h = h.wrapping_mul(m); } 2 => { - h ^= unsafe { *key_ptr_u8.wrapping_offset(1) as u32 } << 8; - h ^= unsafe { *key_ptr_u8 as u32 }; + h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8; + h ^= unsafe { u32::from(*key_ptr_u8) }; h = h.wrapping_mul(m); } 1 => { - h ^= unsafe { *key_ptr_u8 as u32 }; + h ^= unsafe { u32::from(*key_ptr_u8) }; h = h.wrapping_mul(m); } _ => {} diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index bb7a99bf1..4075ddf26 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -23,10 +23,10 @@ use tempdir::TempDir; /// Returns None iff the file exists, can be read, but is empty (and hence /// cannot be mmapped). /// -fn open_mmap(full_path: &PathBuf) -> result::Result, OpenReadError> { - let file = File::open(&full_path).map_err(|e| { +fn open_mmap(full_path: &Path) -> result::Result, OpenReadError> { + let file = File::open(full_path).map_err(|e| { if e.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.clone()) + OpenReadError::FileDoesNotExist(full_path.to_owned()) } else { OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e)) } @@ -88,8 +88,8 @@ impl MmapCache { } } - fn get_mmap(&mut self, full_path: PathBuf) -> Result, OpenReadError> { - Ok(match self.cache.entry(full_path.clone()) { + fn get_mmap(&mut self, full_path: &Path) -> Result, OpenReadError> { + Ok(match self.cache.entry(full_path.to_owned()) { HashMapEntry::Occupied(occupied_entry) => { let mmap = occupied_entry.get(); self.counters.hit += 1; @@ -97,7 +97,7 @@ impl MmapCache { } HashMapEntry::Vacant(vacant_entry) => { self.counters.miss += 1; - if let Some(mmap) = open_mmap(&full_path)? { + if let Some(mmap) = open_mmap(full_path)? 
{ vacant_entry.insert(mmap.clone()); Some(mmap) } else { @@ -252,7 +252,7 @@ impl Directory for MmapDirectory { })?; Ok(mmap_cache - .get_mmap(full_path)? + .get_mmap(&full_path)? .map(ReadOnlySource::Mmap) .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 0afc50c7e..6dcf2b480 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -96,7 +96,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64); fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -132,7 +132,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64); add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64); fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -169,7 +169,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -206,7 +206,7 @@ mod tests { ); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -247,7 +247,7 @@ mod tests { fast_field_writers.add_document(&doc); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -288,7 +288,7 @@ mod tests { let doc = Document::default(); fast_field_writers.add_document(&doc); fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -324,7 +324,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -381,7 +381,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } @@ -415,7 +415,7 @@ mod tests { add_single_field_doc(&mut fast_field_writers, *FIELD, *x); } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index b556e39f3..1142c25d8 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -149,7 +149,7 @@ impl From> for U64FastFieldReader { } } fast_field_writers - .serialize(&mut serializer, HashMap::new()) + .serialize(&mut serializer, &HashMap::new()) .unwrap(); serializer.close().unwrap(); } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 4ea73e702..7248b93e1 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -105,7 +105,7 @@ impl FastFieldsWriter { pub fn serialize( &self, serializer: &mut FastFieldSerializer, - mapping: HashMap>, + mapping: &HashMap>, ) -> io::Result<()> { for field_writer in &self.single_value_writers { field_writer.serialize(serializer)?; diff --git a/src/indexer/index_writer.rs 
b/src/indexer/index_writer.rs index d47c7fdc6..d1a14b5e8 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -23,7 +23,6 @@ use indexer::SegmentWriter; use postings::DocSet; use schema::IndexRecordOption; use schema::Document; -use schema::Schema; use schema::Term; use std::mem; use std::mem::swap; @@ -250,17 +249,17 @@ fn index_documents( heap: &mut Heap, table_size: usize, segment: &Segment, - schema: &Schema, generation: usize, document_iterator: &mut Iterator, segment_updater: &mut SegmentUpdater, mut delete_cursor: DeleteCursor, ) -> Result { heap.clear(); + let schema = segment.schema(); let segment_id = segment.id(); - let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?; + let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?; for doc in document_iterator { - segment_writer.add_document(doc, schema)?; + segment_writer.add_document(doc, &schema)?; // There is two possible conditions to close the segment. // One is the memory arena dedicated to the segment is // getting full. @@ -368,7 +367,6 @@ impl IndexWriter { /// The thread consumes documents from the pipeline. /// fn add_indexing_worker(&mut self) -> Result<()> { - let schema = self.index.schema(); let document_receiver_clone = self.document_receiver.clone(); let mut segment_updater = self.segment_updater.clone(); let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread); @@ -409,7 +407,6 @@ impl IndexWriter { &mut heap, table_size, &segment, - &schema, generation, &mut document_iterator, &mut segment_updater, diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 6dc54a4dd..a0682e646 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -69,7 +69,7 @@ pub fn save_metas( segments: segment_metas, schema, opstamp, - payload: payload.clone(), + payload, }; let mut buffer = serde_json::to_vec_pretty(&metas)?; write!(&mut buffer, "\n")?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index f3b318356..3c6134e9f 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -142,12 +142,13 @@ impl<'a> SegmentWriter<'a> { FieldType::HierarchicalFacet => { let facets: Vec<&[u8]> = field_values .iter() - .flat_map(|field_value| match field_value.value() { - &Value::Facet(ref facet) => Some(facet.encoded_bytes()), - _ => { - panic!("Expected hierarchical facet"); - } - }) + .flat_map(|field_value| + match *field_value.value() { + Value::Facet(ref facet) => Some(facet.encoded_bytes()), + _ => { + panic!("Expected hierarchical facet"); + } + }) .collect(); let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); @@ -155,8 +156,8 @@ impl<'a> SegmentWriter<'a> { let mut unordered_term_id_opt = None; let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) }; FacetTokenizer - .token_stream(&fake_str) - .process(&mut |ref token| { + .token_stream(fake_str) + .process(&mut |token| { term.set_text(&token.text); let unordered_term_id = self.multifield_postings.subscribe(doc_id, &term); @@ -259,8 +260,8 @@ fn write( mut serializer: SegmentSerializer, ) -> Result<()> { let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?; - fast_field_writers.serialize(serializer.get_fast_field_serializer(), term_ord_map)?; - fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer(), HashMap::new())?; + 
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?; + fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer(), &HashMap::new())?; serializer.close()?; Ok(()) diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index d5061ebd2..7d0e1168a 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -171,7 +171,7 @@ pub trait PostingsWriter { ) -> io::Result<()>; /// Tokenize a text and subscribe all of its token. - fn index_text<'a>( + fn index_text( &mut self, term_index: &mut TermHashMap, doc_id: DocId, diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 15e2dca65..061848df4 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -7,8 +7,8 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::Cow; use common::BinarySerializable; -const SLASH_BYTE: u8 = '/' as u8; -const ESCAPE_BYTE: u8 = '\\' as u8; +const SLASH_BYTE: u8 = b'/'; +const ESCAPE_BYTE: u8 = b'\\'; /// BYTE used as a level separation in the binary /// representation of facets. @@ -63,7 +63,7 @@ impl Facet { /// It is conceptually, if one of the steps of this path /// contains a `/` or a `\`, it should be escaped /// using an anti-slash `/`. - pub fn from_text<'a, T>(path: &'a T) -> Facet + pub fn from_text(path: &T) -> Facet where T: ?Sized + AsRef, { @@ -97,6 +97,7 @@ impl Facet { } /// Returns `true` iff other is a subfacet of `self`. + #[allow(collapsible_if)] pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_bytes: &[u8] = self.encoded_bytes(); let other_bytes: &[u8] = other.encoded_bytes(); diff --git a/src/store/writer.rs b/src/store/writer.rs index 402158278..34261c4cb 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -46,7 +46,7 @@ impl StoreWriter { /// The document id is implicitely the number of times /// this method has been called. 
/// - pub fn store<'a>(&mut self, stored_document: &Document) -> io::Result<()> { + pub fn store(&mut self, stored_document: &Document) -> io::Result<()> { self.intermediary_buffer.clear(); stored_document.serialize(&mut self.intermediary_buffer)?; let doc_num_bytes = self.intermediary_buffer.len(); diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 96c2be3f9..5b56e64df 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -62,7 +62,7 @@ where fn insert>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> { let key = key_ref.as_ref(); - self.insert_key(key.as_ref())?; + self.insert_key(key)?; self.insert_value(value)?; Ok(()) } @@ -137,7 +137,7 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { return false; } } - return true; + true } fn term_ord>(&self, key: K) -> Option { From 6a104e4f696d47a9aac7639fad5f2654970dd0e9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 3 Feb 2018 11:59:34 +0900 Subject: [PATCH 4/7] Cargo fmt --- src/collector/facet_collector.rs | 1 - src/indexer/index_writer.rs | 3 ++- src/indexer/segment_writer.rs | 27 ++++++++++++--------------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 6bc2a2246..13d50e161 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -429,7 +429,6 @@ pub struct FacetCounts { } impl FacetCounts { - #[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator where diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index d1a14b5e8..36b6b829f 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -257,7 +257,8 @@ fn index_documents( heap.clear(); let schema = segment.schema(); let segment_id = segment.id(); - let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?; + let mut segment_writer = + SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?; for doc in document_iterator { segment_writer.add_document(doc, &schema)?; // There is two possible conditions to close the segment. 
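As a rough usage sketch of the path the surrounding hunks touch (not part of the patch itself): a document carrying a hierarchical facet is added through the public `IndexWriter`, which hands it to `SegmentWriter::add_document`, where the facet is expanded into its ancestor paths by the `FacetTokenizer`. The field names, the heap budget, and the `add_facet_field` builder call below are assumptions about the crate's public API around this revision, not something this patch defines.

#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Declare a text field and a hierarchical facet field
    // (`add_facet_field` is assumed to be the builder method here).
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let category = schema_builder.add_facet_field("category");
    let index = Index::create_in_ram(schema_builder.build());

    // Illustrative 40 MB indexing heap; any reasonable budget works.
    let mut index_writer = index.writer(40_000_000).unwrap();

    // The facet is indexed together with its ancestor paths,
    // so it can later be counted by a FacetCollector.
    let mut doc = doc!(title => "of mice and men");
    doc.add_facet(category, "/fiction/classic");
    index_writer.add_document(doc);
    index_writer.commit().unwrap();
}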
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 3c6134e9f..d3fcbd736 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -142,27 +142,24 @@ impl<'a> SegmentWriter<'a> { FieldType::HierarchicalFacet => { let facets: Vec<&[u8]> = field_values .iter() - .flat_map(|field_value| - match *field_value.value() { - Value::Facet(ref facet) => Some(facet.encoded_bytes()), - _ => { - panic!("Expected hierarchical facet"); - } - }) + .flat_map(|field_value| match *field_value.value() { + Value::Facet(ref facet) => Some(facet.encoded_bytes()), + _ => { + panic!("Expected hierarchical facet"); + } + }) .collect(); let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); for facet_bytes in facets { let mut unordered_term_id_opt = None; let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) }; - FacetTokenizer - .token_stream(fake_str) - .process(&mut |token| { - term.set_text(&token.text); - let unordered_term_id = - self.multifield_postings.subscribe(doc_id, &term); - unordered_term_id_opt = Some(unordered_term_id); - }); + FacetTokenizer.token_stream(fake_str).process(&mut |token| { + term.set_text(&token.text); + let unordered_term_id = + self.multifield_postings.subscribe(doc_id, &term); + unordered_term_id_opt = Some(unordered_term_id); + }); if let Some(unordered_term_id) = unordered_term_id_opt { self.fast_field_writers From 1fc7afa90a69d47d836ea22bae2da785fb75dad7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 5 Feb 2018 09:33:25 +0900 Subject: [PATCH 5/7] Issue/range query (#242) BitSet and RangeQuery --- .gitignore | 3 +- CHANGELOG.md | 10 + src/collector/count_collector.rs | 22 +- src/common/bitset.rs | 396 ++++++++++++++++++++++ src/common/mod.rs | 3 + src/lib.rs | 107 ++++++ src/postings/docset.rs | 35 +- src/postings/intersection.rs | 3 +- src/postings/postings.rs | 24 +- src/postings/segment_postings.rs | 20 +- src/postings/vec_postings.rs | 4 +- src/query/all_query.rs | 4 +- src/query/bitset/mod.rs | 268 +++++++++++++++ src/query/boolean_query/boolean_query.rs | 10 +- src/query/boolean_query/boolean_scorer.rs | 2 +- src/query/boolean_query/boolean_weight.rs | 38 ++- src/query/mod.rs | 6 + src/query/phrase_query/phrase_scorer.rs | 4 +- src/query/range_query.rs | 292 ++++++++++++++++ src/query/scorer.rs | 64 +++- src/query/term_query/term_scorer.rs | 2 +- src/termdict/fstdict/termdict.rs | 12 +- src/termdict/mod.rs | 8 +- 23 files changed, 1247 insertions(+), 90 deletions(-) create mode 100644 src/common/bitset.rs create mode 100644 src/query/bitset/mod.rs create mode 100644 src/query/range_query.rs diff --git a/.gitignore b/.gitignore index e2a04b58a..b6f5cc5b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.swp target target/debug .vscode @@ -8,4 +9,4 @@ benchmark cpp/simdcomp/bitpackingbenchmark *.bk .idea -trace.dat \ No newline at end of file +trace.dat diff --git a/CHANGELOG.md b/CHANGELOG.md index b5f468f2d..5f425cbc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +Tantivy 0.5 +========================== +- Faceting +- RangeQuery +- Configurable tokenization pipeline +- Allowing super large indexes + - 64 bits file address + - Smarter encoding of the `TermInfo` objects + + Tantivy 0.4.3 ========================== diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 6707e687e..15363e33a 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -33,18 +33,16 @@ impl Collector for CountCollector { 
#[cfg(test)] mod tests { - use super::*; - use test::Bencher; - use collector::Collector; + use collector::{Collector, CountCollector}; - #[bench] - fn build_collector(b: &mut Bencher) { - b.iter(|| { - let mut count_collector = CountCollector::default(); - for doc in 0..1_000_000 { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + #[test] + fn test_count_collector() { + let mut count_collector = CountCollector::default(); + assert_eq!(count_collector.count(), 0); + count_collector.collect(0u32, 1f32); + assert_eq!(count_collector.count(), 1); + assert_eq!(count_collector.count(), 1); + count_collector.collect(1u32, 1f32); + assert_eq!(count_collector.count(), 2); } } diff --git a/src/common/bitset.rs b/src/common/bitset.rs new file mode 100644 index 000000000..fb01e961e --- /dev/null +++ b/src/common/bitset.rs @@ -0,0 +1,396 @@ +use std::fmt; + +#[derive(Clone, Copy, Eq, PartialEq)] +pub(crate) struct TinySet(u64); + +impl fmt::Debug for TinySet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.into_iter().collect::>().fmt(f) + } +} + +pub struct TinySetIterator(TinySet); +impl Iterator for TinySetIterator { + type Item = u32; + + fn next(&mut self) -> Option { + self.0.pop_lowest() + } +} + +impl IntoIterator for TinySet { + type Item = u32; + type IntoIter = TinySetIterator; + fn into_iter(self) -> Self::IntoIter { + TinySetIterator(self) + } +} + +impl TinySet { + + /// Returns an empty `TinySet`. + pub fn empty() -> TinySet { + TinySet(0u64) + } + + /// Returns the complement of the set in `[0, 64[`. + fn complement(&self) -> TinySet { + TinySet(!self.0) + } + + + /// Returns true iff the `TinySet` contains the element `el`. + pub fn contains(&self, el: u32) -> bool { + !self.intersect(TinySet::singleton(el)).is_empty() + } + + /// Returns the intersection of `self` and `other` + pub fn intersect(&self, other: TinySet) -> TinySet { + TinySet(self.0 & other.0) + } + + /// Creates a new `TinySet` containing only one element + /// within `[0; 64[` + #[inline(always)] + pub fn singleton(el: u32) -> TinySet { + TinySet(1u64 << (el as u64)) + } + + /// Insert a new element within [0..64[ + #[inline(always)] + pub fn insert(self, el: u32) -> TinySet { + self.union(TinySet::singleton(el)) + } + + /// Insert a new element within [0..64[ + #[inline(always)] + pub fn insert_mut(&mut self, el: u32) -> bool { + let old = *self; + *self = old.insert(el); + old != *self + } + + /// Returns the union of two tinysets + #[inline(always)] + pub fn union(self, other: TinySet) -> TinySet { + TinySet(self.0 | other.0) + } + + /// Returns true iff the `TinySet` is empty. + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.0 == 0u64 + } + + /// Returns the lowest element in the `TinySet` + /// and removes it. + #[inline(always)] + pub fn pop_lowest(&mut self) -> Option { + if let Some(lowest) = self.lowest() { + self.0 ^= TinySet::singleton(lowest).0; + Some(lowest) + } else { + None + } + } + + /// Returns the lowest element in the `TinySet` + /// (or None if the set is empty). + #[inline(always)] + pub fn lowest(&mut self) -> Option { + if self.is_empty() { + None + } else { + let least_significant_bit = self.0.trailing_zeros() as u32; + Some(least_significant_bit) + } + } + + /// Returns a `TinySet` than contains all values up + /// to limit excluded. + /// + /// The limit is assumed to be strictly lower than 64. 
+ pub fn range_lower(upper_bound: u32) -> TinySet { + TinySet((1u64 << ((upper_bound % 64u32) as u64)) - 1u64) + } + + /// Returns a `TinySet` that contains all values greater + /// or equal to the given limit, included. (and up to 63) + /// + /// The limit is assumed to be strictly lower than 64. + pub fn range_greater_or_equal(from_included: u32) -> TinySet { + TinySet::range_lower(from_included).complement() + } +} + +#[derive(Clone)] +pub struct BitSet { + tinysets: Box<[TinySet]>, + len: usize, //< Technically it should be u32, but we + // count multiple inserts. + // `usize` guards us from overflow. + max_value: u32, +} + +fn num_buckets(max_val: u32) -> u32 { + (max_val + 63u32) / 64u32 +} + +impl BitSet { + + /// Create a new `BitSet` that may contain elements + /// within `[0, max_val[`. + pub fn with_max_value(max_value: u32) -> BitSet { + let num_buckets = num_buckets(max_value); + let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice(); + BitSet { + tinysets: tinybisets, + len: 0, + max_value + } + } + + /// Removes all elements from the `BitSet`. + pub fn clear(&mut self) { + for tinyset in self.tinysets.iter_mut() { + *tinyset = TinySet::empty(); + } + } + + /// Returns the number of elements in the `BitSet`. + pub fn len(&self) -> usize { + self.len + } + + /// Inserts an element in the `BitSet` + pub fn insert(&mut self, el: u32) { + // we do not check saturated els. + let higher = el / 64u32; + let lower = el % 64u32; + self.len += + if self.tinysets[higher as usize].insert_mut(lower) { + 1 + } else { + 0 + }; + } + + /// Returns true iff the elements is in the `BitSet`. + pub fn contains(&self, el: u32) -> bool { + self.tinyset(el / 64u32) + .contains(el % 64) + } + + /// Returns the first non-empty `TinySet` associated to a bucket lower + /// or greater than bucket. + /// + /// Reminder: the tiny set with the bucket `bucket`, represents the + /// elements from `bucket * 64` to `(bucket+1) * 64`. + pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option { + self.tinysets[bucket as usize..] + .iter() + .cloned() + .position(|tinyset| !tinyset.is_empty()) + .map(|delta_bucket| bucket + delta_bucket as u32) + } + + pub fn max_value(&self) -> u32 { + self.max_value + } + + /// Returns the tiny bitset representing the + /// the set restricted to the number range from + /// `bucket * 64` to `(bucket + 1) * 64`. 
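
The bucket arithmetic behind `BitSet`, and the range masks defined above, boil down to plain integer operations. The standalone snippet below is illustrative only and is not part of the patch.

```rust
// Illustrative only: an element `el` is stored in bucket `el / 64`,
// at bit `el % 64` of that bucket's 64-bit word.
fn main() {
    let el: u32 = 131;
    assert_eq!((el / 64, el % 64), (2, 3));

    // `range_lower(n)` is a mask of the `n` lowest bits: (1 << n) - 1.
    let n = 3u32;
    let lower: u64 = (1u64 << n) - 1;   // {0, 1, 2}
    // `range_greater_or_equal(n)` is simply its complement: {3, ..., 63}.
    let greater_or_equal: u64 = !lower;
    assert_eq!(lower.count_ones(), 3);
    assert_eq!(greater_or_equal.count_ones(), 61);
}
```
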
+ pub(crate) fn tinyset(&self, bucket: u32) -> TinySet { + self.tinysets[bucket as usize] + } +} + + +#[cfg(test)] +mod tests { + + extern crate test; + use tests; + use std::collections::HashSet; + use super::BitSet; + use super::TinySet; + use tests::generate_nonunique_unsorted; + use std::collections::BTreeSet; + use query::BitSetDocSet; + use DocSet; + + #[test] + fn test_tiny_set() { + assert!(TinySet::empty().is_empty()); + { + let mut u = TinySet::empty().insert(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()) + } + { + let mut u = TinySet::empty() + .insert(1u32) + .insert(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()) + } + { + let mut u = TinySet::empty().insert(2u32); + assert_eq!(u.pop_lowest(), Some(2u32)); + u.insert_mut(1u32); + assert_eq!(u.pop_lowest(), Some(1u32)); + assert!(u.pop_lowest().is_none()); + } + { + let mut u = TinySet::empty().insert(63u32); + assert_eq!(u.pop_lowest(), Some(63u32)); + assert!(u.pop_lowest().is_none()); + } + } + + #[test] + fn test_bitset() { + let test_against_hashset = |els: &[u32], max_value: u32| { + let mut hashset: HashSet = HashSet::new(); + let mut bitset = BitSet::with_max_value(max_value); + for &el in els { + assert!(el < max_value); + hashset.insert(el); + bitset.insert(el); + } + for el in 0..max_value { + assert_eq!(hashset.contains(&el), bitset.contains(el)); + } + assert_eq!(bitset.max_value(), max_value); + }; + + test_against_hashset(&[], 0); + test_against_hashset(&[], 1); + test_against_hashset(&[0u32], 1); + test_against_hashset(&[0u32], 100); + test_against_hashset(&[1u32, 2u32], 4); + test_against_hashset(&[99u32], 100); + test_against_hashset(&[63u32], 64); + test_against_hashset(&[62u32, 63u32], 64); + } + + + #[test] + fn test_bitset_large() { + let arr = generate_nonunique_unsorted(1_000_000, 50_000); + let mut btreeset: BTreeSet = BTreeSet::new(); + let mut bitset = BitSet::with_max_value(1_000_000); + for el in arr { + btreeset.insert(el); + bitset.insert(el); + } + for i in 0..1_000_000 { + assert_eq!(btreeset.contains(&i), bitset.contains(i)); + } + assert_eq!(btreeset.len(), bitset.len()); + let mut bitset_docset = BitSetDocSet::from(bitset); + for el in btreeset.into_iter() { + bitset_docset.advance(); + assert_eq!(bitset_docset.doc(), el); + } + assert!(!bitset_docset.advance()); + } + + #[test] + fn test_bitset_num_buckets() { + use super::num_buckets; + assert_eq!(num_buckets(0u32), 0); + assert_eq!(num_buckets(1u32), 1); + assert_eq!(num_buckets(64u32), 1); + assert_eq!(num_buckets(65u32), 2); + assert_eq!(num_buckets(128u32), 2); + assert_eq!(num_buckets(129u32), 3); + } + + #[test] + fn test_tinyset_range() { + assert_eq!(TinySet::range_lower(3).into_iter().collect::>(), [0, 1, 2]); + assert!(TinySet::range_lower(0).is_empty()); + assert_eq!( + TinySet::range_lower(63).into_iter().collect::>(), + (0u32..63u32).collect::>() + ); + assert_eq!(TinySet::range_lower(1).into_iter().collect::>(), [0]); + assert_eq!(TinySet::range_lower(2).into_iter().collect::>(), [0, 1]); + assert_eq!( + TinySet::range_greater_or_equal(3).into_iter().collect::>(), + (3u32..64u32).collect::>() + ); + } + + #[test] + fn test_bitset_len() { + let mut bitset = BitSet::with_max_value(1_000); + assert_eq!(bitset.len(), 0); + bitset.insert(3u32); + assert_eq!(bitset.len(), 1); + bitset.insert(103u32); + assert_eq!(bitset.len(), 2); + bitset.insert(3u32); + assert_eq!(bitset.len(), 2); + bitset.insert(103u32); + assert_eq!(bitset.len(), 2); + 
bitset.insert(104u32); + assert_eq!(bitset.len(), 3); + } + + #[test] + fn test_bitset_clear() { + let mut bitset = BitSet::with_max_value(1_000); + let els = tests::sample(1_000, 0.01f32); + for &el in &els { + bitset.insert(el); + } + assert!(els.iter().all(|el| bitset.contains(*el))); + bitset.clear(); + for el in 0u32..1000u32 { + assert!(!bitset.contains(el)); + } + } + + #[bench] + fn bench_tinyset_pop(b: &mut test::Bencher) { + b.iter(|| { + test::black_box(TinySet::singleton(31u32)) + .pop_lowest() + }); + } + + #[bench] + fn bench_tinyset_sum(b: &mut test::Bencher) { + let tiny_set = TinySet::empty() + .insert(10u32) + .insert(14u32) + .insert(21u32); + b.iter(|| { + assert_eq!( + test::black_box(tiny_set).into_iter().sum::(), + 45u32); + }); + } + + #[bench] + fn bench_tinyarr_sum(b: &mut test::Bencher) { + let v = [10u32, 14u32, 21u32] ; + b.iter(|| { + test::black_box(v) + .iter() + .cloned() + .sum::() + }); + } + + #[bench] + fn bench_bitset_initialize(b: &mut test::Bencher) { + b.iter(|| { + BitSet::with_max_value(1_000_000) + }); + } +} + diff --git a/src/common/mod.rs b/src/common/mod.rs index 39c86aa3f..aceea844d 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -4,6 +4,7 @@ mod vint; mod counting_writer; mod composite_file; pub mod bitpacker; +mod bitset; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub use self::serialize::BinarySerializable; @@ -12,6 +13,8 @@ pub use self::timer::TimerTree; pub use self::timer::OpenTimer; pub use self::vint::VInt; pub use self::counting_writer::CountingWriter; +pub use self::bitset::BitSet; +pub(crate) use self::bitset::TinySet; use std::io; diff --git a/src/lib.rs b/src/lib.rs index ca6c0dfc1..ec38e0936 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ #![feature(box_syntax)] #![feature(optin_builtin_traits)] #![feature(conservative_impl_trait)] +#![feature(collections_range)] #![feature(integer_atomics)] #![cfg_attr(test, feature(test))] #![cfg_attr(test, feature(iterator_step_by))] @@ -17,11 +18,106 @@ //! Tantivy is a search engine library. //! Think `Lucene`, but in Rust. //! +//! ```rust + +//! # extern crate tempdir; +//! # +//! #[macro_use] +//! extern crate tantivy; +//! +//! // ... +//! +//! # use std::path::Path; +//! # use tempdir::TempDir; +//! # use tantivy::Index; +//! # use tantivy::schema::*; +//! # use tantivy::collector::TopCollector; +//! # use tantivy::query::QueryParser; +//! # +//! # fn main() { +//! # // Let's create a temporary directory for the +//! # // sake of this example +//! # if let Ok(dir) = TempDir::new("tantivy_example_dir") { +//! # run_example(dir.path()).unwrap(); +//! # dir.close().unwrap(); +//! # } +//! # } +//! # +//! # fn run_example(index_path: &Path) -> tantivy::Result<()> { +//! // First we need to define a schema ... +//! +//! // `TEXT` means the field should be tokenized and indexed, +//! // along with its term frequency and term positions. +//! // +//! // `STORED` means that the field will also be saved +//! // in a compressed, row-oriented key-value store. +//! // This store is useful to reconstruct the +//! // documents that were selected during the search phase. +//! let mut schema_builder = SchemaBuilder::default(); +//! let title = schema_builder.add_text_field("title", TEXT | STORED); +//! let body = schema_builder.add_text_field("body", TEXT); +//! let schema = schema_builder.build(); +//! +//! // Indexing documents +//! +//! let index = Index::create(index_path, schema.clone())?; +//! +//! 
// Here we use a buffer of 100MB that will be split +//! // between indexing threads. +//! let mut index_writer = index.writer(100_000_000)?; +//! +//! // Let's index one documents! +//! index_writer.add_document(doc!( +//! title => "The Old Man and the Sea", +//! body => "He was an old man who fished alone in a skiff in \ +//! the Gulf Stream and he had gone eighty-four days \ +//! now without taking a fish." +//! )); +//! +//! // We need to call .commit() explicitly to force the +//! // index_writer to finish processing the documents in the queue, +//! // flush the current index to the disk, and advertise +//! // the existence of new documents. +//! index_writer.commit()?; +//! +//! // # Searching +//! +//! index.load_searchers()?; +//! +//! let searcher = index.searcher(); +//! +//! let query_parser = QueryParser::for_index(&index, vec![title, body]); +//! +//! // QueryParser may fail if the query is not in the right +//! // format. For user facing applications, this can be a problem. +//! // A ticket has been opened regarding this problem. +//! let query = query_parser.parse_query("sea whale")?; +//! +//! let mut top_collector = TopCollector::with_limit(10); +//! searcher.search(&*query, &mut top_collector)?; +//! +//! // Our top collector now contains the 10 +//! // most relevant doc ids... +//! let doc_addresses = top_collector.docs(); +//! for doc_address in doc_addresses { +//! let retrieved_doc = searcher.doc(&doc_address)?; +//! println!("{}", schema.to_json(&retrieved_doc)); +//! } +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! +//! //! A good place for you to get started is to check out //! the example code ( //! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / //! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) + + + #[macro_use] extern crate lazy_static; @@ -190,6 +286,7 @@ mod tests { use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader}; use Postings; use rand::{Rng, SeedableRng, XorShiftRng}; + use rand::distributions::{Range, IndependentSample}; fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { let seed: &[u32; 4] = &[1, 2, 3, seed_val]; @@ -200,6 +297,16 @@ mod tests { .collect() } + pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec { + let seed: &[u32; 4] = &[1, 2, 3, 4]; + let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); + let between = Range::new(0u32, max_value); + (0..n_elems) + .map(|_| between.ind_sample(&mut rng)) + .collect::>() + } + + pub fn generate_array(n: usize, ratio: f32) -> Vec { generate_array_with_seed(n, ratio, 4) } diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 65c41f76b..c030b092b 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -2,6 +2,7 @@ use DocId; use std::borrow::Borrow; use std::borrow::BorrowMut; use std::cmp::Ordering; +use common::BitSet; /// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`. #[derive(PartialEq, Eq, Debug)] @@ -92,7 +93,14 @@ pub trait DocSet { /// Returns a best-effort hint of the /// length of the docset. - fn size_hint(&self) -> usize; + fn size_hint(&self) -> u32; + + /// Appends all docs to a `bitset`. 
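
The new default method defined right after simply drains the doc set into the bit set. Here is a self-contained sketch of the same pattern; the function and parameter names are invented for illustration, the patch's real trait method follows below.

```rust
// Illustrative only: drain a cursor-style doc iterator into per-64-doc words,
// the same pattern as the default `append_to_bitset` added below.
fn append_all(mut next_doc: impl FnMut() -> Option<u32>, words: &mut Vec<u64>) {
    while let Some(doc) = next_doc() {
        let (bucket, bit) = ((doc / 64) as usize, doc % 64);
        if bucket >= words.len() {
            words.resize(bucket + 1, 0u64);
        }
        words[bucket] |= 1u64 << bit;
    }
}

fn main() {
    let docs = [3u32, 64, 65];
    let mut it = docs.iter().copied();
    let mut words = Vec::new();
    append_all(|| it.next(), &mut words);
    assert_eq!(words, vec![1u64 << 3, 0b11]);
}
```
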
+ fn append_to_bitset(&mut self, bitset: &mut BitSet) { + while self.advance() { + bitset.insert(self.doc()); + } + } } impl DocSet for Box { @@ -111,30 +119,13 @@ impl DocSet for Box { unboxed.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { let unboxed: &TDocSet = self.borrow(); unboxed.size_hint() } -} -impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { - fn advance(&mut self) -> bool { - let unref: &mut TDocSet = *self; - unref.advance() - } - - fn skip_next(&mut self, target: DocId) -> SkipResult { - let unref: &mut TDocSet = *self; - unref.skip_next(target) - } - - fn doc(&self) -> DocId { - let unref: &TDocSet = *self; - unref.doc() - } - - fn size_hint(&self) -> usize { - let unref: &TDocSet = *self; - unref.size_hint() + fn append_to_bitset(&mut self, bitset: &mut BitSet) { + let unboxed: &mut TDocSet = self.borrow_mut(); + unboxed.append_to_bitset(bitset); } } diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index b105405f6..5234f51c0 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -31,7 +31,8 @@ impl IntersectionDocSet { } impl DocSet for IntersectionDocSet { - fn size_hint(&self) -> usize { + /// Returns the minimum `.size_hint()` of the intersected docsets. + fn size_hint(&self) -> u32 { self.docsets .iter() .map(|docset| docset.size_hint()) diff --git a/src/postings/postings.rs b/src/postings/postings.rs index 52f16198a..ac5516e2e 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -30,15 +30,15 @@ impl Postings for Box { unboxed.positions() } } - -impl<'a, TPostings: Postings> Postings for &'a mut TPostings { - fn term_freq(&self) -> u32 { - let unref: &TPostings = *self; - unref.term_freq() - } - - fn positions(&self) -> &[u32] { - let unref: &TPostings = *self; - unref.positions() - } -} +// +//impl<'a, TPostings: Postings> Postings for &'a mut TPostings { +// fn term_freq(&self) -> u32 { +// let unref: &TPostings = *self; +// unref.term_freq() +// } +// +// fn positions(&self) -> &[u32] { +// let unref: &TPostings = *self; +// unref.positions() +// } +//} diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 682fc82f6..9fbee7efa 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,5 +1,6 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BLOCK_SIZE}; use DocId; +use common::BitSet; use postings::{DocSet, HasLen, Postings, SkipResult}; use std::cmp; use fst::Streamer; @@ -235,8 +236,8 @@ impl DocSet for SegmentPostings { } } - fn size_hint(&self) -> usize { - self.len() + fn size_hint(&self) -> u32 { + self.len() as u32 } /// Return the current document's `DocId`. @@ -249,6 +250,21 @@ impl DocSet for SegmentPostings { ); docs[self.cur] } + + fn append_to_bitset(&mut self, bitset: &mut BitSet) { + // finish the current block + if self.advance() { + for &doc in &self.block_cursor.docs()[self.cur..] { + bitset.insert(doc); + } + // ... iterate through the remaining blocks. 
+ while self.block_cursor.advance() { + for &doc in self.block_cursor.docs() { + bitset.insert(doc); + } + } + } + } } impl HasLen for SegmentPostings { diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index f6c5ae8d9..51c402cd6 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -35,8 +35,8 @@ impl DocSet for VecPostings { self.doc_ids[self.cursor.0] } - fn size_hint(&self) -> usize { - self.len() + fn size_hint(&self) -> u32 { + self.len() as u32 } } diff --git a/src/query/all_query.rs b/src/query/all_query.rs index da36bf38b..4d2c3eff2 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -59,8 +59,8 @@ impl DocSet for AllScorer { self.doc } - fn size_hint(&self) -> usize { - self.max_doc as usize + fn size_hint(&self) -> u32 { + self.max_doc } } diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs new file mode 100644 index 000000000..cb8ecde7b --- /dev/null +++ b/src/query/bitset/mod.rs @@ -0,0 +1,268 @@ +use common::{BitSet, TinySet}; +use DocId; +use postings::DocSet; +use postings::SkipResult; +use std::cmp::Ordering; + +/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`. +/// +/// # Implementation detail +/// +/// Skipping is relatively fast here as we can directly point to the +/// right tiny bitset bucket. +/// +/// TODO: Consider implementing a `BitTreeSet` in order to advance faster +/// when the bitset is sparse +pub struct BitSetDocSet { + docs: BitSet, + cursor_bucket: u32, //< index associated to the current tiny bitset + cursor_tinybitset: TinySet, + doc: u32, +} + +impl BitSetDocSet { + fn go_to_bucket(&mut self, bucket_addr: u32) { + self.cursor_bucket = bucket_addr; + self.cursor_tinybitset = self.docs.tinyset(bucket_addr); + } +} + +impl From for BitSetDocSet { + fn from(docs: BitSet) -> BitSetDocSet { + let first_tiny_bitset = if docs.max_value() == 0 { + TinySet::empty() + } else { + docs.tinyset(0) + }; + BitSetDocSet { + docs, + cursor_bucket: 0, + cursor_tinybitset: first_tiny_bitset, + doc: 0u32, + } + } +} + +impl DocSet for BitSetDocSet { + fn advance(&mut self) -> bool { + if let Some(lower) = self.cursor_tinybitset.pop_lowest() { + self.doc = (self.cursor_bucket as u32 * 64u32) | lower; + return true; + } + if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) { + self.go_to_bucket(cursor_bucket); + let lower = self.cursor_tinybitset.pop_lowest().unwrap(); + self.doc = (cursor_bucket * 64u32) | lower; + true + } else { + false + } +} + + fn skip_next(&mut self, target: DocId) -> SkipResult { + // skip is required to advance. + if !self.advance() { + return SkipResult::End; + } + let target_bucket = target / 64u32; + + // Mask for all of the bits greater or equal + // to our target document. 
+ match target_bucket.cmp(&self.cursor_bucket) { + Ordering::Greater => { + self.go_to_bucket(target_bucket); + let greater_filter: TinySet = TinySet::range_greater_or_equal(target); + self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter); + if !self.advance() { + SkipResult::End + } else { + if self.doc() == target { + SkipResult::Reached + } else { + debug_assert!(self.doc() > target); + SkipResult::OverStep + } + } + } + Ordering::Equal => loop { + match self.doc().cmp(&target) { + Ordering::Less => { + if !self.advance() { + return SkipResult::End; + } + } + Ordering::Equal => { + return SkipResult::Reached; + } + Ordering::Greater => { + debug_assert!(self.doc() > target); + return SkipResult::OverStep; + } + } + }, + Ordering::Less => { + debug_assert!(self.doc() > target); + SkipResult::OverStep + } + } + } + + /// Returns the current document + fn doc(&self) -> DocId { + self.doc + } + + /// Advances the cursor to the next document + /// None is returned if the iterator has `DocSet` + /// has already been entirely consumed. + fn next(&mut self) -> Option { + if self.advance() { + Some(self.doc()) + } else { + None + } + } + + /// Returns half of the `max_doc` + /// This is quite a terrible heuristic, + /// but we don't have access to any better + /// value. + fn size_hint(&self) -> u32 { + self.docs.len() as u32 + } +} + +#[cfg(test)] +mod tests { + use DocId; + use common::BitSet; + use postings::{DocSet, SkipResult}; + use super::BitSetDocSet; + extern crate test; + + fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet { + let mut docset = BitSet::with_max_value(max_doc); + for &doc in docs { + docset.insert(doc); + } + BitSetDocSet::from(docset) + } + + fn test_go_through_sequential(docs: &[DocId]) { + let mut docset = create_docbitset(docs, 1_000u32); + for &doc in docs { + assert!(docset.advance()); + assert_eq!(doc, docset.doc()); + } + assert!(!docset.advance()); + assert!(!docset.advance()); + } + + #[test] + fn test_docbitset_sequential() { + test_go_through_sequential(&[]); + test_go_through_sequential(&[1, 2, 3]); + test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]); + test_go_through_sequential(&[63, 64, 65]); + test_go_through_sequential(&[1, 2, 3, 4, 95, 96, 97, 98, 99]); + } + + #[test] + fn test_docbitset_skip() { + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); + assert_eq!(docset.skip_next(7), SkipResult::Reached); + assert_eq!(docset.doc(), 7); + assert!(docset.advance(), 7); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000); + assert_eq!(docset.skip_next(3), SkipResult::OverStep); + assert_eq!(docset.doc(), 5); + assert!(docset.advance()); + } + { + let mut docset = create_docbitset(&[5112], 10_000); + assert_eq!(docset.skip_next(5112), SkipResult::Reached); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[5112], 10_000); + assert_eq!(docset.skip_next(5113), SkipResult::End); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[5112], 10_000); + assert_eq!(docset.skip_next(5111), SkipResult::OverStep); + assert_eq!(docset.doc(), 5112); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000); + assert_eq!(docset.skip_next(5112), SkipResult::Reached); + assert_eq!(docset.doc(), 5112); + assert!(docset.advance()); + assert_eq!(docset.doc(), 5500); + assert!(docset.advance()); + 
assert_eq!(docset.doc(), 6666); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000); + assert_eq!(docset.skip_next(5111), SkipResult::OverStep); + assert_eq!(docset.doc(), 5112); + assert!(docset.advance()); + assert_eq!(docset.doc(), 5500); + assert!(docset.advance()); + assert_eq!(docset.doc(), 6666); + assert!(!docset.advance()); + } + { + let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000); + assert_eq!(docset.skip_next(5111), SkipResult::OverStep); + assert_eq!(docset.doc(), 5112); + assert!(docset.advance()); + assert_eq!(docset.doc(), 5513); + assert!(docset.advance()); + assert_eq!(docset.doc(), 6666); + assert!(!docset.advance()); + } + } + + + #[bench] + fn bench_bitset_1pct_insert(b: &mut test::Bencher) { + use tests; + let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); + b.iter(|| { + let mut bitset = BitSet::with_max_value(1_000_000); + for el in els.iter().cloned() { bitset.insert(el); } + }); + } + + #[bench] + fn bench_bitset_1pct_clone(b: &mut test::Bencher) { + use tests; + let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); + let mut bitset = BitSet::with_max_value(1_000_000); + for el in els { bitset.insert(el); } + b.iter(|| { bitset.clone() }); + } + + #[bench] + fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) { + use tests; + use DocSet; + let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); + let mut bitset = BitSet::with_max_value(1_000_000); + for el in els { bitset.insert(el); } + b.iter(|| { + let mut docset = BitSetDocSet::from(bitset.clone()); + while docset.advance() {} + }); + } +} + diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index af4418d4e..112c2f519 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -8,7 +8,6 @@ use schema::Term; use query::TermQuery; use schema::IndexRecordOption; use query::Occur; -use query::OccurFilter; /// The boolean query combines a set of queries /// @@ -39,14 +38,9 @@ impl Query for BooleanQuery { fn weight(&self, searcher: &Searcher) -> Result> { let sub_weights = self.subqueries .iter() - .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) + .map(|&(ref occur, ref subquery)| Ok((*occur, subquery.weight(searcher)?))) .collect::>()?; - let occurs: Vec = self.subqueries - .iter() - .map(|&(ref occur, ref _subquery)| *occur) - .collect(); - let filter = OccurFilter::new(&occurs); - Ok(box BooleanWeight::new(sub_weights, filter)) + Ok(box BooleanWeight::new(sub_weights)) } } diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 5bc574c68..12228850b 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -90,7 +90,7 @@ impl BooleanScorer { } impl DocSet for BooleanScorer { - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { // TODO fix this. it should be the min // of the MUST scorer // and the max of the SHOULD scorers. 
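
The tests above exercise the three possible outcomes of `skip_next`. For reference, this is the contract they check, demonstrated on a plain sorted slice rather than a real `DocSet`; the helper below is illustrative only and not part of the patch.

```rust
// Illustrative only: the Reached / OverStep / End contract of `skip_next`,
// replayed against a plain sorted slice with an explicit cursor.
#[derive(Debug, PartialEq)]
enum Skip { Reached, OverStep, End }

fn skip_next(sorted_docs: &[u32], cursor: &mut usize, target: u32) -> Skip {
    while *cursor < sorted_docs.len() {
        let doc = sorted_docs[*cursor];
        if doc >= target {
            return if doc == target { Skip::Reached } else { Skip::OverStep };
        }
        *cursor += 1;
    }
    Skip::End
}

fn main() {
    let docs = [1u32, 5, 6, 7, 5112];
    let mut cursor = 0;
    assert_eq!(skip_next(&docs, &mut cursor, 7), Skip::Reached);     // lands exactly on 7
    assert_eq!(skip_next(&docs, &mut cursor, 5000), Skip::OverStep); // lands past target, on 5112
    assert_eq!(skip_next(&docs, &mut cursor, 9999), Skip::End);      // docset exhausted
}
```
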
diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index ef5d36374..76f9e8a2e 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,31 +1,43 @@ use query::Weight; use core::SegmentReader; +use query::EmptyScorer; use query::Scorer; use super::BooleanScorer; use query::OccurFilter; +use query::Occur; use Result; pub struct BooleanWeight { - weights: Vec>, - occur_filter: OccurFilter, + weights: Vec<(Occur, Box)>, } impl BooleanWeight { - pub fn new(weights: Vec>, occur_filter: OccurFilter) -> BooleanWeight { - BooleanWeight { - weights, - occur_filter, - } + pub fn new(weights: Vec<(Occur, Box)>) -> BooleanWeight { + BooleanWeight { weights } } } impl Weight for BooleanWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { - let sub_scorers: Vec> = self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect::>()?; - let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); - Ok(box boolean_scorer) + if self.weights.is_empty() { + Ok(box EmptyScorer) + } else if self.weights.len() == 1 { + let &(occur, ref weight) = &self.weights[0]; + if occur == Occur::MustNot { + Ok(box EmptyScorer) + } else { + weight.scorer(reader) + } + } else { + let sub_scorers: Vec> = self.weights + .iter() + .map(|&(_, ref weight)| weight) + .map(|weight| weight.scorer(reader)) + .collect::>()?; + let occurs: Vec = self.weights.iter().map(|&(ref occur, _)| *occur).collect(); + let occur_filter = OccurFilter::new(&occurs); + let boolean_scorer = BooleanScorer::new(sub_scorers, occur_filter); + Ok(box boolean_scorer) + } } } diff --git a/src/query/mod.rs b/src/query/mod.rs index 7303baebb..7541daf4c 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -12,7 +12,10 @@ mod term_query; mod query_parser; mod phrase_query; mod all_query; +mod bitset; +mod range_query; +pub use self::bitset::BitSetDocSet; pub use self::boolean_query::BooleanQuery; pub use self::occur_filter::OccurFilter; pub use self::occur::Occur; @@ -24,4 +27,7 @@ pub use self::scorer::EmptyScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; + pub use self::all_query::{AllQuery, AllScorer, AllWeight}; +pub use self::range_query::RangeQuery; +pub use self::scorer::ConstScorer; diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index f6775cb60..1c576cac0 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -35,7 +35,7 @@ impl DocSet for PostingsWithOffset { self.segment_postings.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.segment_postings.size_hint() } @@ -122,7 +122,7 @@ impl DocSet for PhraseScorer { self.intersection_docset.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.intersection_docset.size_hint() } } diff --git a/src/query/range_query.rs b/src/query/range_query.rs new file mode 100644 index 000000000..3b9e65994 --- /dev/null +++ b/src/query/range_query.rs @@ -0,0 +1,292 @@ +use schema::{Field, IndexRecordOption, Term}; +use query::{Query, Scorer, Weight}; +use termdict::{TermDictionary, TermStreamer, TermStreamerBuilder}; +use core::SegmentReader; +use common::BitSet; +use Result; +use std::any::Any; +use core::Searcher; +use query::BitSetDocSet; +use query::ConstScorer; +use std::collections::Bound; +use std::collections::range::RangeArgument; + + +fn map_boundVec >(bound: Bound, transform: 
&Transform) -> Bound> { + use self::Bound::*; + match bound { + Excluded(from_val) => Excluded(transform(from_val)), + Included(from_val) => Included(transform(from_val)), + Unbounded => Unbounded + } +} + + + +/// `RangeQuery` match all documents that have at least one term within a defined range. +/// +/// Matched document will all get a constant `Score` of one. +/// +/// # Implementation +/// +/// The current implement will iterate over the terms within the range +/// and append all of the document cross into a `BitSet`. +/// +/// # Example +/// +/// ```rust +/// +/// # #[macro_use] +/// # extern crate tantivy; +/// # use tantivy::Index; +/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED}; +/// # use tantivy::collector::CountCollector; +/// # use tantivy::query::Query; +/// # use tantivy::Result; +/// # use tantivy::query::RangeQuery; +/// # +/// # fn run() -> Result<()> { +/// # let mut schema_builder = SchemaBuilder::new(); +/// # let year_field = schema_builder.add_u64_field("year", INT_INDEXED); +/// # let schema = schema_builder.build(); +/// # +/// # let index = Index::create_in_ram(schema); +/// # { +/// # let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap(); +/// # for year in 1950u64..2017u64 { +/// # let num_docs_within_year = 10 + (year - 1950) * (year - 1950); +/// # for _ in 0..num_docs_within_year { +/// # index_writer.add_document(doc!(year_field => year)); +/// # } +/// # } +/// # index_writer.commit().unwrap(); +/// # } +/// # index.load_searchers()?; +/// let searcher = index.searcher(); +/// +/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); +/// +/// // ... or `1960..=1969` if inclusive range is enabled. +/// let mut count_collector = CountCollector::default(); +/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?; +/// +/// let num_60s_books = count_collector.count(); +/// +/// # assert_eq!(num_60s_books, 2285); +/// # Ok(()) +/// # } +/// # +/// # fn main() { +/// # run().unwrap() +/// # } +/// ``` +#[derive(Debug)] +pub struct RangeQuery { + field: Field, + left_bound: Bound>, + right_bound: Bound>, +} + +impl RangeQuery { + + /// Create a new `RangeQuery` over a `i64` field. + pub fn new_i64>(field: Field, range: TRangeArgument) -> RangeQuery { + let make_term_val = |val: &i64| { + Term::from_field_i64(field, *val).value_bytes().to_owned() + }; + RangeQuery { + field, + left_bound: map_bound(range.start(), &make_term_val), + right_bound: map_bound(range.end(), &make_term_val) + } + } + + /// Create a new `RangeQuery` over a `u64` field. + pub fn new_u64>(field: Field, range: TRangeArgument) -> RangeQuery { + let make_term_val = |val: &u64| { + Term::from_field_u64(field, *val).value_bytes().to_owned() + }; + RangeQuery { + field, + left_bound: map_bound(range.start(), &make_term_val), + right_bound: map_bound(range.end(), &make_term_val) + } + } + + /// Create a new `RangeQuery` over a `Str` field. 
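
The generic parameters of `map_bound` above were mangled when the patch was captured (the angle-bracketed parts are missing). Judging from its call sites in `new_i64` and `new_u64`, a plausible reconstruction is the following; treat it as an educated guess rather than the original source.

```rust
use std::ops::Bound; // the patch itself imports `Bound` from `std::collections`

// Plausible reconstruction (assumption): map a `Bound<&T>` to a
// `Bound<Vec<u8>>` by applying a byte-encoding transform to the value.
fn map_bound<TFrom, Transform: Fn(&TFrom) -> Vec<u8>>(
    bound: Bound<&TFrom>,
    transform: &Transform,
) -> Bound<Vec<u8>> {
    match bound {
        Bound::Excluded(from_val) => Bound::Excluded(transform(from_val)),
        Bound::Included(from_val) => Bound::Included(transform(from_val)),
        Bound::Unbounded => Bound::Unbounded,
    }
}

fn main() {
    let encode = |val: &u64| val.to_be_bytes().to_vec();
    assert_eq!(
        map_bound(Bound::Included(&3u64), &encode),
        Bound::Included(3u64.to_be_bytes().to_vec())
    );
}
```
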
+ pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>(field: Field, range: TRangeArgument) -> RangeQuery { + let make_term_val = |val: &&str| { + val.as_bytes().to_vec() + }; + RangeQuery { + field, + left_bound: map_bound(range.start(), &make_term_val), + right_bound: map_bound(range.end(), &make_term_val) + } + } +} + +impl Query for RangeQuery { + fn as_any(&self) -> &Any { + self + } + + fn weight(&self, _searcher: &Searcher) -> Result> { + Ok(box RangeWeight { + field: self.field, + left_bound: self.left_bound.clone(), + right_bound: self.right_bound.clone() + }) + } +} + +pub struct RangeWeight { + field: Field, + left_bound: Bound>, + right_bound: Bound>, +} + +impl RangeWeight { + fn term_range<'a, T>(&self, term_dict: &'a T) -> T::Streamer + where + T: TermDictionary<'a> + 'a, + { + use std::collections::Bound::*; + let mut term_stream_builder = term_dict.range(); + term_stream_builder = match &self.left_bound { + &Included(ref term_val) => term_stream_builder.ge(term_val), + &Excluded(ref term_val) => term_stream_builder.gt(term_val), + &Unbounded => term_stream_builder, + }; + term_stream_builder = match &self.right_bound { + &Included(ref term_val) => term_stream_builder.le(term_val), + &Excluded(ref term_val) => term_stream_builder.lt(term_val), + &Unbounded => term_stream_builder, + }; + term_stream_builder.into_stream() + } +} + +impl Weight for RangeWeight { + fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { + let max_doc = reader.max_doc(); + let mut doc_bitset = BitSet::with_max_value(max_doc); + + let inverted_index = reader.inverted_index(self.field); + let term_dict = inverted_index.terms(); + let mut term_range = self.term_range(term_dict); + while term_range.advance() { + let term_info = term_range.value(); + let mut block_segment_postings = inverted_index + .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); + while block_segment_postings.advance() { + for &doc in block_segment_postings.docs() { + doc_bitset.insert(doc); + } + } + } + let doc_bitset = BitSetDocSet::from(doc_bitset); + Ok(box ConstScorer::new(doc_bitset)) + } +} + +#[cfg(test)] +mod tests { + + use Index; + use schema::{Document, Field, SchemaBuilder, INT_INDEXED}; + use collector::CountCollector; + use std::collections::Bound; + use query::Query; + use Result; + use super::RangeQuery; + + #[test] + fn test_range_query_simple() { + + fn run() -> Result<()> { + let mut schema_builder = SchemaBuilder::new(); + let year_field= schema_builder.add_u64_field("year", INT_INDEXED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap(); + for year in 1950u64..2017u64 { + let num_docs_within_year = 10 + (year - 1950) * (year - 1950); + for _ in 0..num_docs_within_year { + index_writer.add_document(doc!(year_field => year)); + } + } + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + + let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64); + + // ... or `1960..=1969` if inclusive range is enabled. 
+ let mut count_collector = CountCollector::default(); + docs_in_the_sixties.search(&*searcher, &mut count_collector)?; + assert_eq!(count_collector.count(), 2285); + Ok(()) + } + + run().unwrap(); + + } + + #[test] + fn test_range_query() { + let int_field: Field; + let schema = { + let mut schema_builder = SchemaBuilder::new(); + int_field = schema_builder.add_i64_field("intfield", INT_INDEXED); + schema_builder.build() + }; + + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap(); + + for i in 1..100 { + let mut doc = Document::new(); + for j in 1..100 { + if i % j == 0 { + doc.add_i64(int_field, j as i64); + } + } + index_writer.add_document(doc); + } + + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let count_multiples = |range_query: RangeQuery| { + let mut count_collector = CountCollector::default(); + range_query + .search(&*searcher, &mut count_collector) + .unwrap(); + count_collector.count() + }; + + assert_eq!( + count_multiples(RangeQuery::new_i64(int_field, 10..11)), + 9 + ); + assert_eq!( + count_multiples(RangeQuery::new_i64(int_field, (Bound::Included(10), Bound::Included(11)) )), + 18 + ); + assert_eq!( + count_multiples(RangeQuery::new_i64(int_field, (Bound::Excluded(9), Bound::Included(10)))), + 9 + ); + assert_eq!( + count_multiples(RangeQuery::new_i64(int_field, 9..)), + 91 + ); + } + +} diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 170e6aa56..2cbeb001d 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -2,6 +2,8 @@ use DocSet; use DocId; use Score; use collector::Collector; +use postings::SkipResult; +use common::BitSet; use std::ops::{Deref, DerefMut}; /// Scored set of documents matching a query within a specific segment. @@ -49,7 +51,7 @@ impl DocSet for EmptyScorer { DocId::max_value() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { 0 } } @@ -59,3 +61,63 @@ impl Scorer for EmptyScorer { 0f32 } } + + +/// Wraps a `DocSet` and simply returns a constant `Scorer`. +/// The `ConstScorer` is useful if you have a `DocSet` where +/// you needed a scorer. +/// +/// The `ConstScorer`'s constant score can be set +/// by calling `.set_score(...)`. +pub struct ConstScorer { + docset: TDocSet, + score: Score, +} + +impl ConstScorer { + + /// Creates a new `ConstScorer`. + pub fn new(docset: TDocSet) -> ConstScorer { + ConstScorer { + docset, + score: 1f32, + } + } + + /// Sets the constant score to a different value. 
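
The integer range tests above lean on Rust's `Bound` semantics to express inclusive and exclusive endpoints. The standalone snippet below, illustrative only, spells out how the bound pairs used in those tests map onto plain comparisons.

```rust
// Illustrative only: how `Bound` pairs translate into comparisons,
// the same convention the range query tests above rely on.
use std::ops::Bound;

fn contains(range: (Bound<i64>, Bound<i64>), val: i64) -> bool {
    let lower_ok = match range.0 {
        Bound::Included(lo) => val >= lo,
        Bound::Excluded(lo) => val > lo,
        Bound::Unbounded => true,
    };
    let upper_ok = match range.1 {
        Bound::Included(hi) => val <= hi,
        Bound::Excluded(hi) => val < hi,
        Bound::Unbounded => true,
    };
    lower_ok && upper_ok
}

fn main() {
    // `10..11` means [10, 11): only 10 is inside.
    assert!(contains((Bound::Included(10), Bound::Excluded(11)), 10));
    assert!(!contains((Bound::Included(10), Bound::Excluded(11)), 11));
    // `(Included(10), Included(11))` covers both 10 and 11.
    assert!(contains((Bound::Included(10), Bound::Included(11)), 11));
}
```
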
+ pub fn set_score(&mut self, score: Score) { + self.score = score; + } +} + +impl DocSet for ConstScorer { + fn advance(&mut self) -> bool { + self.docset.advance() + } + + fn skip_next(&mut self, target: DocId) -> SkipResult { + self.docset.skip_next(target) + } + + fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { + self.docset.fill_buffer(buffer) + } + + fn doc(&self) -> DocId { + self.docset.doc() + } + + fn size_hint(&self) -> u32 { + self.docset.size_hint() + } + + fn append_to_bitset(&mut self, bitset: &mut BitSet) { + self.docset.append_to_bitset(bitset); + } +} + +impl Scorer for ConstScorer { + fn score(&self) -> Score { + 1f32 + } +} diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 43b969140..626cbe029 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -37,7 +37,7 @@ where self.postings.doc() } - fn size_hint(&self) -> usize { + fn size_hint(&self) -> u32 { self.postings.size_hint() } diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 5b56e64df..b36be64ae 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -54,7 +54,7 @@ where fn new(w: W, _field_type: FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { - fst_builder: fst_builder, + fst_builder, data: Vec::new(), term_ord: 0, }) @@ -111,7 +111,7 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { let values_source = source.slice(split_len, length_offset); let fst_index = open_fst_index(fst_source); TermDictionaryImpl { - fst_index: fst_index, + fst_index, values_mmap: values_source, } } @@ -120,6 +120,10 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { self.values_mmap.len() / TermInfo::SIZE_IN_BYTES } + fn term_ord>(&self, key: K) -> Option { + self.fst_index.get(key) + } + fn ord_to_term(&self, mut ord: TermOrdinal, bytes: &mut Vec) -> bool { bytes.clear(); let fst = self.fst_index.as_fst(); @@ -140,10 +144,6 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { true } - fn term_ord>(&self, key: K) -> Option { - self.fst_index.get(key) - } - fn term_info_from_ord(&self, term_ord: TermOrdinal) -> TermInfo { let buffer = self.values_mmap.as_slice(); let offset = term_ord as usize * TermInfo::SIZE_IN_BYTES; diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 5f84d933a..0dbc6667d 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -86,6 +86,10 @@ where /// Opens a `TermDictionary` given a data source. fn from_source(source: ReadOnlySource) -> Self; + /// Returns the number of terms in the dictionary. + /// Term ordinals range from 0 to `num_terms() - 1`. + fn num_terms(&self) -> usize; + /// Returns the ordinal associated to a given term. fn term_ord>(&self, term: K) -> Option; @@ -103,10 +107,6 @@ where /// Returns the number of terms in the dictionary. fn term_info_from_ord(&self, term_ord: TermOrdinal) -> TermInfo; - /// Returns the number of terms in the dictionary. - /// Term ordinals range from 0 to `num_terms() - 1`. - fn num_terms(&self) -> usize; - /// Lookups the value corresponding to the key. fn get>(&self, target_key: K) -> Option; From 9370427ae2863a4e2bb7ade4d224626b6adf6a1e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 12 Feb 2018 10:24:58 +0900 Subject: [PATCH 6/7] Terminfo blocks (#244) * Using u64 key in the store * Using Option<> for the next element, as opposed to u64 * Code simplification. * Added TermInfoStoreWriter. 
* Added a TermInfoStore * Added FixedSized for BinarySerialized. --- src/collector/facet_collector.rs | 2 +- src/common/bitpacker.rs | 103 ++---- src/common/mod.rs | 62 +++- src/common/serialize.rs | 104 ++++-- src/common/vint.rs | 4 + .../pack/compression_pack_nosimd.rs | 10 +- src/datastruct/skip/mod.rs | 41 ++- src/datastruct/skip/skiplist.rs | 67 ++-- src/datastruct/skip/skiplist_builder.rs | 46 ++- src/fastfield/mod.rs | 4 +- src/fastfield/reader.rs | 11 +- src/fastfield/serializer.rs | 15 +- src/postings/term_info.rs | 25 +- src/store/reader.rs | 5 +- src/store/writer.rs | 10 +- src/termdict/fstdict/mod.rs | 2 + src/termdict/fstdict/term_info_store.rs | 318 ++++++++++++++++++ src/termdict/fstdict/termdict.rs | 42 +-- src/termdict/mod.rs | 5 +- 19 files changed, 646 insertions(+), 230 deletions(-) create mode 100644 src/termdict/fstdict/term_info_store.rs diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 13d50e161..b9efd2660 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -330,7 +330,7 @@ impl FacetCollector { fn finalize_segment(&mut self) { if self.ff_reader.is_some() { self.segment_counters.push(SegmentFacetCounter { - facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() }, + facet_reader: self.ff_reader.take().unwrap().into_inner(), facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()), facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()), }); diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index b78a32746..992e2d1db 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -4,64 +4,31 @@ use common::serialize::BinarySerializable; use std::mem; use std::ops::Deref; -/// Computes the number of bits that will be used for bitpacking. -/// -/// In general the target is the minimum number of bits -/// required to express the amplitude given in argument. -/// -/// e.g. If the amplitude is 10, we can store all ints on simply 4bits. -/// -/// The logic is slightly more convoluted here as for optimization -/// reasons, we want to ensure that a value spawns over at most 8 bytes -/// of aligns bytes. -/// -/// Spanning over 9 bytes is possible for instance, if we do -/// bitpacking with an amplitude of 63 bits. -/// In this case, the second int will start on bit -/// 63 (which belongs to byte 7) and ends at byte 15; -/// Hence 9 bytes (from byte 7 to byte 15 included). -/// -/// To avoid this, we force the number of bits to 64bits -/// when the result is greater than `64-8 = 56 bits`. -/// -/// Note that this only affects rare use cases spawning over -/// a very large range of values. Even in this case, it results -/// in an extra cost of at most 12% compared to the optimal -/// number of bits. 
-pub fn compute_num_bits(amplitude: u64) -> u8 { - let amplitude = (64u32 - amplitude.leading_zeros()) as u8; - if amplitude <= 64 - 8 { - amplitude - } else { - 64 - } -} -pub struct BitPacker { +pub(crate) struct BitPacker { mini_buffer: u64, - mini_buffer_written: usize, - num_bits: usize, + mini_buffer_written: usize } impl BitPacker { - pub fn new(num_bits: usize) -> BitPacker { + pub fn new() -> BitPacker { BitPacker { mini_buffer: 0u64, - mini_buffer_written: 0, - num_bits, + mini_buffer_written: 0 } } - pub fn write(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> { + pub fn write(&mut self, val: u64, num_bits: u8, output: &mut TWrite) -> io::Result<()> { let val_u64 = val as u64; - if self.mini_buffer_written + self.num_bits > 64 { + let num_bits = num_bits as usize; + if self.mini_buffer_written + num_bits > 64 { self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32); self.mini_buffer.serialize(output)?; self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32); - self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64; + self.mini_buffer_written = self.mini_buffer_written + num_bits - 64; } else { self.mini_buffer |= val_u64 << self.mini_buffer_written; - self.mini_buffer_written += self.num_bits; + self.mini_buffer_written += num_bits; if self.mini_buffer_written == 64 { self.mini_buffer.serialize(output)?; self.mini_buffer_written = 0; @@ -71,7 +38,7 @@ impl BitPacker { Ok(()) } - pub(crate) fn flush(&mut self, output: &mut TWrite) -> io::Result<()> { + pub fn flush(&mut self, output: &mut TWrite) -> io::Result<()> { if self.mini_buffer_written > 0 { let num_bytes = (self.mini_buffer_written + 7) / 8; let arr: [u8; 8] = unsafe { mem::transmute::(self.mini_buffer) }; @@ -91,8 +58,8 @@ impl BitPacker { #[derive(Clone)] pub struct BitUnpacker -where - Data: Deref, + where + Data: Deref, { num_bits: usize, mask: u64, @@ -100,17 +67,18 @@ where } impl BitUnpacker -where - Data: Deref, + where + Data: Deref, { - pub fn new(data: Data, num_bits: usize) -> BitUnpacker { - let mask: u64 = if num_bits == 64 { - !0u64 - } else { - (1u64 << num_bits) - 1u64 - }; + pub fn new(data: Data, num_bits: u8) -> BitUnpacker { + let mask: u64 = + if num_bits == 64 { + !0u64 + } else { + (1u64 << num_bits) - 1u64 + }; BitUnpacker { - num_bits, + num_bits: num_bits as usize, mask, data, } @@ -148,7 +116,7 @@ where } unsafe { *(buffer[..].as_ptr() as *const u64) } }; - let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; + let val_shifted = val_unshifted_unmasked >> (bit_shift as u64); (val_shifted & mask) } } @@ -178,37 +146,26 @@ where #[cfg(test)] mod test { - use super::{compute_num_bits, BitPacker, BitUnpacker}; + use super::{BitPacker, BitUnpacker}; - #[test] - fn test_compute_num_bits() { - assert_eq!(compute_num_bits(1), 1u8); - assert_eq!(compute_num_bits(0), 0u8); - assert_eq!(compute_num_bits(2), 2u8); - assert_eq!(compute_num_bits(3), 2u8); - assert_eq!(compute_num_bits(4), 3u8); - assert_eq!(compute_num_bits(255), 8u8); - assert_eq!(compute_num_bits(256), 9u8); - assert_eq!(compute_num_bits(5_000_000_000), 33u8); - } - fn create_fastfield_bitpacker(len: usize, num_bits: usize) -> (BitUnpacker>, Vec) { + fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker>, Vec) { let mut data = Vec::new(); - let mut bitpacker = BitPacker::new(num_bits); - let max_val: u64 = (1 << num_bits) - 1; + let mut bitpacker = BitPacker::new(); + let max_val: u64 = (1u64 << num_bits as u64) - 1u64; let 
vals: Vec = (0u64..len as u64) .map(|i| if max_val == 0 { 0 } else { i % max_val }) .collect(); for &val in &vals { - bitpacker.write(val, &mut data).unwrap(); + bitpacker.write(val, num_bits,&mut data).unwrap(); } bitpacker.close(&mut data).unwrap(); - assert_eq!(data.len(), (num_bits * len + 7) / 8 + 7); + assert_eq!(data.len(), ((num_bits as usize)* len + 7) / 8 + 7); let bitunpacker = BitUnpacker::new(data, num_bits); (bitunpacker, vals) } - fn test_bitpacker_util(len: usize, num_bits: usize) { + fn test_bitpacker_util(len: usize, num_bits: u8) { let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits); for (i, val) in vals.iter().enumerate() { assert_eq!(bitunpacker.get(i), *val); diff --git a/src/common/mod.rs b/src/common/mod.rs index aceea844d..c103b468d 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -7,7 +7,7 @@ pub mod bitpacker; mod bitset; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; -pub use self::serialize::BinarySerializable; +pub use self::serialize::{BinarySerializable, FixedSize}; pub use self::timer::Timing; pub use self::timer::TimerTree; pub use self::timer::OpenTimer; @@ -15,11 +15,50 @@ pub use self::vint::VInt; pub use self::counting_writer::CountingWriter; pub use self::bitset::BitSet; pub(crate) use self::bitset::TinySet; +pub use byteorder::LittleEndian as Endianness; use std::io; +/// Computes the number of bits that will be used for bitpacking. +/// +/// In general the target is the minimum number of bits +/// required to express the amplitude given in argument. +/// +/// e.g. If the amplitude is 10, we can store all ints on simply 4bits. +/// +/// The logic is slightly more convoluted here as for optimization +/// reasons, we want to ensure that a value spawns over at most 8 bytes +/// of aligns bytes. +/// +/// Spanning over 9 bytes is possible for instance, if we do +/// bitpacking with an amplitude of 63 bits. +/// In this case, the second int will start on bit +/// 63 (which belongs to byte 7) and ends at byte 15; +/// Hence 9 bytes (from byte 7 to byte 15 included). +/// +/// To avoid this, we force the number of bits to 64bits +/// when the result is greater than `64-8 = 56 bits`. +/// +/// Note that this only affects rare use cases spawning over +/// a very large range of values. Even in this case, it results +/// in an extra cost of at most 12% compared to the optimal +/// number of bits. +pub(crate) fn compute_num_bits(n: u64) -> u8 { + let amplitude = (64u32 - n.leading_zeros()) as u8; + if amplitude <= 64 - 8 { + amplitude + } else { + 64 + } +} + + +pub(crate) fn is_power_of_2(n: usize) -> bool { + (n > 0) && (n & (n - 1) == 0) +} + /// Create a default io error given a string. 
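
Since `compute_num_bits` now lives in `common/mod.rs`, here is a quick standalone check of its rounding behaviour. The function body is copied from the patch; only the surrounding `main` is illustrative.

```rust
// Copied from the patch: values needing more than 56 bits are rounded up to
// 64 so that any bit-packed value spans at most 8 aligned bytes.
fn compute_num_bits(n: u64) -> u8 {
    let amplitude = (64u32 - n.leading_zeros()) as u8;
    if amplitude <= 64 - 8 { amplitude } else { 64 }
}

fn main() {
    assert_eq!(compute_num_bits(0), 0);
    assert_eq!(compute_num_bits(255), 8);
    assert_eq!(compute_num_bits(256), 9);
    // 2^56 - 1 still fits in 56 bits ...
    assert_eq!(compute_num_bits((1u64 << 56) - 1), 56);
    // ... but anything wider is forced into a full 64-bit slot.
    assert_eq!(compute_num_bits(1u64 << 56), 64);
}
```
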
-pub fn make_io_err(msg: String) -> io::Error { +pub(crate) fn make_io_err(msg: String) -> io::Error { io::Error::new(io::ErrorKind::Other, msg) } @@ -68,9 +107,10 @@ pub fn u64_to_i64(val: u64) -> i64 { } #[cfg(test)] -mod test { +pub(crate) mod test { - use super::{i64_to_u64, u64_to_i64}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64}; + pub use super::serialize::test::fixed_size_test; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); @@ -87,4 +127,18 @@ mod test { test_i64_converter_helper(i); } } + + + #[test] + fn test_compute_num_bits() { + assert_eq!(compute_num_bits(1), 1u8); + assert_eq!(compute_num_bits(0), 0u8); + assert_eq!(compute_num_bits(2), 2u8); + assert_eq!(compute_num_bits(3), 2u8); + assert_eq!(compute_num_bits(4), 3u8); + assert_eq!(compute_num_bits(255), 8u8); + assert_eq!(compute_num_bits(256), 9u8); + assert_eq!(compute_num_bits(5_000_000_000), 33u8); + } } + diff --git a/src/common/serialize.rs b/src/common/serialize.rs index f66c02b13..9012c0eb2 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -1,16 +1,26 @@ use byteorder::{ReadBytesExt, WriteBytesExt}; -use byteorder::LittleEndian as Endianness; +use common::Endianness; use std::fmt; use std::io::Write; use std::io::Read; use std::io; use common::VInt; +/// Trait for a simple binary serialization. pub trait BinarySerializable: fmt::Debug + Sized { + /// Serialize fn serialize(&self, writer: &mut W) -> io::Result<()>; + /// Deserialize fn deserialize(reader: &mut R) -> io::Result; } + +/// `FixedSize` marks a `BinarySerializable` as +/// always serializing to the same size. +pub trait FixedSize: BinarySerializable { + const SIZE_IN_BYTES: usize; +} + impl BinarySerializable for () { fn serialize(&self, _: &mut W) -> io::Result<()> { Ok(()) @@ -20,6 +30,10 @@ impl BinarySerializable for () { } } +impl FixedSize for () { + const SIZE_IN_BYTES: usize = 0; +} + impl BinarySerializable for Vec { fn serialize(&self, writer: &mut W) -> io::Result<()> { VInt(self.len() as u64).serialize(writer)?; @@ -59,6 +73,10 @@ impl BinarySerializable for u32 { } } +impl FixedSize for u32 { + const SIZE_IN_BYTES: usize = 4; +} + impl BinarySerializable for u64 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_u64::(*self) @@ -68,6 +86,10 @@ impl BinarySerializable for u64 { } } +impl FixedSize for u64 { + const SIZE_IN_BYTES: usize = 8; +} + impl BinarySerializable for i64 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_i64::(*self) @@ -77,6 +99,11 @@ impl BinarySerializable for i64 { } } +impl FixedSize for i64 { + const SIZE_IN_BYTES: usize = 8; +} + + impl BinarySerializable for u8 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_u8(*self) @@ -86,6 +113,10 @@ impl BinarySerializable for u8 { } } +impl FixedSize for u8 { + const SIZE_IN_BYTES: usize = 1; +} + impl BinarySerializable for String { fn serialize(&self, writer: &mut W) -> io::Result<()> { let data: &[u8] = self.as_bytes(); @@ -103,64 +134,79 @@ impl BinarySerializable for String { } } + #[cfg(test)] -mod test { +pub mod test { use common::VInt; use super::*; - fn serialize_test(v: T, num_bytes: usize) { + + pub fn fixed_size_test() { + let mut buffer = Vec::new(); + O::default().serialize(&mut buffer).unwrap(); + assert_eq!(buffer.len(), O::SIZE_IN_BYTES); + } + + + fn serialize_test(v: T) -> usize { let mut buffer: Vec = Vec::new(); - if num_bytes != 0 { - v.serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), num_bytes); - } else 
{ - v.serialize(&mut buffer).unwrap(); - } + v.serialize(&mut buffer).unwrap(); + let num_bytes = buffer.len(); let mut cursor = &buffer[..]; let deser = T::deserialize(&mut cursor).unwrap(); assert_eq!(deser, v); + num_bytes } #[test] fn test_serialize_u8() { - serialize_test(3u8, 1); - serialize_test(5u8, 1); + fixed_size_test::(); } #[test] fn test_serialize_u32() { - serialize_test(3u32, 4); - serialize_test(5u32, 4); - serialize_test(u32::max_value(), 4); + fixed_size_test::(); + assert_eq!(4, serialize_test(3u32)); + assert_eq!(4, serialize_test(5u32)); + assert_eq!(4, serialize_test(u32::max_value())); + } + + #[test] + fn test_serialize_i64() { + fixed_size_test::(); + } + + #[test] + fn test_serialize_u64() { + fixed_size_test::(); } #[test] fn test_serialize_string() { - serialize_test(String::from(""), 1); - serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4); - serialize_test(String::from("富士さん見える。"), 1 + 3 * 8); + assert_eq!(serialize_test(String::from("")), 1); + assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4); + assert_eq!(serialize_test(String::from("富士さん見える。")), 1 + 3 * 8); } #[test] fn test_serialize_vec() { - let v: Vec = Vec::new(); - serialize_test(v, 1); - serialize_test(vec![1u32, 3u32], 1 + 4 * 2); + assert_eq!(serialize_test(Vec::::new()), 1); + assert_eq!(serialize_test(vec![1u32, 3u32]), 1 + 4 * 2); } #[test] fn test_serialize_vint() { for i in 0..10_000 { - serialize_test(VInt(i as u64), 0); + serialize_test(VInt(i as u64)); } - serialize_test(VInt(7u64), 1); - serialize_test(VInt(127u64), 1); - serialize_test(VInt(128u64), 2); - serialize_test(VInt(129u64), 2); - serialize_test(VInt(1234u64), 2); - serialize_test(VInt(16_383), 2); - serialize_test(VInt(16_384), 3); - serialize_test(VInt(u64::max_value()), 10); + assert_eq!(serialize_test(VInt(7u64)), 1); + assert_eq!(serialize_test(VInt(127u64)), 1); + assert_eq!(serialize_test(VInt(128u64)), 2); + assert_eq!(serialize_test(VInt(129u64)), 2); + assert_eq!(serialize_test(VInt(1234u64)), 2); + assert_eq!(serialize_test(VInt(16_383u64)), 2); + assert_eq!(serialize_test(VInt(16_384u64)), 3); + assert_eq!(serialize_test(VInt(u64::max_value())), 10); } } diff --git a/src/common/vint.rs b/src/common/vint.rs index 70f673cfc..b0c32d1d3 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -11,6 +11,10 @@ impl VInt { pub fn val(&self) -> u64 { self.0 } + + pub fn deserialize_u64(reader: &mut R) -> io::Result { + VInt::deserialize(reader).map(|vint| vint.0) + } } impl BinarySerializable for VInt { diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 23e010b4c..420cd5dbe 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -23,9 +23,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz let num_bits = compute_num_bits(max_delta as u64); counting_writer.write_all(&[num_bits]).unwrap(); - let mut bit_packer = BitPacker::new(num_bits as usize); + let mut bit_packer = BitPacker::new(); for val in vals { - bit_packer.write(*val as u64, &mut counting_writer).unwrap(); + bit_packer.write(*val as u64, num_bits,&mut counting_writer).unwrap(); } counting_writer.written_bytes() } @@ -61,13 +61,13 @@ impl BlockEncoder { let num_bits = compute_num_bits(max as u64); let mut counting_writer = CountingWriter::wrap(output); counting_writer.write_all(&[num_bits]).unwrap(); - let mut bit_packer = BitPacker::new(num_bits as usize); + let mut bit_packer = BitPacker::new(); 
for val in vals { - bit_packer.write(*val as u64, &mut counting_writer).unwrap(); + bit_packer.write(*val as u64, num_bits, &mut counting_writer).unwrap(); } for _ in vals.len()..COMPRESSION_BLOCK_SIZE { bit_packer - .write(vals[0] as u64, &mut counting_writer) + .write(vals[0] as u64, num_bits, &mut counting_writer) .unwrap(); } bit_packer.flush(&mut counting_writer).expect( diff --git a/src/datastruct/skip/mod.rs b/src/datastruct/skip/mod.rs index 18268fdd0..260393e72 100644 --- a/src/datastruct/skip/mod.rs +++ b/src/datastruct/skip/mod.rs @@ -9,12 +9,12 @@ pub use self::skiplist::SkipList; #[cfg(test)] mod tests { - use super::*; + use super::{SkipList, SkipListBuilder}; #[test] fn test_skiplist() { let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10); + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(8); skip_list_builder.insert(2, &3).unwrap(); skip_list_builder.write::>(&mut output).unwrap(); let mut skip_list: SkipList = SkipList::from(output.as_slice()); @@ -24,7 +24,7 @@ mod tests { #[test] fn test_skiplist2() { let mut output: Vec = Vec::new(); - let skip_list_builder: SkipListBuilder = SkipListBuilder::new(10); + let skip_list_builder: SkipListBuilder = SkipListBuilder::new(8); skip_list_builder.write::>(&mut output).unwrap(); let mut skip_list: SkipList = SkipList::from(output.as_slice()); assert_eq!(skip_list.next(), None); @@ -71,7 +71,7 @@ mod tests { #[test] fn test_skiplist5() { let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3); + let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); skip_list_builder.insert(2, &()).unwrap(); skip_list_builder.insert(3, &()).unwrap(); skip_list_builder.insert(5, &()).unwrap(); @@ -103,7 +103,7 @@ mod tests { #[test] fn test_skiplist7() { let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3); + let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); for i in 0..1000 { skip_list_builder.insert(i, &()).unwrap(); } @@ -121,35 +121,48 @@ mod tests { #[test] fn test_skiplist8() { let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10); + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(8); skip_list_builder.insert(2, &3).unwrap(); skip_list_builder.write::>(&mut output).unwrap(); - assert_eq!(output.len(), 13); + assert_eq!(output.len(), 11); assert_eq!(output[0], 1u8 + 128u8); } #[test] fn test_skiplist9() { let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3); - for i in 0..9 { + let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(4); + for i in 0..4*4*4 { skip_list_builder.insert(i, &i).unwrap(); } skip_list_builder.write::>(&mut output).unwrap(); - assert_eq!(output.len(), 117); - assert_eq!(output[0], 3u8 + 128u8); + assert_eq!(output.len(), 774); + assert_eq!(output[0], 4u8 + 128u8); } #[test] fn test_skiplist10() { // checking that void gets serialized to nothing. 
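The `BitPacker` change running through the compression hunks above moves the bit width out of the constructor and into every `write` call. The type below is a simplified stand-in invented for this note, not the code in `src/common/bitpacker.rs`; it only illustrates how a 64-bit staging buffer can accept a different `num_bits` on each call and spill whole bytes as they fill up. The same property is what lets one packer type serve both the fast field serializer and the new term-info store, which use different widths per field.

use std::io::{self, Write};

// Simplified sketch, not tantivy's BitPacker.
struct SketchBitPacker {
    mini_buffer: u64,
    written_bits: u8,
}

impl SketchBitPacker {
    fn new() -> SketchBitPacker {
        SketchBitPacker { mini_buffer: 0, written_bits: 0 }
    }

    fn write<W: Write>(&mut self, val: u64, num_bits: u8, out: &mut W) -> io::Result<()> {
        assert!(num_bits <= 56, "sketch limited to 56-bit values");
        assert!(val >> num_bits == 0, "value must fit in num_bits");
        self.mini_buffer |= val << self.written_bits;
        self.written_bits += num_bits;
        while self.written_bits >= 8 {
            out.write_all(&[(self.mini_buffer & 0xff) as u8])?;
            self.mini_buffer >>= 8;
            self.written_bits -= 8;
        }
        Ok(())
    }

    fn flush<W: Write>(&mut self, out: &mut W) -> io::Result<()> {
        if self.written_bits > 0 {
            out.write_all(&[(self.mini_buffer & 0xff) as u8])?;
            self.mini_buffer = 0;
            self.written_bits = 0;
        }
        Ok(())
    }
}

fn main() -> io::Result<()> {
    let mut buffer: Vec<u8> = Vec::new();
    let mut packer = SketchBitPacker::new();
    packer.write(321, 9, &mut buffer)?; // widths can differ on every call
    packer.write(2, 2, &mut buffer)?;
    packer.write(51, 6, &mut buffer)?;
    packer.flush(&mut buffer)?;
    assert_eq!(buffer.len(), 3); // 9 + 2 + 6 = 17 bits round up to 3 bytes
    Ok(())
}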
let mut output: Vec = Vec::new(); - let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3); - for i in 0..9 { + let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); + for i in 0..((4*4*4) - 1) { skip_list_builder.insert(i, &()).unwrap(); } skip_list_builder.write::>(&mut output).unwrap(); - assert_eq!(output.len(), 81); + assert_eq!(output.len(), 230); + assert_eq!(output[0], 128u8 + 3u8); + } + + #[test] + fn test_skiplist11() { + // checking that void gets serialized to nothing. + let mut output: Vec = Vec::new(); + let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); + for i in 0..(4*4) { + skip_list_builder.insert(i, &()).unwrap(); + } + skip_list_builder.write::>(&mut output).unwrap(); + assert_eq!(output.len(), 65); assert_eq!(output[0], 128u8 + 3u8); } diff --git a/src/datastruct/skip/skiplist.rs b/src/datastruct/skip/skiplist.rs index 5323dcfcb..ef5491ac0 100644 --- a/src/datastruct/skip/skiplist.rs +++ b/src/datastruct/skip/skiplist.rs @@ -1,6 +1,5 @@ -use common::BinarySerializable; +use common::{BinarySerializable, VInt}; use std::marker::PhantomData; -use DocId; use std::cmp::max; static EMPTY: [u8; 0] = []; @@ -8,21 +7,20 @@ static EMPTY: [u8; 0] = []; struct Layer<'a, T> { data: &'a [u8], cursor: &'a [u8], - next_id: DocId, + next_id: Option, _phantom_: PhantomData, } impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> { - type Item = (DocId, T); + type Item = (u64, T); - fn next(&mut self) -> Option<(DocId, T)> { - if self.next_id == u32::max_value() { - None - } else { + fn next(&mut self) -> Option<(u64, T)> { + if let Some(cur_id) = self.next_id { let cur_val = T::deserialize(&mut self.cursor).unwrap(); - let cur_id = self.next_id; - self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value()); + self.next_id = VInt::deserialize_u64(&mut self.cursor).ok(); Some((cur_id, cur_val)) + } else { + None } } } @@ -30,7 +28,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> { impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> { fn from(data: &'a [u8]) -> Layer<'a, T> { let mut cursor = data; - let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value()); + let next_id = VInt::deserialize_u64(&mut cursor).ok(); Layer { data, cursor, @@ -45,14 +43,14 @@ impl<'a, T: BinarySerializable> Layer<'a, T> { Layer { data: &EMPTY, cursor: &EMPTY, - next_id: DocId::max_value(), + next_id: None, _phantom_: PhantomData, } } fn seek_offset(&mut self, offset: usize) { self.cursor = &self.data[offset..]; - self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value()); + self.next_id = VInt::deserialize_u64(&mut self.cursor).ok(); } // Returns the last element (key, val) @@ -60,54 +58,61 @@ impl<'a, T: BinarySerializable> Layer<'a, T> { // // If there is no such element anymore, // returns None. - fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> { - let mut val = None; - while self.next_id < doc_id { - match self.next() { - None => { - break; - } - v => { - val = v; + // + // If the element exists, it will be returned + // at the next call to `.next()`. 
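In the `Layer` rewrite above, the end of a layer is no longer marked by a `u32::max_value()` sentinel: the read-ahead key is an `Option` that becomes `None` as soon as a read fails. The toy below reproduces just that shape; for brevity each key is a single byte here, whereas the real layer stores `VInt`-encoded keys with a value next to each of them.

// Toy reader: one byte per key, None at end of input. Illustrative only.
fn read_key(cursor: &mut &[u8]) -> Option<u64> {
    let (&byte, rest) = cursor.split_first()?;
    *cursor = rest;
    Some(u64::from(byte))
}

// Same shape as the patched `Layer`: exhaustion needs no sentinel value.
struct ToyLayer<'a> {
    cursor: &'a [u8],
    next_key: Option<u64>,
}

impl<'a> ToyLayer<'a> {
    fn new(mut data: &'a [u8]) -> ToyLayer<'a> {
        let next_key = read_key(&mut data);
        ToyLayer { cursor: data, next_key }
    }
}

impl<'a> Iterator for ToyLayer<'a> {
    type Item = u64;
    fn next(&mut self) -> Option<u64> {
        let current = self.next_key?; // None: the layer is exhausted
        self.next_key = read_key(&mut self.cursor);
        Some(current)
    }
}

fn main() {
    let layer = ToyLayer::new(&[2u8, 5, 9]);
    assert_eq!(layer.collect::<Vec<u64>>(), vec![2, 5, 9]);
}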
+ fn seek(&mut self, key: u64) -> Option<(u64, T)> { + let mut result: Option<(u64, T)> = None; + loop { + if let Some(next_id) = self.next_id { + if next_id < key { + if let Some(v) = self.next() { + result = Some(v); + continue; + } } } + return result; } - val } } pub struct SkipList<'a, T: BinarySerializable> { data_layer: Layer<'a, T>, - skip_layers: Vec>, + skip_layers: Vec>, } impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> { - type Item = (DocId, T); + type Item = (u64, T); - fn next(&mut self) -> Option<(DocId, T)> { + fn next(&mut self) -> Option<(u64, T)> { self.data_layer.next() } } impl<'a, T: BinarySerializable> SkipList<'a, T> { - pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> { - let mut next_layer_skip: Option<(DocId, u32)> = None; + pub fn seek(&mut self, key: u64) -> Option<(u64, T)> { + let mut next_layer_skip: Option<(u64, u64)> = None; for skip_layer in &mut self.skip_layers { if let Some((_, offset)) = next_layer_skip { skip_layer.seek_offset(offset as usize); } - next_layer_skip = skip_layer.seek(doc_id); + next_layer_skip = skip_layer.seek(key); } if let Some((_, offset)) = next_layer_skip { self.data_layer.seek_offset(offset as usize); } - self.data_layer.seek(doc_id) + self.data_layer.seek(key) } } impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> { fn from(mut data: &'a [u8]) -> SkipList<'a, T> { - let offsets: Vec = Vec::deserialize(&mut data).unwrap(); + let offsets: Vec = Vec::::deserialize(&mut data) + .unwrap() + .into_iter() + .map(|el| el.0) + .collect(); let num_layers = offsets.len(); let layers_data: &[u8] = data; let data_layer: Layer<'a, T> = if num_layers == 0 { diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index 166c0bf0a..63aec23dd 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -1,13 +1,12 @@ use std::io::Write; -use common::BinarySerializable; +use common::{is_power_of_2, VInt, BinarySerializable}; use std::marker::PhantomData; -use DocId; use std::io; + struct LayerBuilder { - period: usize, + period_mask: usize, buffer: Vec, - remaining: usize, len: usize, _phantom_: PhantomData, } @@ -23,34 +22,33 @@ impl LayerBuilder { } fn with_period(period: usize) -> LayerBuilder { + assert!(is_power_of_2(period), "The period has to be a power of 2."); LayerBuilder { - period, + period_mask: (period - 1), buffer: Vec::new(), - remaining: period, len: 0, _phantom_: PhantomData, } } - fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result> { - self.remaining -= 1; + fn insert(&mut self, key: u64, value: &T) -> io::Result> { self.len += 1; - let offset = self.written_size() as u32; - doc_id.serialize(&mut self.buffer)?; + let offset = self.written_size() as u64; + VInt(key).serialize(&mut self.buffer)?; value.serialize(&mut self.buffer)?; - Ok(if self.remaining == 0 { - self.remaining = self.period; - Some((doc_id, offset)) + let emit_skip_info = (self.period_mask & self.len) == 0; + if emit_skip_info { + Ok(Some((key, offset))) } else { - None - }) + Ok(None) + } } } pub struct SkipListBuilder { period: usize, data_layer: LayerBuilder, - skip_layers: Vec>, + skip_layers: Vec>, } impl SkipListBuilder { @@ -62,7 +60,7 @@ impl SkipListBuilder { } } - fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder { + fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder { if layer_id == self.skip_layers.len() { let layer_builder = LayerBuilder::with_period(self.period); 
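The `period_mask` introduced above is the reason `with_period` now asserts a power-of-two period: for `p = 2^k`, `len & (p - 1) == 0` is the same test as `len % p == 0` without a division, so the separate `remaining` counter can go away. A quick, self-contained check of that equivalence (the names below are illustrative):

fn is_power_of_2(n: usize) -> bool {
    n > 0 && (n & (n - 1)) == 0
}

fn main() {
    let period = 8usize;
    assert!(is_power_of_2(period));
    let period_mask = period - 1;
    for len in 1..1000usize {
        assert_eq!((len & period_mask) == 0, len % period == 0);
    }
    // With period 8, skip entries are emitted at the 8th, 16th, 24th, ... insert.
}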
self.skip_layers.push(layer_builder); @@ -70,9 +68,9 @@ impl SkipListBuilder { &mut self.skip_layers[layer_id] } - pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> { + pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> { let mut layer_id = 0; - let mut skip_pointer = self.data_layer.insert(doc_id, dest)?; + let mut skip_pointer = self.data_layer.insert(key, dest)?; loop { skip_pointer = match skip_pointer { Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id) @@ -86,13 +84,11 @@ impl SkipListBuilder { } pub fn write(self, output: &mut W) -> io::Result<()> { - let mut size: u32 = 0; - let mut layer_sizes: Vec = Vec::new(); - size += self.data_layer.buffer.len() as u32; - layer_sizes.push(size); + let mut size: u64 = self.data_layer.buffer.len() as u64; + let mut layer_sizes = vec![VInt(size)]; for layer in self.skip_layers.iter().rev() { - size += layer.buffer.len() as u32; - layer_sizes.push(size); + size += layer.buffer.len() as u64; + layer_sizes.push(VInt(size)); } layer_sizes.serialize(output)?; self.data_layer.write(output)?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 6dcf2b480..ffca841b7 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -348,7 +348,7 @@ mod tests { b.iter(|| { let n = test::black_box(7000u32); let mut a = 0u64; - for i in Iterator::step_by((0u32..n), 7) { + for i in Iterator::step_by(0u32..n, 7) { a ^= permutation[i as usize]; } a @@ -394,7 +394,7 @@ mod tests { b.iter(|| { let n = test::black_box(7000u32); let mut a = 0u64; - for i in Iterator::step_by((0u32..n), 7) { + for i in Iterator::step_by(0u32..n, 7) { a ^= fast_field_reader.get(i); } a diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 1142c25d8..003a75a8e 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,6 +1,7 @@ use directory::ReadOnlySource; use common::{self, BinarySerializable}; -use common::bitpacker::{compute_num_bits, BitUnpacker}; +use common::compute_num_bits; +use common::bitpacker::BitUnpacker; use DocId; use schema::SchemaBuilder; use std::path::Path; @@ -117,11 +118,11 @@ impl FastFieldReader for U64FastFieldReader { let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); let owning_ref = OwningRef::new(data).map(|data| &data[16..]); - let bit_unpacker = BitUnpacker::new(owning_ref, num_bits as usize); + let bit_unpacker = BitUnpacker::new(owning_ref, num_bits); U64FastFieldReader { - min_value: min_value, - max_value: max_value, - bit_unpacker: bit_unpacker, + min_value, + max_value, + bit_unpacker, } } } diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index bde080e0e..8fab68e95 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -1,7 +1,8 @@ use common::BinarySerializable; use directory::WritePtr; use schema::Field; -use common::bitpacker::{compute_num_bits, BitPacker}; +use common::bitpacker::BitPacker; +use common::compute_num_bits; use common::CountingWriter; use common::CompositeWrite; use std::io::{self, Write}; @@ -74,6 +75,7 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> { bit_packer: BitPacker, write: &'a mut W, min_value: u64, + num_bits: u8, } impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { @@ -86,18 +88,19 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { let amplitude = max_value - min_value; amplitude.serialize(write)?; let num_bits = compute_num_bits(amplitude); - let bit_packer = BitPacker::new(num_bits as usize); + let bit_packer = BitPacker::new(); 
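The `insert` method above cascades: whenever a layer emits a skip pointer, that pointer is inserted one level up, and new layers are created on demand. The toy model below reproduces only that control flow with counters; it ignores offsets and serialization, so it is an illustration of the cascade rather than of the real `SkipListBuilder`.

struct ToyLayer {
    period: u64,
    len: u64,
}

impl ToyLayer {
    // Returns Some(key) when this entry should also be recorded one layer up.
    fn insert(&mut self, key: u64) -> Option<u64> {
        self.len += 1;
        if self.len % self.period == 0 {
            Some(key)
        } else {
            None
        }
    }
}

fn insert_cascading(layers: &mut Vec<ToyLayer>, period: u64, key: u64) {
    if layers.is_empty() {
        layers.push(ToyLayer { period, len: 0 });
    }
    let mut layer_id = 0;
    let mut promoted = layers[0].insert(key);
    while let Some(promoted_key) = promoted {
        layer_id += 1;
        if layer_id == layers.len() {
            layers.push(ToyLayer { period, len: 0 });
        }
        promoted = layers[layer_id].insert(promoted_key);
    }
}

fn main() {
    let mut layers: Vec<ToyLayer> = Vec::new();
    for key in 0..64u64 {
        insert_cascading(&mut layers, 4, key);
    }
    let lens: Vec<u64> = layers.iter().map(|layer| layer.len).collect();
    assert_eq!(lens, vec![64, 16, 4, 1]); // each layer is `period` times sparser
}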
Ok(FastSingleFieldSerializer { - write: write, - bit_packer: bit_packer, - min_value: min_value, + write, + bit_packer, + min_value, + num_bits }) } /// Pushes a new value to the currently open u64 fast field. pub fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; - self.bit_packer.write(val_to_write, &mut self.write)?; + self.bit_packer.write(val_to_write, self.num_bits,&mut self.write)?; Ok(()) } diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 654ba2ab2..a6af45e8a 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,4 +1,4 @@ -use common::BinarySerializable; +use common::{BinarySerializable, FixedSize}; use std::io; /// `TermInfo` contains all of the information @@ -23,10 +23,13 @@ pub struct TermInfo { pub positions_inner_offset: u8, } -impl TermInfo { - /// Size required to encode the `TermInfo`. - // TODO make this smaller when positions are unused for instance. - pub(crate) const SIZE_IN_BYTES: usize = 4 + 8 + 8 + 1; +impl FixedSize for TermInfo { + /// Size required for the binary serialization of `TermInfo`. + /// This is large, but in practise, all `TermInfo` but the first one + /// of the block are bitpacked. + /// + /// See `TermInfoStore`. + const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2*u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES; } impl BinarySerializable for TermInfo { @@ -50,3 +53,15 @@ impl BinarySerializable for TermInfo { }) } } + +#[cfg(test)] +mod tests { + + use super::TermInfo; + use common::test::fixed_size_test; + + #[test] + fn test_fixed_size() { + fixed_size_test::(); + } +} diff --git a/src/store/reader.rs b/src/store/reader.rs index 7f4343f8f..f1d139d6d 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -39,7 +39,10 @@ impl StoreReader { } fn block_offset(&self, doc_id: DocId) -> (DocId, u64) { - self.block_index().seek(doc_id + 1).unwrap_or((0u32, 0u64)) + self.block_index() + .seek(doc_id as u64 + 1) + .map(|(doc, offset)| (doc as DocId, offset)) + .unwrap_or((0u32, 0u64)) } pub(crate) fn block_data(&self) -> &[u8] { diff --git a/src/store/writer.rs b/src/store/writer.rs index 34261c4cb..ad356e870 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -34,7 +34,7 @@ impl StoreWriter { pub fn new(writer: WritePtr) -> StoreWriter { StoreWriter { doc: 0, - offset_index_writer: SkipListBuilder::new(3), + offset_index_writer: SkipListBuilder::new(4), writer: CountingWriter::wrap(writer), intermediary_buffer: Vec::new(), current_block: Vec::new(), @@ -67,7 +67,7 @@ impl StoreWriter { if !self.current_block.is_empty() { self.write_and_compress_block()?; self.offset_index_writer - .insert(self.doc, &(self.writer.written_bytes() as u64))?; + .insert(self.doc as u64, &(self.writer.written_bytes() as u64))?; } let doc_offset = self.doc; let start_offset = self.writer.written_bytes() as u64; @@ -78,9 +78,9 @@ impl StoreWriter { // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. 
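`add_val` above only stores `val - min_value`, and the serializer sizes its bit width from the amplitude `max_value - min_value`. The snippet below re-derives that sizing; its `compute_num_bits` is written from scratch as a plausible equivalent of the real helper, and the sample values are made up for illustration.

// Smallest width (in bits) that can hold `amplitude`; 0 stays 0 bits.
fn compute_num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    let values = [1003u64, 1000, 1017, 1009];
    let min_value = *values.iter().min().unwrap();
    let max_value = *values.iter().max().unwrap();
    let num_bits = compute_num_bits(max_value - min_value);
    assert_eq!(num_bits, 5); // amplitude 17 needs 5 bits instead of 64 per value
    let deltas: Vec<u64> = values.iter().map(|v| v - min_value).collect();
    assert_eq!(deltas, vec![3, 0, 17, 9]);
}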
for (next_doc_id, block_addr) in store_reader.block_index() { - self.doc = doc_offset + next_doc_id; + self.doc = doc_offset + next_doc_id as u32; self.offset_index_writer - .insert(self.doc, &(start_offset + block_addr))?; + .insert(self.doc as u64, &(start_offset + block_addr))?; } Ok(()) } @@ -96,7 +96,7 @@ impl StoreWriter { (self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?; self.writer.write_all(&self.intermediary_buffer)?; self.offset_index_writer - .insert(self.doc, &(self.writer.written_bytes() as u64))?; + .insert(self.doc as u64, &(self.writer.written_bytes() as u64))?; self.current_block.clear(); Ok(()) } diff --git a/src/termdict/fstdict/mod.rs b/src/termdict/fstdict/mod.rs index a244ac2b4..0f31b6e15 100644 --- a/src/termdict/fstdict/mod.rs +++ b/src/termdict/fstdict/mod.rs @@ -16,8 +16,10 @@ Keys (`&[u8]`) in this datastructure are sorted. mod termdict; mod streamer; +mod term_info_store; pub use self::termdict::TermDictionaryImpl; pub use self::termdict::TermDictionaryBuilderImpl; +pub use self::term_info_store::{TermInfoStore, TermInfoStoreWriter}; pub use self::streamer::TermStreamerImpl; pub use self::streamer::TermStreamerBuilderImpl; diff --git a/src/termdict/fstdict/term_info_store.rs b/src/termdict/fstdict/term_info_store.rs new file mode 100644 index 000000000..407b68b00 --- /dev/null +++ b/src/termdict/fstdict/term_info_store.rs @@ -0,0 +1,318 @@ +use std::io; +use std::cmp; +use std::io::{Read, Write}; +use postings::TermInfo; +use common::{BinarySerializable, FixedSize}; +use common::compute_num_bits; +use common::Endianness; +use common::bitpacker::BitPacker; +use directory::ReadOnlySource; +use termdict::TermOrdinal; +use byteorder::ByteOrder; + + +const BLOCK_LEN: usize = 256; + + +#[derive(Debug, Eq, PartialEq, Default)] +struct TermInfoBlockMeta { + offset: u64, + ref_term_info: TermInfo, + doc_freq_nbits: u8, + postings_offset_nbits: u8, + positions_offset_nbits: u8, +} + +impl BinarySerializable for TermInfoBlockMeta { + fn serialize(&self, write: &mut W) -> io::Result<()> { + self.offset.serialize(write)?; + self.ref_term_info.serialize(write)?; + write.write_all(&[self.doc_freq_nbits, + self.postings_offset_nbits, + self.positions_offset_nbits])?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let offset = u64::deserialize(reader)?; + let ref_term_info = TermInfo::deserialize(reader)?; + let mut buffer = [0u8; 3]; + reader.read_exact(&mut buffer)?; + Ok(TermInfoBlockMeta { + offset, + ref_term_info, + doc_freq_nbits: buffer[0], + postings_offset_nbits: buffer[1], + positions_offset_nbits: buffer[2] + }) + } +} + +impl FixedSize for TermInfoBlockMeta { + const SIZE_IN_BYTES: usize = u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES; +} + +impl TermInfoBlockMeta { + + fn num_bits(&self) -> u8 { + self.doc_freq_nbits + self.postings_offset_nbits + self.positions_offset_nbits + 7 + } + + fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { + let num_bits = self.num_bits() as usize; + let mut cursor = num_bits * inner_offset; + + let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32; + cursor += self.doc_freq_nbits as usize; + + let postings_offset = extract_bits(data, cursor, self.postings_offset_nbits); + cursor += self.postings_offset_nbits as usize; + + let positions_offset = extract_bits(data, cursor, self.positions_offset_nbits); + cursor += self.positions_offset_nbits as usize; + + let positions_inner_offset = extract_bits(data, cursor, 7) as u8; + + 
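The new term-info storage introduced above keeps one full reference `TermInfo` per block of `BLOCK_LEN = 256` terms and stores the other 255 entries as bitpacked deltas against that reference, each field with its own width. The sketch below shows only the lookup arithmetic (block id, inner offset, delta added back to the reference); `ToyTermInfo` and the in-memory vectors are stand-ins made up for this note, not the on-disk format. Keeping the reference uncompressed means a lookup touches one block meta and one packed slot, never a scan from the start.

const BLOCK_LEN: u64 = 256;

#[derive(Clone, Copy, Debug, PartialEq)]
struct ToyTermInfo {
    postings_offset: u64,
}

// refs[b] is the first TermInfo of block b; deltas[b][i] belongs to the term
// with ordinal b * 256 + i + 1 and is relative to refs[b].
fn lookup(refs: &[ToyTermInfo], deltas: &[Vec<u64>], term_ord: u64) -> ToyTermInfo {
    let block_id = (term_ord / BLOCK_LEN) as usize;
    let inner_offset = (term_ord % BLOCK_LEN) as usize;
    let reference = refs[block_id];
    if inner_offset == 0 {
        reference
    } else {
        ToyTermInfo {
            postings_offset: reference.postings_offset + deltas[block_id][inner_offset - 1],
        }
    }
}

fn main() {
    let refs = vec![ToyTermInfo { postings_offset: 1_000 }];
    let deltas = vec![vec![40, 90, 130]]; // packed offsets 1_040, 1_090, 1_130
    assert_eq!(lookup(&refs, &deltas, 0).postings_offset, 1_000);
    assert_eq!(lookup(&refs, &deltas, 2).postings_offset, 1_090);
}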
TermInfo { + doc_freq, + postings_offset: postings_offset + self.ref_term_info.postings_offset, + positions_offset: positions_offset + self.ref_term_info.positions_offset, + positions_inner_offset, + } + } +} + + +pub struct TermInfoStore { + num_terms: usize, + block_meta_source: ReadOnlySource, + term_info_source: ReadOnlySource +} + +fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { + assert!(num_bits <= 56); + let addr_byte = addr_bits / 8; + let bit_shift = (addr_bits % 8) as u64; + let val_unshifted_unmasked: u64 = unsafe { *(data[addr_byte..].as_ptr() as *const u64) }; + let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift; + let mask = (1u64 << (num_bits as u64)) - 1; + val_shifted_unmasked & mask +} + +impl TermInfoStore { + pub fn open(data: ReadOnlySource) -> TermInfoStore { + let buffer = data.as_slice(); + let len = Endianness::read_u64(&buffer[0..8]) as usize; + let num_terms = Endianness::read_u64(&buffer[8..16]) as usize; + let block_meta_source = data.slice(16, 16 + len); + let term_info_source = data.slice_from(16 + len); + TermInfoStore { + num_terms, + block_meta_source, + term_info_source + } + } + + pub fn get(&self, term_ord: TermOrdinal) -> TermInfo { + let block_id = (term_ord as usize) / BLOCK_LEN; + let buffer = self.block_meta_source.as_slice(); + let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..]; + let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data).expect("Failed to deserialize terminfoblockmeta"); + let inner_offset = (term_ord as usize) % BLOCK_LEN; + if inner_offset == 0 { + term_info_block_data.ref_term_info + } else { + let term_info_data = self.term_info_source.as_slice(); + term_info_block_data.deserialize_term_info(&term_info_data[term_info_block_data.offset as usize..], inner_offset - 1) + } + } + + pub fn num_terms(&self) -> usize { + self.num_terms + } +} + +pub struct TermInfoStoreWriter { + buffer_block_metas: Vec, + buffer_term_infos: Vec, + term_infos: Vec, + num_terms: u64, +} + +fn bitpack_serialize( + write: &mut W, + bit_packer: &mut BitPacker, + term_info_block_meta: &TermInfoBlockMeta, + term_info: &TermInfo) -> io::Result<()> { + bit_packer.write(term_info.doc_freq as u64, term_info_block_meta.doc_freq_nbits, write)?; + bit_packer.write(term_info.postings_offset, term_info_block_meta.postings_offset_nbits, write)?; + bit_packer.write(term_info.positions_offset, term_info_block_meta.positions_offset_nbits, write)?; + bit_packer.write(term_info.positions_inner_offset as u64, 7, write)?; + Ok(()) +} + +impl TermInfoStoreWriter { + pub fn new() -> TermInfoStoreWriter { + TermInfoStoreWriter { + buffer_block_metas: Vec::new(), + buffer_term_infos: Vec::new(), + term_infos: Vec::with_capacity(BLOCK_LEN), + num_terms: 0u64 + } + } + + fn flush_block(&mut self) -> io::Result<()> { + if self.term_infos.is_empty() { + return Ok(()); + } + let mut bit_packer = BitPacker::new(); + let ref_term_info = self.term_infos[0].clone(); + for term_info in &mut self.term_infos[1..] { + term_info.postings_offset -= ref_term_info.postings_offset; + term_info.positions_offset -= ref_term_info.positions_offset; + } + + let mut max_doc_freq: u32 = 0u32; + let mut max_postings_offset: u64 = 0u64; + let mut max_positions_offset: u64 = 0u64; + for term_info in &self.term_infos[1..] 
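`extract_bits` above reads a field that may straddle byte boundaries by loading eight bytes at the containing byte address and then shifting and masking; `num_bits` is capped at 56 so that, together with a bit shift of at most 7, the window always fits in 64 bits, and the buffers it reads from carry seven trailing padding bytes so the 8-byte load stays in bounds. Below is a safe, std-only rendering of the same arithmetic, assuming the little-endian layout that the raw pointer read gives on little-endian targets; the sample buffer hand-packs the three values used by `test_bitpacked`.

fn extract_bits_safe(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
    assert!(num_bits <= 56);
    let addr_byte = addr_bits / 8;
    let bit_shift = (addr_bits % 8) as u32;
    // Relies on at least 7 bytes of padding after the last payload byte,
    // exactly like the unsafe version above.
    let mut word_bytes = [0u8; 8];
    word_bytes.copy_from_slice(&data[addr_byte..addr_byte + 8]);
    let word = u64::from_le_bytes(word_bytes);
    let mask = (1u64 << u32::from(num_bits)) - 1;
    (word >> bit_shift) & mask
}

fn main() {
    // 321 (9 bits), 2 (2 bits), 51 (6 bits), then 7 bytes of padding.
    let buffer = [65u8, 157, 1, 0, 0, 0, 0, 0, 0, 0];
    assert_eq!(extract_bits_safe(&buffer, 0, 9), 321);
    assert_eq!(extract_bits_safe(&buffer, 9, 2), 2);
    assert_eq!(extract_bits_safe(&buffer, 11, 6), 51);
}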
{ + max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq); + max_postings_offset = cmp::max(max_postings_offset, term_info.postings_offset); + max_positions_offset = cmp::max(max_positions_offset, term_info.positions_offset); + } + + let max_doc_freq_nbits: u8 = compute_num_bits(max_doc_freq as u64); + let max_postings_offset_nbits = compute_num_bits(max_postings_offset); + let max_positions_offset_nbits = compute_num_bits(max_positions_offset); + + let term_info_block_meta = TermInfoBlockMeta { + offset: self.buffer_term_infos.len() as u64, + ref_term_info, + doc_freq_nbits: max_doc_freq_nbits, + postings_offset_nbits: max_postings_offset_nbits, + positions_offset_nbits: max_positions_offset_nbits, + }; + + term_info_block_meta.serialize(&mut self.buffer_block_metas)?; + for term_info in self.term_infos[1..].iter().cloned() { + bitpack_serialize( + &mut self.buffer_term_infos, + &mut bit_packer, + &term_info_block_meta, + &term_info + )?; + } + + // Block need end up at the end of a byte. + bit_packer.flush(&mut self.buffer_term_infos)?; + self.term_infos.clear(); + + Ok(()) + } + + pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> { + self.num_terms += 1u64; + self.term_infos.push(term_info.clone()); + if self.term_infos.len() >= BLOCK_LEN { + self.flush_block()?; + } + Ok(()) + } + + pub fn serialize(&mut self, write: &mut W) -> io::Result<()> { + if !self.term_infos.is_empty() { + self.flush_block()?; + } + let len = self.buffer_block_metas.len() as u64; + len.serialize(write)?; + self.num_terms.serialize(write)?; + write.write_all(&self.buffer_block_metas)?; + write.write_all(&self.buffer_term_infos)?; + write.write_all(&[0u8; 7])?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use super::extract_bits; + use common::bitpacker::BitPacker; + use common::BinarySerializable; + use super::TermInfoBlockMeta; + use super::{TermInfoStore, TermInfoStoreWriter}; + use directory::ReadOnlySource; + use postings::TermInfo; + use common::compute_num_bits; + use common; + + #[test] + fn test_term_info_block() { + common::test::fixed_size_test::(); + } + + #[test] + fn test_bitpacked() { + let mut buffer = Vec::new(); + let mut bitpack = BitPacker::new(); + bitpack.write(321u64, 9, &mut buffer).unwrap(); + assert_eq!(compute_num_bits(321u64), 9); + bitpack.write(2u64, 2, &mut buffer).unwrap(); + assert_eq!(compute_num_bits(2u64), 2); + bitpack.write(51, 6, &mut buffer).unwrap(); + assert_eq!(compute_num_bits(51), 6); + bitpack.close(&mut buffer).unwrap(); + assert_eq!(buffer.len(), 3 + 7); + assert_eq!(extract_bits(&buffer[..], 0, 9), 321u64); + assert_eq!(extract_bits(&buffer[..], 9, 2), 2u64); + assert_eq!(extract_bits(&buffer[..], 11, 6), 51u64); + } + + #[test] + fn test_term_info_block_meta_serialization() { + let term_info_block_meta = TermInfoBlockMeta { + offset: 2009, + ref_term_info: TermInfo { + doc_freq: 512, + postings_offset: 51, + positions_offset: 3584, + positions_inner_offset: 0 + }, + doc_freq_nbits: 10, + postings_offset_nbits: 5, + positions_offset_nbits: 11 + }; + let mut buffer: Vec = Vec::new(); + term_info_block_meta.serialize(&mut buffer).unwrap(); + let mut cursor: &[u8] = &buffer[..]; + let term_info_block_meta_serde = TermInfoBlockMeta::deserialize(&mut cursor).unwrap(); + assert_eq!(term_info_block_meta_serde, term_info_block_meta); + } + + #[test] + fn test_pack() { + let mut store_writer = TermInfoStoreWriter::new(); + let mut term_infos = vec!(); + for i in 0..1000 { + let term_info = TermInfo { + doc_freq: i as u32, + postings_offset: 
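Two small pieces of arithmetic from the code above, made explicit: `num_bits()` adds a constant 7 because `positions_inner_offset` is always packed with exactly 7 bits, and `test_bitpacked` expects 3 payload bytes plus the 7 padding bytes appended by `close`. The check below merely redoes that accounting; it introduces no new behaviour.

fn main() {
    // Per-entry width for the meta used in test_term_info_block_meta_serialization:
    // doc_freq (10) + postings_offset (5) + positions_offset (11) + inner offset (7).
    let per_entry_bits = 10 + 5 + 11 + 7;
    assert_eq!(per_entry_bits, 33);

    // Byte accounting for test_bitpacked: 9 + 2 + 6 bits of payload.
    let payload_bits = 9 + 2 + 6;
    let payload_bytes = (payload_bits + 7) / 8; // round up to whole bytes
    assert_eq!(payload_bytes, 3);
    assert_eq!(payload_bytes + 7, 10); // matches `assert_eq!(buffer.len(), 3 + 7)`
}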
(i / 10) as u64, + positions_offset: (i * 7) as u64, + positions_inner_offset: (i % 128) as u8, + }; + store_writer.write_term_info(&term_info).unwrap(); + term_infos.push(term_info); + } + let mut buffer = Vec::new(); + store_writer + .serialize(&mut buffer) + .unwrap(); + let term_info_store = TermInfoStore::open(ReadOnlySource::from(buffer)); + for i in 0..1000 { + assert_eq!(term_info_store.get(i as u64), term_infos[i]); + } + } + +} + + diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index b36be64ae..f2d1dfaa6 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -3,10 +3,11 @@ use fst; use fst::raw::Fst; use directory::ReadOnlySource; use common::BinarySerializable; +use common::CountingWriter; use schema::FieldType; use postings::TermInfo; use termdict::{TermDictionary, TermDictionaryBuilder, TermOrdinal}; -use super::{TermStreamerBuilderImpl, TermStreamerImpl}; +use super::{TermStreamerBuilderImpl, TermStreamerImpl, TermInfoStoreWriter, TermInfoStore}; fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -15,7 +16,7 @@ fn convert_fst_error(e: fst::Error) -> io::Error { /// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html) pub struct TermDictionaryBuilderImpl { fst_builder: fst::MapBuilder, - data: Vec, + term_info_store_writer: TermInfoStoreWriter, term_ord: u64, } @@ -41,8 +42,8 @@ where /// # Warning /// /// Horribly dangerous internal API. See `.insert_key(...)`. - pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> { - value.serialize(&mut self.data)?; + pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> { + self.term_info_store_writer.write_term_info(term_info)?; Ok(()) } } @@ -55,7 +56,7 @@ where let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilderImpl { fst_builder, - data: Vec::new(), + term_info_store_writer: TermInfoStoreWriter::new(), term_ord: 0, }) } @@ -67,12 +68,15 @@ where Ok(()) } - fn finish(self) -> io::Result { + fn finish(mut self) -> io::Result { let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?; - let footer_size = self.data.len() as u32; - file.write_all(&self.data)?; - (footer_size as u32).serialize(&mut file)?; - file.flush()?; + { + let mut counting_writer = CountingWriter::wrap(&mut file); + self.term_info_store_writer.serialize(&mut counting_writer)?; + let footer_size = counting_writer.written_bytes(); + (footer_size as u64).serialize(&mut counting_writer)?; + counting_writer.flush()?; + } Ok(file) } } @@ -92,7 +96,7 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map { /// See [`TermDictionary`](./trait.TermDictionary.html) pub struct TermDictionaryImpl { fst_index: fst::Map, - values_mmap: ReadOnlySource, + term_info_store: TermInfoStore, } impl<'a> TermDictionary<'a> for TermDictionaryImpl { @@ -102,22 +106,22 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { fn from_source(source: ReadOnlySource) -> Self { let total_len = source.len(); - let length_offset = total_len - 4; + let length_offset = total_len - 8; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = u32::deserialize(&mut split_len_buffer) - .expect("Deserializing 4 bytes should always work") as usize; + let footer_size = u64::deserialize(&mut split_len_buffer) + .expect("Deserializing 8 bytes should always work") as usize; let split_len = length_offset - footer_size; let fst_source = source.slice(0, 
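`finish` and `from_source` above agree on a simple layout for the dictionary file: the fst data, then the serialized term-info store, then the store's byte length as a trailing `u64`, so a reader can split the source by looking only at the last eight bytes. The sketch below redoes that split on a plain byte slice; it assumes a little-endian footer for illustration, whereas the real code goes through `BinarySerializable`, and the stand-in byte strings are made up for this note.

// Returns (fst bytes, term-info bytes), split off by the trailing length.
fn split_dictionary(data: &[u8]) -> (&[u8], &[u8]) {
    let length_offset = data.len() - 8;
    let mut footer = [0u8; 8];
    footer.copy_from_slice(&data[length_offset..]);
    let footer_size = u64::from_le_bytes(footer) as usize;
    let split_len = length_offset - footer_size;
    (&data[..split_len], &data[split_len..length_offset])
}

fn main() {
    let mut file: Vec<u8> = Vec::new();
    file.extend_from_slice(b"FST-BYTES");        // stand-in for the fst data
    file.extend_from_slice(b"TERMINFO");         // stand-in for the term-info store
    file.extend_from_slice(&8u64.to_le_bytes()); // footer: term-info store length
    let (fst, term_infos) = split_dictionary(&file);
    assert_eq!(fst, &b"FST-BYTES"[..]);
    assert_eq!(term_infos, &b"TERMINFO"[..]);
}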
split_len); let values_source = source.slice(split_len, length_offset); let fst_index = open_fst_index(fst_source); TermDictionaryImpl { fst_index, - values_mmap: values_source, + term_info_store: TermInfoStore::open(values_source), } } fn num_terms(&self) -> usize { - self.values_mmap.len() / TermInfo::SIZE_IN_BYTES + self.term_info_store.num_terms() } fn term_ord>(&self, key: K) -> Option { @@ -145,11 +149,7 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl { } fn term_info_from_ord(&self, term_ord: TermOrdinal) -> TermInfo { - let buffer = self.values_mmap.as_slice(); - let offset = term_ord as usize * TermInfo::SIZE_IN_BYTES; - let mut cursor = &buffer[offset..]; - TermInfo::deserialize(&mut cursor) - .expect("The fst is corrupted. Failed to deserialize a value.") + self.term_info_store.get(term_ord) } fn get>(&self, key: K) -> Option { diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 0dbc6667d..64ff08732 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -5,7 +5,6 @@ that serves as an address in their respective posting list. The term dictionary API makes it possible to iterate through a range of keys in a sorted manner. -``` # Implementations @@ -471,7 +470,7 @@ mod tests { } { - for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) { + for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { let &(ref target_key, _) = &ids[i]; let mut streamer = term_dictionary .range() @@ -487,7 +486,7 @@ mod tests { } { - for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) { + for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { for j in 0..3 { let &(ref fst_key, _) = &ids[i]; let &(ref last_key, _) = &ids[i + j]; From a7ffc0e610c10195e0a3214943fa50a930e79bc3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 12 Feb 2018 10:31:29 +0900 Subject: [PATCH 7/7] Rustfmt --- src/common/bitpacker.rs | 42 +++++------ src/common/bitset.rs | 74 ++++++++----------- src/common/mod.rs | 3 - src/common/serialize.rs | 10 +-- .../pack/compression_pack_nosimd.rs | 8 +- src/compression/pack/compression_pack_simd.rs | 4 +- src/core/index_meta.rs | 3 +- src/datastruct/skip/mod.rs | 6 +- src/datastruct/skip/skiplist_builder.rs | 3 +- src/fastfield/serializer.rs | 5 +- src/lib.rs | 6 +- src/postings/term_info.rs | 2 +- src/query/bitset/mod.rs | 26 ++++--- src/query/range_query.rs | 74 +++++++++---------- src/query/scorer.rs | 2 - src/schema/int_options.rs | 3 +- src/termdict/fstdict/term_info_store.rs | 72 ++++++++++-------- src/termdict/fstdict/termdict.rs | 2 +- src/tokenizer/facet_tokenizer.rs | 9 +-- 19 files changed, 173 insertions(+), 181 deletions(-) diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 992e2d1db..1521fd2af 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -4,21 +4,25 @@ use common::serialize::BinarySerializable; use std::mem; use std::ops::Deref; - pub(crate) struct BitPacker { mini_buffer: u64, - mini_buffer_written: usize + mini_buffer_written: usize, } impl BitPacker { pub fn new() -> BitPacker { BitPacker { mini_buffer: 0u64, - mini_buffer_written: 0 + mini_buffer_written: 0, } } - pub fn write(&mut self, val: u64, num_bits: u8, output: &mut TWrite) -> io::Result<()> { + pub fn write( + &mut self, + val: u64, + num_bits: u8, + output: &mut TWrite, + ) -> io::Result<()> { let val_u64 = val as u64; let num_bits = num_bits as usize; if self.mini_buffer_written + num_bits > 64 { @@ -58,8 +62,8 @@ impl BitPacker { #[derive(Clone)] pub struct BitUnpacker - where - Data: Deref, +where + Data: Deref, { num_bits: 
usize, mask: u64, @@ -67,16 +71,15 @@ pub struct BitUnpacker } impl BitUnpacker - where - Data: Deref, +where + Data: Deref, { pub fn new(data: Data, num_bits: u8) -> BitUnpacker { - let mask: u64 = - if num_bits == 64 { - !0u64 - } else { - (1u64 << num_bits) - 1u64 - }; + let mask: u64 = if num_bits == 64 { + !0u64 + } else { + (1u64 << num_bits) - 1u64 + }; BitUnpacker { num_bits: num_bits as usize, mask, @@ -102,8 +105,7 @@ impl BitUnpacker addr + 8 <= data.len(), "The fast field field should have been padded with 7 bytes." ); - let val_unshifted_unmasked: u64 = - unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & mask) } else { @@ -134,8 +136,7 @@ impl BitUnpacker for output_val in output.iter_mut() { let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; - let val_unshifted_unmasked: u64 = - unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; *output_val = val_shifted & mask; addr_in_bits += num_bits; @@ -148,7 +149,6 @@ impl BitUnpacker mod test { use super::{BitPacker, BitUnpacker}; - fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker>, Vec) { let mut data = Vec::new(); let mut bitpacker = BitPacker::new(); @@ -157,10 +157,10 @@ mod test { .map(|i| if max_val == 0 { 0 } else { i % max_val }) .collect(); for &val in &vals { - bitpacker.write(val, num_bits,&mut data).unwrap(); + bitpacker.write(val, num_bits, &mut data).unwrap(); } bitpacker.close(&mut data).unwrap(); - assert_eq!(data.len(), ((num_bits as usize)* len + 7) / 8 + 7); + assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7); let bitunpacker = BitUnpacker::new(data, num_bits); (bitunpacker, vals) } diff --git a/src/common/bitset.rs b/src/common/bitset.rs index fb01e961e..9381929d4 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -27,7 +27,6 @@ impl IntoIterator for TinySet { } impl TinySet { - /// Returns an empty `TinySet`. pub fn empty() -> TinySet { TinySet(0u64) @@ -38,7 +37,6 @@ impl TinySet { TinySet(!self.0) } - /// Returns true iff the `TinySet` contains the element `el`. pub fn contains(&self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() @@ -137,7 +135,6 @@ fn num_buckets(max_val: u32) -> u32 { } impl BitSet { - /// Create a new `BitSet` that may contain elements /// within `[0, max_val[`. pub fn with_max_value(max_value: u32) -> BitSet { @@ -146,7 +143,7 @@ impl BitSet { BitSet { tinysets: tinybisets, len: 0, - max_value + max_value, } } @@ -167,18 +164,16 @@ impl BitSet { // we do not check saturated els. let higher = el / 64u32; let lower = el % 64u32; - self.len += - if self.tinysets[higher as usize].insert_mut(lower) { - 1 - } else { - 0 - }; + self.len += if self.tinysets[higher as usize].insert_mut(lower) { + 1 + } else { + 0 + }; } /// Returns true iff the elements is in the `BitSet`. 
pub fn contains(&self, el: u32) -> bool { - self.tinyset(el / 64u32) - .contains(el % 64) + self.tinyset(el / 64u32).contains(el % 64) } /// Returns the first non-empty `TinySet` associated to a bucket lower @@ -206,7 +201,6 @@ impl BitSet { } } - #[cfg(test)] mod tests { @@ -229,9 +223,7 @@ mod tests { assert!(u.pop_lowest().is_none()) } { - let mut u = TinySet::empty() - .insert(1u32) - .insert(1u32); + let mut u = TinySet::empty().insert(1u32).insert(1u32); assert_eq!(u.pop_lowest(), Some(1u32)); assert!(u.pop_lowest().is_none()) } @@ -275,7 +267,6 @@ mod tests { test_against_hashset(&[62u32, 63u32], 64); } - #[test] fn test_bitset_large() { let arr = generate_nonunique_unsorted(1_000_000, 50_000); @@ -310,16 +301,27 @@ mod tests { #[test] fn test_tinyset_range() { - assert_eq!(TinySet::range_lower(3).into_iter().collect::>(), [0, 1, 2]); + assert_eq!( + TinySet::range_lower(3).into_iter().collect::>(), + [0, 1, 2] + ); assert!(TinySet::range_lower(0).is_empty()); assert_eq!( TinySet::range_lower(63).into_iter().collect::>(), (0u32..63u32).collect::>() ); - assert_eq!(TinySet::range_lower(1).into_iter().collect::>(), [0]); - assert_eq!(TinySet::range_lower(2).into_iter().collect::>(), [0, 1]); assert_eq!( - TinySet::range_greater_or_equal(3).into_iter().collect::>(), + TinySet::range_lower(1).into_iter().collect::>(), + [0] + ); + assert_eq!( + TinySet::range_lower(2).into_iter().collect::>(), + [0, 1] + ); + assert_eq!( + TinySet::range_greater_or_equal(3) + .into_iter() + .collect::>(), (3u32..64u32).collect::>() ); } @@ -350,47 +352,31 @@ mod tests { assert!(els.iter().all(|el| bitset.contains(*el))); bitset.clear(); for el in 0u32..1000u32 { - assert!(!bitset.contains(el)); + assert!(!bitset.contains(el)); } } #[bench] fn bench_tinyset_pop(b: &mut test::Bencher) { - b.iter(|| { - test::black_box(TinySet::singleton(31u32)) - .pop_lowest() - }); + b.iter(|| test::black_box(TinySet::singleton(31u32)).pop_lowest()); } #[bench] fn bench_tinyset_sum(b: &mut test::Bencher) { - let tiny_set = TinySet::empty() - .insert(10u32) - .insert(14u32) - .insert(21u32); + let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32); b.iter(|| { - assert_eq!( - test::black_box(tiny_set).into_iter().sum::(), - 45u32); + assert_eq!(test::black_box(tiny_set).into_iter().sum::(), 45u32); }); } #[bench] fn bench_tinyarr_sum(b: &mut test::Bencher) { - let v = [10u32, 14u32, 21u32] ; - b.iter(|| { - test::black_box(v) - .iter() - .cloned() - .sum::() - }); + let v = [10u32, 14u32, 21u32]; + b.iter(|| test::black_box(v).iter().cloned().sum::()); } #[bench] fn bench_bitset_initialize(b: &mut test::Bencher) { - b.iter(|| { - BitSet::with_max_value(1_000_000) - }); + b.iter(|| BitSet::with_max_value(1_000_000)); } } - diff --git a/src/common/mod.rs b/src/common/mod.rs index c103b468d..66e4bbfde 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -52,7 +52,6 @@ pub(crate) fn compute_num_bits(n: u64) -> u8 { } } - pub(crate) fn is_power_of_2(n: usize) -> bool { (n > 0) && (n & (n - 1) == 0) } @@ -128,7 +127,6 @@ pub(crate) mod test { } } - #[test] fn test_compute_num_bits() { assert_eq!(compute_num_bits(1), 1u8); @@ -141,4 +139,3 @@ pub(crate) mod test { assert_eq!(compute_num_bits(5_000_000_000), 33u8); } } - diff --git a/src/common/serialize.rs b/src/common/serialize.rs index 9012c0eb2..543b72b19 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -14,7 +14,6 @@ pub trait BinarySerializable: fmt::Debug + Sized { fn deserialize(reader: &mut R) -> io::Result; } - /// 
`FixedSize` marks a `BinarySerializable` as /// always serializing to the same size. pub trait FixedSize: BinarySerializable { @@ -103,7 +102,6 @@ impl FixedSize for i64 { const SIZE_IN_BYTES: usize = 8; } - impl BinarySerializable for u8 { fn serialize(&self, writer: &mut W) -> io::Result<()> { writer.write_u8(*self) @@ -134,21 +132,18 @@ impl BinarySerializable for String { } } - #[cfg(test)] pub mod test { use common::VInt; use super::*; - pub fn fixed_size_test() { let mut buffer = Vec::new(); O::default().serialize(&mut buffer).unwrap(); assert_eq!(buffer.len(), O::SIZE_IN_BYTES); } - fn serialize_test(v: T) -> usize { let mut buffer: Vec = Vec::new(); v.serialize(&mut buffer).unwrap(); @@ -186,7 +181,10 @@ pub mod test { fn test_serialize_string() { assert_eq!(serialize_test(String::from("")), 1); assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4); - assert_eq!(serialize_test(String::from("富士さん見える。")), 1 + 3 * 8); + assert_eq!( + serialize_test(String::from("富士さん見える。")), + 1 + 3 * 8 + ); } #[test] diff --git a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 420cd5dbe..8a083e145 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -25,7 +25,9 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz let mut bit_packer = BitPacker::new(); for val in vals { - bit_packer.write(*val as u64, num_bits,&mut counting_writer).unwrap(); + bit_packer + .write(*val as u64, num_bits, &mut counting_writer) + .unwrap(); } counting_writer.written_bytes() } @@ -63,7 +65,9 @@ impl BlockEncoder { counting_writer.write_all(&[num_bits]).unwrap(); let mut bit_packer = BitPacker::new(); for val in vals { - bit_packer.write(*val as u64, num_bits, &mut counting_writer).unwrap(); + bit_packer + .write(*val as u64, num_bits, &mut counting_writer) + .unwrap(); } for _ in vals.len()..COMPRESSION_BLOCK_SIZE { bit_packer diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 2db372630..2a900e9ed 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -25,9 +25,7 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize { } fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { - unsafe { - simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) - } + unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) } } fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize { diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index a7c11ea88..9382dd3f0 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -14,7 +14,8 @@ pub struct IndexMeta { pub segments: Vec, pub schema: Schema, pub opstamp: u64, - #[serde(skip_serializing_if = "Option::is_none")] pub payload: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub payload: Option, } impl IndexMeta { diff --git a/src/datastruct/skip/mod.rs b/src/datastruct/skip/mod.rs index 260393e72..7f99888d2 100644 --- a/src/datastruct/skip/mod.rs +++ b/src/datastruct/skip/mod.rs @@ -132,7 +132,7 @@ mod tests { fn test_skiplist9() { let mut output: Vec = Vec::new(); let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(4); - for i in 0..4*4*4 { + for i in 0..4 * 4 * 4 { skip_list_builder.insert(i, &i).unwrap(); } skip_list_builder.write::>(&mut 
output).unwrap(); @@ -145,7 +145,7 @@ mod tests { // checking that void gets serialized to nothing. let mut output: Vec = Vec::new(); let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); - for i in 0..((4*4*4) - 1) { + for i in 0..((4 * 4 * 4) - 1) { skip_list_builder.insert(i, &()).unwrap(); } skip_list_builder.write::>(&mut output).unwrap(); @@ -158,7 +158,7 @@ mod tests { // checking that void gets serialized to nothing. let mut output: Vec = Vec::new(); let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4); - for i in 0..(4*4) { + for i in 0..(4 * 4) { skip_list_builder.insert(i, &()).unwrap(); } skip_list_builder.write::>(&mut output).unwrap(); diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index 63aec23dd..0d8b7d416 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -1,9 +1,8 @@ use std::io::Write; -use common::{is_power_of_2, VInt, BinarySerializable}; +use common::{BinarySerializable, VInt, is_power_of_2}; use std::marker::PhantomData; use std::io; - struct LayerBuilder { period_mask: usize, buffer: Vec, diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 8fab68e95..43b55daf0 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -93,14 +93,15 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { write, bit_packer, min_value, - num_bits + num_bits, }) } /// Pushes a new value to the currently open u64 fast field. pub fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; - self.bit_packer.write(val_to_write, self.num_bits,&mut self.write)?; + self.bit_packer + .write(val_to_write, self.num_bits, &mut self.write)?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index ec38e0936..6cdefc7e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,9 +115,6 @@ //! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / //! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) - - - #[macro_use] extern crate lazy_static; @@ -286,7 +283,7 @@ mod tests { use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader}; use Postings; use rand::{Rng, SeedableRng, XorShiftRng}; - use rand::distributions::{Range, IndependentSample}; + use rand::distributions::{IndependentSample, Range}; fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { let seed: &[u32; 4] = &[1, 2, 3, seed_val]; @@ -306,7 +303,6 @@ mod tests { .collect::>() } - pub fn generate_array(n: usize, ratio: f32) -> Vec { generate_array_with_seed(n, ratio, 4) } diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index a6af45e8a..ab42d7253 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -29,7 +29,7 @@ impl FixedSize for TermInfo { /// of the block are bitpacked. /// /// See `TermInfoStore`. 
- const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2*u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES; + const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES; } impl BinarySerializable for TermInfo { diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index cb8ecde7b..084940a1e 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -50,14 +50,14 @@ impl DocSet for BitSetDocSet { return true; } if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) { - self.go_to_bucket(cursor_bucket); - let lower = self.cursor_tinybitset.pop_lowest().unwrap(); - self.doc = (cursor_bucket * 64u32) | lower; - true + self.go_to_bucket(cursor_bucket); + let lower = self.cursor_tinybitset.pop_lowest().unwrap(); + self.doc = (cursor_bucket * 64u32) | lower; + true } else { false } -} + } fn skip_next(&mut self, target: DocId) -> SkipResult { // skip is required to advance. @@ -232,14 +232,15 @@ mod tests { } } - #[bench] fn bench_bitset_1pct_insert(b: &mut test::Bencher) { use tests; let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); b.iter(|| { let mut bitset = BitSet::with_max_value(1_000_000); - for el in els.iter().cloned() { bitset.insert(el); } + for el in els.iter().cloned() { + bitset.insert(el); + } }); } @@ -248,8 +249,10 @@ mod tests { use tests; let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); let mut bitset = BitSet::with_max_value(1_000_000); - for el in els { bitset.insert(el); } - b.iter(|| { bitset.clone() }); + for el in els { + bitset.insert(el); + } + b.iter(|| bitset.clone()); } #[bench] @@ -258,11 +261,12 @@ mod tests { use DocSet; let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000); let mut bitset = BitSet::with_max_value(1_000_000); - for el in els { bitset.insert(el); } + for el in els { + bitset.insert(el); + } b.iter(|| { let mut docset = BitSetDocSet::from(bitset.clone()); while docset.advance() {} }); } } - diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 3b9e65994..162254e1d 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,18 +11,18 @@ use query::ConstScorer; use std::collections::Bound; use std::collections::range::RangeArgument; - -fn map_boundVec >(bound: Bound, transform: &Transform) -> Bound> { +fn map_bound Vec>( + bound: Bound, + transform: &Transform, +) -> Bound> { use self::Bound::*; match bound { Excluded(from_val) => Excluded(transform(from_val)), Included(from_val) => Included(transform(from_val)), - Unbounded => Unbounded + Unbounded => Unbounded, } } - - /// `RangeQuery` match all documents that have at least one term within a defined range. /// /// Matched document will all get a constant `Score` of one. @@ -88,40 +88,42 @@ pub struct RangeQuery { } impl RangeQuery { - /// Create a new `RangeQuery` over a `i64` field. - pub fn new_i64>(field: Field, range: TRangeArgument) -> RangeQuery { - let make_term_val = |val: &i64| { - Term::from_field_i64(field, *val).value_bytes().to_owned() - }; + pub fn new_i64>( + field: Field, + range: TRangeArgument, + ) -> RangeQuery { + let make_term_val = |val: &i64| Term::from_field_i64(field, *val).value_bytes().to_owned(); RangeQuery { field, left_bound: map_bound(range.start(), &make_term_val), - right_bound: map_bound(range.end(), &make_term_val) + right_bound: map_bound(range.end(), &make_term_val), } } /// Create a new `RangeQuery` over a `u64` field. 
- pub fn new_u64>(field: Field, range: TRangeArgument) -> RangeQuery { - let make_term_val = |val: &u64| { - Term::from_field_u64(field, *val).value_bytes().to_owned() - }; + pub fn new_u64>( + field: Field, + range: TRangeArgument, + ) -> RangeQuery { + let make_term_val = |val: &u64| Term::from_field_u64(field, *val).value_bytes().to_owned(); RangeQuery { field, left_bound: map_bound(range.start(), &make_term_val), - right_bound: map_bound(range.end(), &make_term_val) + right_bound: map_bound(range.end(), &make_term_val), } } /// Create a new `RangeQuery` over a `Str` field. - pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>(field: Field, range: TRangeArgument) -> RangeQuery { - let make_term_val = |val: &&str| { - val.as_bytes().to_vec() - }; + pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>( + field: Field, + range: TRangeArgument, + ) -> RangeQuery { + let make_term_val = |val: &&str| val.as_bytes().to_vec(); RangeQuery { field, left_bound: map_bound(range.start(), &make_term_val), - right_bound: map_bound(range.end(), &make_term_val) + right_bound: map_bound(range.end(), &make_term_val), } } } @@ -135,7 +137,7 @@ impl Query for RangeQuery { Ok(box RangeWeight { field: self.field, left_bound: self.left_bound.clone(), - right_bound: self.right_bound.clone() + right_bound: self.right_bound.clone(), }) } } @@ -148,8 +150,8 @@ pub struct RangeWeight { impl RangeWeight { fn term_range<'a, T>(&self, term_dict: &'a T) -> T::Streamer - where - T: TermDictionary<'a> + 'a, + where + T: TermDictionary<'a> + 'a, { use std::collections::Bound::*; let mut term_stream_builder = term_dict.range(); @@ -203,10 +205,9 @@ mod tests { #[test] fn test_range_query_simple() { - fn run() -> Result<()> { let mut schema_builder = SchemaBuilder::new(); - let year_field= schema_builder.add_u64_field("year", INT_INDEXED); + let year_field = schema_builder.add_u64_field("year", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -233,7 +234,6 @@ mod tests { } run().unwrap(); - } #[test] @@ -271,22 +271,22 @@ mod tests { count_collector.count() }; + assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 10..11)), 9); assert_eq!( - count_multiples(RangeQuery::new_i64(int_field, 10..11)), - 9 - ); - assert_eq!( - count_multiples(RangeQuery::new_i64(int_field, (Bound::Included(10), Bound::Included(11)) )), + count_multiples(RangeQuery::new_i64( + int_field, + (Bound::Included(10), Bound::Included(11)) + )), 18 ); assert_eq!( - count_multiples(RangeQuery::new_i64(int_field, (Bound::Excluded(9), Bound::Included(10)))), + count_multiples(RangeQuery::new_i64( + int_field, + (Bound::Excluded(9), Bound::Included(10)) + )), 9 ); - assert_eq!( - count_multiples(RangeQuery::new_i64(int_field, 9..)), - 91 - ); + assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 9..)), 91); } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 2cbeb001d..619e580aa 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -62,7 +62,6 @@ impl Scorer for EmptyScorer { } } - /// Wraps a `DocSet` and simply returns a constant `Scorer`. /// The `ConstScorer` is useful if you have a `DocSet` where /// you needed a scorer. @@ -75,7 +74,6 @@ pub struct ConstScorer { } impl ConstScorer { - /// Creates a new `ConstScorer`. 
pub fn new(docset: TDocSet) -> ConstScorer { ConstScorer { diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index cd1fd8a22..b4a69cf67 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -16,7 +16,8 @@ pub enum Cardinality { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntOptions { indexed: bool, - #[serde(skip_serializing_if = "Option::is_none")] fast: Option, + #[serde(skip_serializing_if = "Option::is_none")] + fast: Option, stored: bool, } diff --git a/src/termdict/fstdict/term_info_store.rs b/src/termdict/fstdict/term_info_store.rs index 407b68b00..63d8d4957 100644 --- a/src/termdict/fstdict/term_info_store.rs +++ b/src/termdict/fstdict/term_info_store.rs @@ -10,10 +10,8 @@ use directory::ReadOnlySource; use termdict::TermOrdinal; use byteorder::ByteOrder; - const BLOCK_LEN: usize = 256; - #[derive(Debug, Eq, PartialEq, Default)] struct TermInfoBlockMeta { offset: u64, @@ -27,9 +25,11 @@ impl BinarySerializable for TermInfoBlockMeta { fn serialize(&self, write: &mut W) -> io::Result<()> { self.offset.serialize(write)?; self.ref_term_info.serialize(write)?; - write.write_all(&[self.doc_freq_nbits, - self.postings_offset_nbits, - self.positions_offset_nbits])?; + write.write_all(&[ + self.doc_freq_nbits, + self.postings_offset_nbits, + self.positions_offset_nbits, + ])?; Ok(()) } @@ -43,17 +43,17 @@ impl BinarySerializable for TermInfoBlockMeta { ref_term_info, doc_freq_nbits: buffer[0], postings_offset_nbits: buffer[1], - positions_offset_nbits: buffer[2] + positions_offset_nbits: buffer[2], }) } } impl FixedSize for TermInfoBlockMeta { - const SIZE_IN_BYTES: usize = u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES; + const SIZE_IN_BYTES: usize = + u64::SIZE_IN_BYTES + TermInfo::SIZE_IN_BYTES + 3 * u8::SIZE_IN_BYTES; } impl TermInfoBlockMeta { - fn num_bits(&self) -> u8 { self.doc_freq_nbits + self.postings_offset_nbits + self.positions_offset_nbits + 7 } @@ -82,11 +82,10 @@ impl TermInfoBlockMeta { } } - pub struct TermInfoStore { num_terms: usize, block_meta_source: ReadOnlySource, - term_info_source: ReadOnlySource + term_info_source: ReadOnlySource, } fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { @@ -109,7 +108,7 @@ impl TermInfoStore { TermInfoStore { num_terms, block_meta_source, - term_info_source + term_info_source, } } @@ -117,13 +116,17 @@ impl TermInfoStore { let block_id = (term_ord as usize) / BLOCK_LEN; let buffer = self.block_meta_source.as_slice(); let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..]; - let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data).expect("Failed to deserialize terminfoblockmeta"); + let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data) + .expect("Failed to deserialize terminfoblockmeta"); let inner_offset = (term_ord as usize) % BLOCK_LEN; if inner_offset == 0 { term_info_block_data.ref_term_info } else { let term_info_data = self.term_info_source.as_slice(); - term_info_block_data.deserialize_term_info(&term_info_data[term_info_block_data.offset as usize..], inner_offset - 1) + term_info_block_data.deserialize_term_info( + &term_info_data[term_info_block_data.offset as usize..], + inner_offset - 1, + ) } } @@ -140,13 +143,26 @@ pub struct TermInfoStoreWriter { } fn bitpack_serialize( - write: &mut W, - bit_packer: &mut BitPacker, - term_info_block_meta: &TermInfoBlockMeta, - term_info: &TermInfo) -> io::Result<()> { - bit_packer.write(term_info.doc_freq as u64, 
-    bit_packer.write(term_info.postings_offset, term_info_block_meta.postings_offset_nbits, write)?;
-    bit_packer.write(term_info.positions_offset, term_info_block_meta.positions_offset_nbits, write)?;
+    write: &mut W,
+    bit_packer: &mut BitPacker,
+    term_info_block_meta: &TermInfoBlockMeta,
+    term_info: &TermInfo,
+) -> io::Result<()> {
+    bit_packer.write(
+        term_info.doc_freq as u64,
+        term_info_block_meta.doc_freq_nbits,
+        write,
+    )?;
+    bit_packer.write(
+        term_info.postings_offset,
+        term_info_block_meta.postings_offset_nbits,
+        write,
+    )?;
+    bit_packer.write(
+        term_info.positions_offset,
+        term_info_block_meta.positions_offset_nbits,
+        write,
+    )?;
     bit_packer.write(term_info.positions_inner_offset as u64, 7, write)?;
     Ok(())
 }
@@ -157,7 +173,7 @@ impl TermInfoStoreWriter {
             buffer_block_metas: Vec::new(),
             buffer_term_infos: Vec::new(),
             term_infos: Vec::with_capacity(BLOCK_LEN),
-            num_terms: 0u64
+            num_terms: 0u64,
         }
     }
 
@@ -199,7 +215,7 @@ impl TermInfoStoreWriter {
                 &mut self.buffer_term_infos,
                 &mut bit_packer,
                 &term_info_block_meta,
-                &term_info
+                &term_info,
             )?;
         }
 
@@ -276,11 +292,11 @@ mod tests {
                 doc_freq: 512,
                 postings_offset: 51,
                 positions_offset: 3584,
-                positions_inner_offset: 0
+                positions_inner_offset: 0,
             },
             doc_freq_nbits: 10,
             postings_offset_nbits: 5,
-            positions_offset_nbits: 11
+            positions_offset_nbits: 11,
         };
         let mut buffer: Vec<u8> = Vec::new();
         term_info_block_meta.serialize(&mut buffer).unwrap();
@@ -292,7 +308,7 @@
     #[test]
    fn test_pack() {
         let mut store_writer = TermInfoStoreWriter::new();
-        let mut term_infos = vec!();
+        let mut term_infos = vec![];
         for i in 0..1000 {
             let term_info = TermInfo {
                 doc_freq: i as u32,
@@ -304,9 +320,7 @@
             term_infos.push(term_info);
         }
         let mut buffer = Vec::new();
-        store_writer
-            .serialize(&mut buffer)
-            .unwrap();
+        store_writer.serialize(&mut buffer).unwrap();
         let term_info_store = TermInfoStore::open(ReadOnlySource::from(buffer));
         for i in 0..1000 {
             assert_eq!(term_info_store.get(i as u64), term_infos[i]);
         }
     }
-
-
diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs
index f2d1dfaa6..4a4d1be9a 100644
--- a/src/termdict/fstdict/termdict.rs
+++ b/src/termdict/fstdict/termdict.rs
@@ -7,7 +7,7 @@ use common::CountingWriter;
 use schema::FieldType;
 use postings::TermInfo;
 use termdict::{TermDictionary, TermDictionaryBuilder, TermOrdinal};
-use super::{TermStreamerBuilderImpl, TermStreamerImpl, TermInfoStoreWriter, TermInfoStore};
+use super::{TermInfoStore, TermInfoStoreWriter, TermStreamerBuilderImpl, TermStreamerImpl};
 
 fn convert_fst_error(e: fst::Error) -> io::Error {
     io::Error::new(io::ErrorKind::Other, e)
diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs
index 982c35f7b..95b0d3711 100644
--- a/src/tokenizer/facet_tokenizer.rs
+++ b/src/tokenizer/facet_tokenizer.rs
@@ -87,6 +87,7 @@ mod tests {
     use tokenizer::{Token, TokenStream, Tokenizer};
     use super::FacetTokenizer;
     use schema::Facet;
+    use std::str;
 
     #[test]
     fn test_facet_tokenizer() {
@@ -98,9 +99,7 @@
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe {
-                    ::std::str::from_utf8_unchecked(facet.encoded_bytes())
-                })
+                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
@@ -120,9 +119,7 @@
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe {
-                    ::std::str::from_utf8_unchecked(facet.encoded_bytes())
-                })
+                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);