diff --git a/Cargo.toml b/Cargo.toml
index dd699e1b1..f830a57c9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,7 +45,7 @@ rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
 bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
 census = "0.4.0"
-fnv = "1.0.7"
+rustc-hash = "1.1.0"
 thiserror = "1.0.30"
 htmlescape = "0.3.1"
 fail = "0.5.0"
diff --git a/src/aggregation/agg_result.rs b/src/aggregation/agg_result.rs
index 11c9f03e7..93e2c9302 100644
--- a/src/aggregation/agg_result.rs
+++ b/src/aggregation/agg_result.rs
@@ -6,7 +6,7 @@
 
 use std::collections::HashMap;
 
-use fnv::FnvHashMap;
+use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 
 use super::agg_req::BucketAggregationInternal;
@@ -145,7 +145,7 @@ pub enum BucketEntries {
     /// Vector format bucket entries
     Vec(Vec),
     /// HashMap format bucket entries
-    HashMap(FnvHashMap),
+    HashMap(FxHashMap),
 }
 
 /// This is the default entry for a bucket, which contains a key, count, and optionally
diff --git a/src/aggregation/bucket/range.rs b/src/aggregation/bucket/range.rs
index 33645cb8f..333727536 100644
--- a/src/aggregation/bucket/range.rs
+++ b/src/aggregation/bucket/range.rs
@@ -1,7 +1,7 @@
 use std::fmt::Debug;
 use std::ops::Range;
 
-use fnv::FnvHashMap;
+use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 
 use crate::aggregation::agg_req_with_accessor::{
@@ -176,7 +176,7 @@ impl SegmentRangeCollector {
     ) -> crate::Result {
         let field_type = self.field_type;
 
-        let buckets: FnvHashMap = self
+        let buckets: FxHashMap = self
             .buckets
             .into_iter()
             .map(move |range_bucket| {
diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs
index 10469f9c5..eca2d411f 100644
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -1,7 +1,7 @@
 use std::fmt::Debug;
 
-use fnv::FnvHashMap;
 use itertools::Itertools;
+use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 
 use super::{CustomOrder, Order, OrderTarget};
@@ -199,7 +199,7 @@ impl TermsAggregationInternal {
 #[derive(Clone, Debug, PartialEq)]
 /// Container to store term_ids and their buckets.
 struct TermBuckets {
-    pub(crate) entries: FnvHashMap,
+    pub(crate) entries: FxHashMap,
     blueprint: Option,
 }
 
@@ -397,7 +397,7 @@ impl SegmentTermCollector {
             .expect("internal error: inverted index not loaded for term aggregation");
         let term_dict = inverted_index.terms();
 
-        let mut dict: FnvHashMap = Default::default();
+        let mut dict: FxHashMap = Default::default();
         let mut buffer = vec![];
         for (term_id, entry) in entries {
             term_dict
@@ -1129,9 +1129,9 @@ mod tests {
 
         assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
-        assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
+        assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
         assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
         assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
         assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
diff --git a/src/aggregation/intermediate_agg_result.rs b/src/aggregation/intermediate_agg_result.rs
index 4baeec902..8462d6c9e 100644
--- a/src/aggregation/intermediate_agg_result.rs
+++ b/src/aggregation/intermediate_agg_result.rs
@@ -5,8 +5,8 @@
 use std::cmp::Ordering;
 use std::collections::HashMap;
 
-use fnv::FnvHashMap;
 use itertools::Itertools;
+use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 
 use super::agg_req::{
@@ -288,7 +288,7 @@ impl IntermediateBucketResult {
             .keyed;
         let buckets = if is_keyed {
             let mut bucket_map =
-                FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
+                FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
             for bucket in buckets {
                 bucket_map.insert(bucket.key.to_string(), bucket);
             }
@@ -308,7 +308,7 @@ impl IntermediateBucketResult {
 
         let buckets = if req.as_histogram().unwrap().keyed {
             let mut bucket_map =
-                FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
+                FxHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
             for bucket in buckets {
                 bucket_map.insert(bucket.key.to_string(), bucket);
             }
@@ -396,13 +396,13 @@ impl IntermediateBucketResult {
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 /// Range aggregation including error counts
 pub struct IntermediateRangeBucketResult {
-    pub(crate) buckets: FnvHashMap,
+    pub(crate) buckets: FxHashMap,
 }
 
 #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
 /// Term aggregation including error counts
 pub struct IntermediateTermBucketResult {
-    pub(crate) entries: FnvHashMap,
+    pub(crate) entries: FxHashMap,
     pub(crate) sum_other_doc_count: u64,
     pub(crate) doc_count_error_upper_bound: u64,
 }
@@ -499,8 +499,8 @@ trait MergeFruits {
 }
 
 fn merge_maps(
-    entries_left: &mut FnvHashMap,
-    mut entries_right: FnvHashMap,
+    entries_left: &mut FxHashMap,
+    mut entries_right: FxHashMap,
 ) {
     for (name, entry_left) in entries_left.iter_mut() {
         if let Some(entry_right) = entries_right.remove(name) {
@@ -626,7 +626,7 @@ mod tests {
 
     fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
         let mut map = HashMap::new();
-        let mut buckets = FnvHashMap::default();
+        let mut buckets = FxHashMap::default();
         for (key, doc_count) in data {
             buckets.insert(
                 key.to_string(),
@@ -653,7 +653,7 @@ mod tests {
         data: &[(String, u64, String, u64)],
     ) -> IntermediateAggregationResults {
         let mut map = HashMap::new();
-        let mut buckets: FnvHashMap<_, _> = Default::default();
+        let mut buckets: FxHashMap<_, _> = Default::default();
         for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
             buckets.insert(
                 key.to_string(),
diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs
index fa248b5e1..213b7d8bf 100644
--- a/src/fastfield/multivalued/writer.rs
+++ b/src/fastfield/multivalued/writer.rs
@@ -3,7 +3,7 @@ use std::io;
 use fastfield_codecs::{
     Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
 };
-use fnv::FnvHashMap;
+use rustc_hash::FxHashMap;
 
 use super::get_fastfield_codecs_for_multivalue;
 use crate::fastfield::writer::unexpected_value;
@@ -144,7 +144,7 @@ impl MultiValuedFastFieldWriter {
     pub fn serialize(
         mut self,
         serializer: &mut CompositeFastFieldSerializer,
-        term_mapping_opt: Option<&FnvHashMap>,
+        term_mapping_opt: Option<&FxHashMap>,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
         {
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index 82dccf09a..45f56c8db 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -3,7 +3,7 @@ use std::io;
 
 use common;
 use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
-use fnv::FnvHashMap;
+use rustc_hash::FxHashMap;
 use tantivy_bitpacker::BlockedBitpacker;
 
 use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
@@ -256,7 +256,7 @@ impl FastFieldsWriter {
     pub fn serialize(
         self,
         serializer: &mut CompositeFastFieldSerializer,
-        mapping: &HashMap>,
+        mapping: &HashMap>,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
         for field_writer in self.term_id_writers {
diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs
index 68e0955c2..770ac15cd 100644
--- a/src/indexer/json_term_writer.rs
+++ b/src/indexer/json_term_writer.rs
@@ -1,6 +1,6 @@
 use fastfield_codecs::MonotonicallyMappableToU64;
-use fnv::FnvHashMap;
 use murmurhash32::murmurhash2;
+use rustc_hash::FxHashMap;
 
 use crate::fastfield::FastValue;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
@@ -52,7 +52,7 @@ use crate::{DatePrecision, DateTime, DocId, Term};
 /// path map to the same index position as long as the probability is relatively low.
 #[derive(Default)]
 struct IndexingPositionsPerPath {
-    positions_per_path: FnvHashMap,
+    positions_per_path: FxHashMap,
 }
 
 impl IndexingPositionsPerPath {
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index a98d2c875..656a05bc3 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -3,7 +3,7 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::Range;
 
-use fnv::FnvHashMap;
+use rustc_hash::FxHashMap;
 
 use super::stacker::Addr;
 use crate::fastfield::MultiValuedFastFieldWriter;
@@ -56,12 +56,12 @@ pub(crate) fn serialize_postings(
     doc_id_map: Option<&DocIdMapping>,
     schema: &Schema,
     serializer: &mut InvertedIndexSerializer,
-) -> crate::Result>> {
+) -> crate::Result>> {
     let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
         Vec::with_capacity(ctx.term_index.len());
     term_offsets.extend(ctx.term_index.iter());
     term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
 
-    let mut unordered_term_mappings: HashMap> =
+    let mut unordered_term_mappings: HashMap> =
         HashMap::new();
 
     let field_offsets = make_field_partition(&term_offsets);
@@ -74,7 +74,7 @@ pub(crate) fn serialize_postings(
         let unordered_term_ids = term_offsets[byte_offsets.clone()]
             .iter()
             .map(|&(_, _, bucket)| bucket);
-        let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
+        let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
             .enumerate()
             .map(|(term_ord, unord_term_id)| {
                 (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs
index cf0a3b638..c228e5f8b 100644
--- a/src/tokenizer/stop_word_filter.rs
+++ b/src/tokenizer/stop_word_filter.rs
@@ -10,28 +10,21 @@
 //! assert_eq!(stream.next().unwrap().text, "crafty");
 //! assert!(stream.next().is_none());
 //! ```
-use std::collections::HashSet;
-use std::hash::BuildHasherDefault;
-
-use fnv::FnvHasher;
+use rustc_hash::FxHashSet;
 
 use super::{Token, TokenFilter, TokenStream};
 use crate::tokenizer::BoxTokenStream;
 
-// configure our hashers for SPEED
-type StopWordHasher = BuildHasherDefault<FnvHasher>;
-type StopWordHashSet = HashSet<String, StopWordHasher>;
-
 /// `TokenFilter` that removes stop words from a token stream
 #[derive(Clone)]
 pub struct StopWordFilter {
-    words: StopWordHashSet,
+    words: FxHashSet<String>,
 }
 
 impl StopWordFilter {
     /// Creates a `StopWordFilter` given a list of words to remove
     pub fn remove(words: Vec<String>) -> StopWordFilter {
-        let mut set = StopWordHashSet::default();
+        let mut set = FxHashSet::default();
 
         for word in words {
             set.insert(word);
@@ -52,7 +45,7 @@ impl StopWordFilter {
 }
 
 pub struct StopWordFilterStream<'a> {
-    words: StopWordHashSet,
+    words: FxHashSet<String>,
     tail: BoxTokenStream<'a>,
 }
 
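Not part of the patch, for context only: a minimal standalone sketch of the rustc_hash API surface this diff relies on. FxHashMap/FxHashSet are the std collections parameterized with BuildHasherDefault<FxHasher>, so the constructors used above (FxHashMap::default(), FxHashMap::with_capacity_and_hasher(..), FxHashSet::default()) are the plain std ones. Variable names below are illustrative, assuming rustc-hash 1.1.

use rustc_hash::{FxHashMap, FxHashSet};

fn main() {
    // FxHashMap<K, V> is HashMap<K, V, BuildHasherDefault<FxHasher>>, so it is
    // constructed via Default rather than HashMap::new() (which needs RandomState).
    let mut counts: FxHashMap<String, u64> = FxHashMap::default();
    counts.insert("terma".to_string(), 4);

    // Pre-sizing mirrors the with_capacity_and_hasher call sites in the diff:
    // a capacity plus an explicitly defaulted hasher.
    let mut keyed: FxHashMap<String, u64> =
        FxHashMap::with_capacity_and_hasher(counts.len(), Default::default());
    keyed.extend(counts.drain());

    // FxHashSet<T> plays the role of the FnvHasher-backed HashSet that
    // stop_word_filter.rs previously assembled by hand.
    let mut stop_words: FxHashSet<String> = FxHashSet::default();
    stop_words.insert("the".to_string());

    assert_eq!(keyed.get("terma"), Some(&4));
    assert!(stop_words.contains("the"));
}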