use std::hash::BuildHasherDefault; use std::marker::PhantomData; use std::num::NonZeroUsize; use std::ops::Index; use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; } pub struct StringInterner { inner: ThreadedRodeo>, _id: PhantomData, } #[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] pub struct InternedString { inner: Spur, _id: PhantomData, } impl std::fmt::Display for InternedString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.as_str().fmt(f) } } impl InternedString { pub(crate) fn as_str(&self) -> &'static str { Id::get_interner().inner.resolve(&self.inner) } pub(crate) fn get(s: &str) -> Option { Id::get_interner().get(s) } } impl AsRef for InternedString { fn as_ref(&self) -> &str { self.as_str() } } impl std::ops::Deref for InternedString { type Target = str; fn deref(&self) -> &str { self.as_str() } } impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); impl serde::de::Visitor<'_> for Visitor { type Value = InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { formatter.write_str("a string") } fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { Ok(Id::get_interner().get_or_intern(v)) } } d.deserialize_str(Visitor::(PhantomData)) } } impl serde::Serialize for InternedString { fn serialize(&self, s: S) -> Result { self.as_str().serialize(s) } } impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), ), _id: PhantomData, } } #[cfg(test)] fn len(&self) -> usize { self.inner.len() } #[cfg(test)] fn current_memory_usage(&self) -> usize { self.inner.current_memory_usage() } pub(crate) fn get_or_intern(&self, s: &str) -> InternedString { InternedString { inner: self.inner.get_or_intern(s), _id: PhantomData, } } pub(crate) fn get(&self, s: &str) -> Option> { Some(InternedString { inner: self.inner.get(s)?, _id: PhantomData, }) } } impl Index> for StringInterner { type Output = str; fn index(&self, index: InternedString) -> &Self::Output { self.inner.resolve(&index.inner) } } impl Default for StringInterner { fn default() -> Self { Self::new() } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct EndpointIdTag; impl InternId for EndpointIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type EndpointIdInt = InternedString; impl From<&EndpointId> for EndpointIdInt { fn from(value: &EndpointId) -> Self { EndpointIdTag::get_interner().get_or_intern(value) } } impl From for EndpointIdInt { fn from(value: EndpointId) -> Self { EndpointIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; impl InternId for BranchIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type BranchIdInt = InternedString; impl From<&BranchId> for BranchIdInt { fn from(value: &BranchId) -> Self { BranchIdTag::get_interner().get_or_intern(value) } } impl From for BranchIdInt { fn from(value: BranchId) -> Self { BranchIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; impl InternId for ProjectIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type ProjectIdInt = InternedString; impl From<&ProjectId> for ProjectIdInt { fn from(value: &ProjectId) -> Self { ProjectIdTag::get_interner().get_or_intern(value) } } impl From for ProjectIdInt { fn from(value: ProjectId) -> Self { ProjectIdTag::get_interner().get_or_intern(&value) } } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct AccountIdTag; impl InternId for AccountIdTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } pub type AccountIdInt = InternedString; impl From<&AccountId> for AccountIdInt { fn from(value: &AccountId) -> Self { AccountIdTag::get_interner().get_or_intern(value) } } impl From for AccountIdInt { fn from(value: AccountId) -> Self { AccountIdTag::get_interner().get_or_intern(&value) } } #[cfg(test)] mod tests { use std::sync::OnceLock; use super::InternId; use crate::intern::StringInterner; struct MyId; impl InternId for MyId { fn get_interner() -> &'static StringInterner { pub(crate) static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } #[test] fn push_many_strings() { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000.0, 0.8).unwrap(); let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); let interner = MyId::get_interner(); const N: usize = 100_000; let mut verify = Vec::with_capacity(N); for endpoint in endpoints.take(N) { let endpoint = format!("ep-string-interning-{endpoint}"); let key = interner.get_or_intern(&endpoint); verify.push((endpoint, key)); } for (s, key) in verify { assert_eq!(interner[key], s); } // 2031616/59861 = 34 bytes per string assert_eq!(interner.len(), 59_861); // will have other overhead for the internal hashmaps that are not accounted for. assert_eq!(interner.current_memory_usage(), 2_031_616); } }