use std::borrow::{Borrow, Cow}; use std::fmt::{self, Debug, Display, Formatter}; use std::io::{self, Read, Write}; use std::str; use std::string::FromUtf8Error; use common::BinarySerializable; use once_cell::sync::Lazy; use regex::Regex; use serde::de::Error as _; use serde::{Deserialize, Deserializer, Serialize, Serializer}; const SLASH_BYTE: u8 = b'/'; const ESCAPE_BYTE: u8 = b'\\'; /// BYTE used as a level separation in the binary /// representation of facets. pub const FACET_SEP_BYTE: u8 = 0u8; /// `char` used as a level separation in the binary /// representation of facets. (It is the null codepoint.) pub const FACET_SEP_CHAR: char = '\u{0}'; /// An error enum for facet parser. #[derive(Debug, PartialEq, Eq, Error)] pub enum FacetParseError { /// The facet text representation is unparsable. #[error("Failed to parse the facet string: '{0}'")] FacetParseError(String), } /// A Facet represent a point in a given hierarchy. /// /// They are typically represented similarly to a filepath. /// For instance, an e-commerce website could /// have a `Facet` for `/electronics/tv_and_video/led_tv`. /// /// A document can be associated with any number of facets. /// The hierarchy implicitly imply that a document /// belonging to a facet also belongs to the ancestor of /// its facet. In the example above, `/electronics/tv_and_video/` /// and `/electronics`. #[derive(Clone, Default, Eq, Hash, PartialEq, Ord, PartialOrd)] pub struct Facet(pub(crate) String); impl Facet { /// Returns a new instance of the "root facet" /// Equivalent to `/`. pub fn root() -> Facet { Facet("".to_string()) } /// Returns true if the facet is the root facet `/`. pub fn is_root(&self) -> bool { self.encoded_str().is_empty() } /// Returns a binary representation of the facet. /// /// In this representation, `0u8` is used as a separator /// and the string parts of the facet are unescaped. /// (The first `/` is not encoded at all). /// /// This representation has the benefit of making it possible to /// express "being a child of a given facet" as a range over /// the term ordinals. pub fn encoded_str(&self) -> &str { &self.0 } pub(crate) fn from_encoded_string(facet_string: String) -> Facet { Facet(facet_string) } /// Creates a `Facet` from its binary representation. pub fn from_encoded(encoded_bytes: Vec) -> Result { // facet bytes validation. `0u8` is used a separator but that is still legal utf-8 String::from_utf8(encoded_bytes).map(Facet) } /// Parse a text representation of a facet. /// /// If one of the segments of this path /// contains a `/`, it should be escaped /// using an anti-slash `\`. pub fn from_text(path: &T) -> Result where T: ?Sized + AsRef { #[derive(Copy, Clone)] enum State { Escaped, Idle, } let path_ref = path.as_ref(); if path_ref.is_empty() { return Err(FacetParseError::FacetParseError(path_ref.to_string())); } if !path_ref.starts_with('/') { return Err(FacetParseError::FacetParseError(path_ref.to_string())); } let mut facet_encoded = String::new(); let mut state = State::Idle; let path_bytes = path_ref.as_bytes(); let mut last_offset = 1; for i in 1..path_bytes.len() { let c = path_bytes[i]; match (state, c) { (State::Idle, ESCAPE_BYTE) => { facet_encoded.push_str(&path_ref[last_offset..i]); last_offset = i + 1; state = State::Escaped } (State::Idle, SLASH_BYTE) => { facet_encoded.push_str(&path_ref[last_offset..i]); facet_encoded.push(FACET_SEP_CHAR); last_offset = i + 1; } (State::Escaped, _escaped_char) => { state = State::Idle; } (State::Idle, _any_char) => {} } } facet_encoded.push_str(&path_ref[last_offset..]); Ok(Facet(facet_encoded)) } /// Returns a `Facet` from an iterator over the different /// steps of the facet path. /// /// The steps are expected to be unescaped. pub fn from_path(path: Path) -> Facet where Path: IntoIterator, Path::Item: AsRef, { let mut facet_string: String = String::with_capacity(100); let mut step_it = path.into_iter(); if let Some(step) = step_it.next() { facet_string.push_str(step.as_ref()); } for step in step_it { facet_string.push(FACET_SEP_CHAR); facet_string.push_str(step.as_ref()); } Facet(facet_string) } /// Returns `true` if other is a `strict` subfacet of `self`. /// /// Disclaimer: By strict we mean that the relation is not reflexive. /// `/happy` is not a prefix of `/happy`. pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_str = self.encoded_str(); let other_str = other.encoded_str(); // Fast path, but also required to ensure that / is not a prefix of /. if other_str.len() <= self_str.len() { return false; } // Root is a prefix of every other path. // This is not just an optimisation. It is necessary for correctness. if self.is_root() { return true; } other_str.starts_with(self_str) && other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE } /// Extract path from the `Facet`. pub fn to_path(&self) -> Vec<&str> { self.encoded_str().split(|c| c == FACET_SEP_CHAR).collect() } /// This function is the inverse of Facet::from(&str). pub fn to_path_string(&self) -> String { format!("{self}") } } impl Borrow for Facet { fn borrow(&self) -> &str { self.encoded_str() } } impl<'a, T: ?Sized + AsRef> From<&'a T> for Facet { fn from(path_asref: &'a T) -> Facet { Facet::from_text(path_asref).unwrap() } } impl BinarySerializable for Facet { fn serialize(&self, writer: &mut W) -> io::Result<()> { ::serialize(&self.0, writer) } fn deserialize(reader: &mut R) -> io::Result { Ok(Facet(::deserialize(reader)?)) } } impl Display for Facet { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { for step in self.0.split(FACET_SEP_CHAR) { write!(f, "/")?; write!(f, "{}", escape_slashes(step))?; } Ok(()) } } fn escape_slashes(s: &str) -> Cow<'_, str> { static SLASH_PTN: Lazy = Lazy::new(|| Regex::new(r"[\\/]").unwrap()); SLASH_PTN.replace_all(s, "\\/") } impl Serialize for Facet { fn serialize(&self, serializer: S) -> Result where S: Serializer { serializer.serialize_str(&self.to_string()) } } impl<'de> Deserialize<'de> for Facet { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de> { as Deserialize<'de>>::deserialize(deserializer).and_then(|path| { Facet::from_text(&*path).map_err(|err| D::Error::custom(err.to_string())) }) } } impl Debug for Facet { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "Facet({self})")?; Ok(()) } } #[cfg(test)] mod tests { use super::{Facet, FacetParseError}; #[test] fn test_root() { assert_eq!(Facet::root(), Facet::from("/")); assert_eq!(format!("{}", Facet::root()), "/"); assert!(Facet::root().is_root()); assert_eq!(Facet::root().encoded_str(), ""); } #[test] fn test_from_path() { assert_eq!( Facet::from_path(vec!["top", "a", "firstdoc"]), Facet::from("/top/a/firstdoc") ); } #[test] fn test_facet_display() { { let v = ["first", "second", "third"]; let facet = Facet::from_path(v.iter()); assert_eq!(format!("{facet}"), "/first/second/third"); } { let v = ["first", "sec/ond", "third"]; let facet = Facet::from_path(v.iter()); assert_eq!(format!("{facet}"), "/first/sec\\/ond/third"); } } #[test] fn test_facet_debug() { let v = ["first", "second", "third"]; let facet = Facet::from_path(v.iter()); assert_eq!(format!("{facet:?}"), "Facet(/first/second/third)"); } #[test] fn test_to_path() { let v = ["first", "second", "third\\/not_fourth"]; let facet = Facet::from_path(v.iter()); assert_eq!(facet.to_path(), v); } #[test] fn test_to_path_string() { let v = ["first", "second", "third/not_fourth"]; let facet = Facet::from_path(v.iter()); assert_eq!( facet.to_path_string(), String::from("/first/second/third\\/not_fourth") ); } #[test] fn test_to_path_string_empty() { let v: Vec<&str> = vec![]; let facet = Facet::from_path(v.iter()); assert_eq!(facet.to_path_string(), "/"); } #[test] fn test_from_text() { assert_eq!( Err(FacetParseError::FacetParseError("INVALID".to_string())), Facet::from_text("INVALID") ); } #[test] fn only_proper_prefixes() { assert!(Facet::from("/foo").is_prefix_of(&Facet::from("/foo/bar"))); assert!(!Facet::from("/foo/bar").is_prefix_of(&Facet::from("/foo/bar"))); } #[test] fn root_is_a_prefix() { assert!(Facet::from("/").is_prefix_of(&Facet::from("/foobar"))); assert!(!Facet::from("/").is_prefix_of(&Facet::from("/"))); } #[test] fn deserialize_from_borrowed_string() { let facet = serde_json::from_str::(r#""/foo/bar""#).unwrap(); assert_eq!(facet, Facet::from_path(["foo", "bar"])); } #[test] fn deserialize_from_owned_string() { let facet = serde_json::from_str::(r#""/foo/\u263A""#).unwrap(); assert_eq!(facet, Facet::from_path(["foo", "☺"])); } #[test] fn deserialize_from_invalid_string() { let error = serde_json::from_str::(r#""foo/bar""#).unwrap_err(); assert_eq!( error.to_string(), "Failed to parse the facet string: 'foo/bar'" ); } }