From 5f37b6635299533fa2d56f2b66384fec5204679b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20DOUIN?= Date: Mon, 20 Feb 2023 14:09:23 +0100 Subject: [PATCH] Improve mailbox parsing using chumsky (#839) --- Cargo.toml | 5 + README.md | 2 +- benches/mailbox_parsing.rs | 27 +++ src/address/types.rs | 4 + src/message/header/mailbox.rs | 35 +++- src/message/mailbox/mod.rs | 1 + src/message/mailbox/parsers/mod.rs | 5 + src/message/mailbox/parsers/rfc2234.rs | 32 ++++ src/message/mailbox/parsers/rfc2822.rs | 248 +++++++++++++++++++++++++ src/message/mailbox/parsers/rfc5336.rs | 17 ++ src/message/mailbox/serde.rs | 4 +- src/message/mailbox/types.rs | 72 ++----- src/transport/mod.rs | 2 +- src/transport/smtp/extension.rs | 2 +- 14 files changed, 392 insertions(+), 64 deletions(-) create mode 100644 benches/mailbox_parsing.rs create mode 100644 src/message/mailbox/parsers/mod.rs create mode 100644 src/message/mailbox/parsers/rfc2234.rs create mode 100644 src/message/mailbox/parsers/rfc2822.rs create mode 100644 src/message/mailbox/parsers/rfc5336.rs diff --git a/Cargo.toml b/Cargo.toml index 024a87a..c9f897b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ is-it-maintained-open-issues = { repository = "lettre/lettre" } maintenance = { status = "actively-developed" } [dependencies] +chumsky = "0.8.0" idna = "0.3" once_cell = { version = "1", optional = true } tracing = { version = "0.1.16", default-features = false, features = ["std"], optional = true } # feature @@ -89,6 +90,10 @@ maud = "0.24" harness = false name = "transport_smtp" +[[bench]] +harness = false +name = "mailbox_parsing" + [features] default = ["smtp-transport", "pool", "native-tls", "hostname", "builder"] builder = ["dep:httpdate", "dep:mime", "dep:fastrand", "dep:quoted_printable", "dep:email-encoding"] diff --git a/README.md b/README.md index 4065a19..d5806b5 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ let mailer = SmtpTransport::relay("smtp.gmail.com") // Send the email match mailer.send(&email) { Ok(_) => println!("Email sent successfully!"), - Err(e) => panic!("Could not send email: {:?}", e), + Err(e) => panic!("Could not send email: {e:?}"), } ``` diff --git a/benches/mailbox_parsing.rs b/benches/mailbox_parsing.rs new file mode 100644 index 0000000..dd18e18 --- /dev/null +++ b/benches/mailbox_parsing.rs @@ -0,0 +1,27 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use lettre::message::{Mailbox, Mailboxes}; + +fn bench_parse_single(mailbox: &str) { + assert!(mailbox.parse::().is_ok()); +} + +fn bench_parse_multiple(mailboxes: &str) { + assert!(mailboxes.parse::().is_ok()); +} + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("parse single mailbox", |b| { + b.iter(|| bench_parse_single(black_box("\"Benchmark test\" "))) + }); + + c.bench_function("parse multiple mailboxes", |b| { + b.iter(|| { + bench_parse_multiple(black_box( + "\"Benchmark test\" , Test , , test@mail.local", + )) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/src/address/types.rs b/src/address/types.rs index d5bb5f7..c493f92 100644 --- a/src/address/types.rs +++ b/src/address/types.rs @@ -227,6 +227,7 @@ fn check_address(val: &str) -> Result { } #[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[non_exhaustive] /// Errors in email addresses parsing pub enum AddressError { /// Missing domain or user @@ -237,6 +238,8 @@ pub enum AddressError { InvalidUser, /// Invalid email domain InvalidDomain, + /// Invalid input found + InvalidInput, } impl Error for AddressError {} @@ -248,6 +251,7 @@ impl Display for AddressError { AddressError::Unbalanced => f.write_str("Unbalanced angle bracket"), AddressError::InvalidUser => f.write_str("Invalid email user"), AddressError::InvalidDomain => f.write_str("Invalid email domain"), + AddressError::InvalidInput => f.write_str("Invalid input"), } } } diff --git a/src/message/header/mailbox.rs b/src/message/header/mailbox.rs index 7e2ae77..6a24c6b 100644 --- a/src/message/header/mailbox.rs +++ b/src/message/header/mailbox.rs @@ -306,14 +306,30 @@ mod test { #[test] fn parse_multi_with_name_containing_comma() { let from: Vec = vec![ - "Test, test <1@example.com>".parse().unwrap(), - "Test2, test2 <2@example.com>".parse().unwrap(), + "\"Test, test\" <1@example.com>".parse().unwrap(), + "\"Test2, test2\" <2@example.com>".parse().unwrap(), ]; let mut headers = Headers::new(); headers.insert_raw(HeaderValue::new( HeaderName::new_from_ascii_str("From"), - "Test, test <1@example.com>, Test2, test2 <2@example.com>".to_owned(), + "\"Test, test\" <1@example.com>, \"Test2, test2\" <2@example.com>".to_owned(), + )); + + assert_eq!(headers.get::(), Some(From(from.into()))); + } + + #[test] + fn parse_multi_with_name_containing_double_quotes() { + let from: Vec = vec![ + "\"Test, test\" <1@example.com>".parse().unwrap(), + "\"Test2, \"test2\"\" <2@example.com>".parse().unwrap(), + ]; + + let mut headers = Headers::new(); + headers.insert_raw(HeaderValue::new( + HeaderName::new_from_ascii_str("From"), + "\"Test, test\" <1@example.com>, \"Test2, \"test2\"\" <2@example.com>".to_owned(), )); assert_eq!(headers.get::(), Some(From(from.into()))); @@ -324,9 +340,20 @@ mod test { let mut headers = Headers::new(); headers.insert_raw(HeaderValue::new( HeaderName::new_from_ascii_str("From"), - "Test, test <1@example.com>, Test2, test2".to_owned(), + "\"Test, test\" <1@example.com>, \"Test2, test2\"".to_owned(), )); assert_eq!(headers.get::(), None); } + + #[test] + fn mailbox_format_address_with_angle_bracket() { + assert_eq!( + format!( + "{}", + Mailbox::new(Some("<3".into()), "i@love.example".parse().unwrap()) + ), + r#""<3" "# + ); + } } diff --git a/src/message/mailbox/mod.rs b/src/message/mailbox/mod.rs index 7fee942..9ccbcdf 100644 --- a/src/message/mailbox/mod.rs +++ b/src/message/mailbox/mod.rs @@ -1,3 +1,4 @@ +mod parsers; #[cfg(feature = "serde")] mod serde; mod types; diff --git a/src/message/mailbox/parsers/mod.rs b/src/message/mailbox/parsers/mod.rs new file mode 100644 index 0000000..c6a3591 --- /dev/null +++ b/src/message/mailbox/parsers/mod.rs @@ -0,0 +1,5 @@ +mod rfc2234; +mod rfc2822; +mod rfc5336; + +pub(crate) use rfc2822::{mailbox, mailbox_list}; diff --git a/src/message/mailbox/parsers/rfc2234.rs b/src/message/mailbox/parsers/rfc2234.rs new file mode 100644 index 0000000..0ac88ce --- /dev/null +++ b/src/message/mailbox/parsers/rfc2234.rs @@ -0,0 +1,32 @@ +//! Partial parsers implementation of [RFC2234]: Augmented BNF for +//! Syntax Specifications: ABNF. +//! +//! [RFC2234]: https://datatracker.ietf.org/doc/html/rfc2234 + +use chumsky::{error::Cheap, prelude::*}; + +// 6.1 Core Rules +// https://datatracker.ietf.org/doc/html/rfc2234#section-6.1 + +// ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +pub(super) fn alpha() -> impl Parser> { + filter(|c: &char| c.is_ascii_alphabetic()) +} + +// DIGIT = %x30-39 +// ; 0-9 +pub(super) fn digit() -> impl Parser> { + filter(|c: &char| c.is_ascii_digit()) +} + +// DQUOTE = %x22 +// ; " (Double Quote) +pub(super) fn dquote() -> impl Parser> { + just('"') +} + +// WSP = SP / HTAB +// ; white space +pub(super) fn wsp() -> impl Parser> { + choice((just(' '), just('\t'))) +} diff --git a/src/message/mailbox/parsers/rfc2822.rs b/src/message/mailbox/parsers/rfc2822.rs new file mode 100644 index 0000000..1f756d7 --- /dev/null +++ b/src/message/mailbox/parsers/rfc2822.rs @@ -0,0 +1,248 @@ +//! Partial parsers implementation of [RFC2822]: Internet Message +//! Format. +//! +//! [RFC2822]: https://datatracker.ietf.org/doc/html/rfc2822 + +use chumsky::{error::Cheap, prelude::*}; + +use super::{rfc2234, rfc5336}; + +// 3.2.1. Primitive Tokens +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.1 + +// NO-WS-CTL = %d1-8 / ; US-ASCII control characters +// %d11 / ; that do not include the +// %d12 / ; carriage return, line feed, +// %d14-31 / ; and white space characters +// %d127 +fn no_ws_ctl() -> impl Parser> { + filter(|c| matches!(u32::from(*c), 1..=8 | 11 | 12 | 14..=31 | 127)) +} + +// text = %d1-9 / ; Characters excluding CR and LF +// %d11 / +// %d12 / +// %d14-127 / +// obs-text +fn text() -> impl Parser> { + filter(|c| matches!(u32::from(*c), 1..=9 | 11 | 12 | 14..=127)) +} + +// 3.2.2. Quoted characters +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.2 + +// quoted-pair = ("\" text) / obs-qp +fn quoted_pair() -> impl Parser> { + just('\\').ignore_then(text()) +} + +// 3.2.3. Folding white space and comments +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.3 + +// FWS = ([*WSP CRLF] 1*WSP) / ; Folding white space +// obs-FWS +pub fn fws() -> impl Parser, Error = Cheap> { + rfc2234::wsp() + .or_not() + .then_ignore(rfc2234::wsp().ignored().repeated()) +} + +// CFWS = *([FWS] comment) (([FWS] comment) / FWS) +pub fn cfws() -> impl Parser, Error = Cheap> { + // TODO: comment are not currently supported, so for now a cfws is + // the same as a fws. + fws() +} + +// 3.2.4. Atom +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.4 + +// atext = ALPHA / DIGIT / ; Any character except controls, +// "!" / "#" / ; SP, and specials. +// "$" / "%" / ; Used for atoms +// "&" / "'" / +// "*" / "+" / +// "-" / "/" / +// "=" / "?" / +// "^" / "_" / +// "`" / "{" / +// "|" / "}" / +// "~" +pub(super) fn atext() -> impl Parser> { + choice(( + rfc2234::alpha(), + rfc2234::digit(), + filter(|c| { + matches!( + *c, + '!' | '#' + | '$' + | '%' + | '&' + | '\'' + | '*' + | '+' + | '-' + | '/' + | '=' + | '?' + | '^' + | '_' + | '`' + | '{' + | '|' + | '}' + | '~' + ) + }), + // also allow non ASCII UTF8 chars + rfc5336::utf8_non_ascii(), + )) +} + +// atom = [CFWS] 1*atext [CFWS] +pub(super) fn atom() -> impl Parser, Error = Cheap> { + cfws().chain(atext().repeated().at_least(1)) +} + +// dot-atom = [CFWS] dot-atom-text [CFWS] +pub fn dot_atom() -> impl Parser, Error = Cheap> { + cfws().chain(dot_atom_text()) +} + +// dot-atom-text = 1*atext *("." 1*atext) +pub fn dot_atom_text() -> impl Parser, Error = Cheap> { + atext().repeated().at_least(1).chain( + just('.') + .chain(atext().repeated().at_least(1)) + .repeated() + .at_least(1) + .flatten(), + ) +} + +// 3.2.5. Quoted strings +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.5 + +// qtext = NO-WS-CTL / ; Non white space controls +// +// %d33 / ; The rest of the US-ASCII +// %d35-91 / ; characters not including "\" +// %d93-126 ; or the quote character +fn qtext() -> impl Parser> { + choice(( + filter(|c| matches!(u32::from(*c), 33 | 35..=91 | 93..=126)), + no_ws_ctl(), + )) +} + +// qcontent = qtext / quoted-pair +pub(super) fn qcontent() -> impl Parser> { + choice((qtext(), quoted_pair(), rfc5336::utf8_non_ascii())) +} + +// quoted-string = [CFWS] +// DQUOTE *([FWS] qcontent) [FWS] DQUOTE +// [CFWS] +fn quoted_string() -> impl Parser, Error = Cheap> { + rfc2234::dquote() + .ignore_then(fws().chain(qcontent()).repeated().flatten()) + .then_ignore(text::whitespace()) + .then_ignore(rfc2234::dquote()) +} + +// 3.2.6. Miscellaneous tokens +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.6 + +// word = atom / quoted-string +fn word() -> impl Parser, Error = Cheap> { + choice((quoted_string(), atom())) +} + +// phrase = 1*word / obs-phrase +fn phrase() -> impl Parser, Error = Cheap> { + choice((obs_phrase(), word().repeated().at_least(1).flatten())) +} + +// 3.4. Address Specification +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.4 + +// mailbox = name-addr / addr-spec +pub(crate) fn mailbox() -> impl Parser, (String, String)), Error = Cheap> +{ + choice((name_addr(), addr_spec().map(|addr| (None, addr)))).then_ignore(end()) +} + +// name-addr = [display-name] angle-addr +fn name_addr() -> impl Parser, (String, String)), Error = Cheap> { + display_name().collect().or_not().then(angle_addr()) +} + +// angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr +fn angle_addr() -> impl Parser> { + addr_spec() + .delimited_by(just('<').ignored(), just('>').ignored()) + .padded() +} + +// display-name = phrase +fn display_name() -> impl Parser, Error = Cheap> { + phrase() +} + +// mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list +pub(crate) fn mailbox_list( +) -> impl Parser, (String, String))>, Error = Cheap> { + choice((name_addr(), addr_spec().map(|addr| (None, addr)))) + .separated_by(just(',').padded()) + .then_ignore(end()) +} + +// 3.4.1. Addr-spec specification +// https://datatracker.ietf.org/doc/html/rfc2822#section-3.4.1 + +// addr-spec = local-part "@" domain +pub fn addr_spec() -> impl Parser> { + local_part() + .collect() + .then_ignore(just('@')) + .then(domain().collect()) +} + +// local-part = dot-atom / quoted-string / obs-local-part +pub fn local_part() -> impl Parser, Error = Cheap> { + choice((dot_atom(), quoted_string(), obs_local_part())) +} + +// domain = dot-atom / domain-literal / obs-domain +pub fn domain() -> impl Parser, Error = Cheap> { + // NOTE: omitting domain-literal since it may never be used + choice((dot_atom(), obs_domain())) +} + +// 4.1. Miscellaneous obsolete tokens +// https://datatracker.ietf.org/doc/html/rfc2822#section-4.1 + +// obs-phrase = word *(word / "." / CFWS) +fn obs_phrase() -> impl Parser, Error = Cheap> { + // NOTE: the CFWS is already captured by the word, no need to add + // it there. + word().chain( + choice((word(), just('.').repeated().exactly(1))) + .repeated() + .flatten(), + ) +} + +// 4.4. Obsolete Addressing +// https://datatracker.ietf.org/doc/html/rfc2822#section-4.4 + +// obs-local-part = word *("." word) +pub fn obs_local_part() -> impl Parser, Error = Cheap> { + word().chain(just('.').chain(word()).repeated().flatten()) +} + +// obs-domain = atom *("." atom) +pub fn obs_domain() -> impl Parser, Error = Cheap> { + atom().chain(just('.').chain(atom()).repeated().flatten()) +} diff --git a/src/message/mailbox/parsers/rfc5336.rs b/src/message/mailbox/parsers/rfc5336.rs new file mode 100644 index 0000000..4f1df96 --- /dev/null +++ b/src/message/mailbox/parsers/rfc5336.rs @@ -0,0 +1,17 @@ +//! Partial parsers implementation of [RFC5336]: SMTP Extension for +//! Internationalized Email Addresses. +//! +//! [RFC5336]: https://datatracker.ietf.org/doc/html/rfc5336 + +use chumsky::{error::Cheap, prelude::*}; + +// 3.3. Extended Mailbox Address Syntax +// https://datatracker.ietf.org/doc/html/rfc5336#section-3.3 + +// UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4 +// UTF8-2 = +// UTF8-3 = +// UTF8-4 = +pub(super) fn utf8_non_ascii() -> impl Parser> { + filter(|c: &char| c.len_utf8() > 1) +} diff --git a/src/message/mailbox/serde.rs b/src/message/mailbox/serde.rs index 390c6fa..495929e 100644 --- a/src/message/mailbox/serde.rs +++ b/src/message/mailbox/serde.rs @@ -198,7 +198,7 @@ mod test { from_str(r#""yin@dtb.com, Hei , Kai ""#).unwrap(); assert_eq!( m, - ", Hei , Kai " + "yin@dtb.com, Hei , Kai " .parse() .unwrap() ); @@ -211,7 +211,7 @@ mod test { .unwrap(); assert_eq!( m, - ", Hei , Kai " + "yin@dtb.com, Hei , Kai " .parse() .unwrap() ); diff --git a/src/message/mailbox/types.rs b/src/message/mailbox/types.rs index 5eeb18c..3043a5e 100644 --- a/src/message/mailbox/types.rs +++ b/src/message/mailbox/types.rs @@ -5,8 +5,10 @@ use std::{ str::FromStr, }; +use chumsky::prelude::*; use email_encoding::headers::EmailWriter; +use super::parsers; use crate::address::{Address, AddressError}; /// Represents an email address with an optional name for the sender/recipient. @@ -108,40 +110,18 @@ impl, T: Into> TryFrom<(S, T)> for Mailbox { } } -/* -impl, T: AsRef<&str>> TryFrom<(S, T)> for Mailbox { - type Error = AddressError; - - fn try_from(header: (S, T)) -> Result { - let (name, address) = header; - Ok(Mailbox::new(Some(name.as_ref()), address.as_ref().parse()?)) - } -}*/ - impl FromStr for Mailbox { type Err = AddressError; fn from_str(src: &str) -> Result { - match (src.find('<'), src.find('>')) { - (Some(addr_open), Some(addr_close)) if addr_open < addr_close => { - let name = src.split_at(addr_open).0; - let addr_open = addr_open + 1; - let addr = src.split_at(addr_open).1.split_at(addr_close - addr_open).0; - let addr = addr.parse()?; - let name = name.trim(); - let name = if name.is_empty() { - None - } else { - Some(name.into()) - }; - Ok(Mailbox::new(name, addr)) - } - (Some(_), _) => Err(AddressError::Unbalanced), - _ => { - let addr = src.parse()?; - Ok(Mailbox::new(None, addr)) - } - } + let (name, (user, domain)) = parsers::mailbox().parse(src).map_err(|_errs| { + // TODO: improve error management + AddressError::InvalidInput + })?; + + let mailbox = Mailbox::new(name, Address::new(user, domain)?); + + Ok(mailbox) } } @@ -356,34 +336,16 @@ impl Display for Mailboxes { impl FromStr for Mailboxes { type Err = AddressError; - fn from_str(mut src: &str) -> Result { + fn from_str(src: &str) -> Result { let mut mailboxes = Vec::new(); - if !src.is_empty() { - // n-1 elements - let mut skip = 0; - while let Some(i) = src[skip..].find(',') { - let left = &src[..skip + i]; + let parsed_mailboxes = parsers::mailbox_list().parse(src).map_err(|_errs| { + // TODO: improve error management + AddressError::InvalidInput + })?; - match left.trim().parse() { - Ok(mailbox) => { - mailboxes.push(mailbox); - - src = &src[left.len() + ",".len()..]; - skip = 0; - } - Err(AddressError::MissingParts) => { - skip = left.len() + ",".len(); - } - Err(err) => { - return Err(err); - } - } - } - - // last element - let mailbox = src.trim().parse()?; - mailboxes.push(mailbox); + for (name, (user, domain)) in parsed_mailboxes { + mailboxes.push(Mailbox::new(name, Address::new(user, domain)?)) } Ok(Mailboxes(mailboxes)) diff --git a/src/transport/mod.rs b/src/transport/mod.rs index 369101e..bcbda31 100644 --- a/src/transport/mod.rs +++ b/src/transport/mod.rs @@ -75,7 +75,7 @@ //! // Send the email //! match mailer.send(&email) { //! Ok(_) => println!("Email sent successfully!"), -//! Err(e) => panic!("Could not send email: {:?}", e), +//! Err(e) => panic!("Could not send email: {e:?}"), //! } //! # Ok(()) //! # } diff --git a/src/transport/smtp/extension.rs b/src/transport/smtp/extension.rs index d3c405f..66e2af3 100644 --- a/src/transport/smtp/extension.rs +++ b/src/transport/smtp/extension.rs @@ -281,7 +281,7 @@ impl Display for RcptParameter { RcptParameter::Other { ref keyword, value: Some(ref value), - } => write!(f, "{}={}", keyword, XText(value)), + } => write!(f, "{keyword}={}", XText(value)), RcptParameter::Other { ref keyword, value: None,