Improve mailbox parsing using chumsky (#839)

This commit is contained in:
Clément DOUIN
2023-02-20 14:09:23 +01:00
committed by GitHub
parent 69e5974024
commit 5f37b66352
14 changed files with 392 additions and 64 deletions

View File

@@ -19,6 +19,7 @@ is-it-maintained-open-issues = { repository = "lettre/lettre" }
maintenance = { status = "actively-developed" }
[dependencies]
chumsky = "0.8.0"
idna = "0.3"
once_cell = { version = "1", optional = true }
tracing = { version = "0.1.16", default-features = false, features = ["std"], optional = true } # feature
@@ -89,6 +90,10 @@ maud = "0.24"
harness = false
name = "transport_smtp"
[[bench]]
harness = false
name = "mailbox_parsing"
[features]
default = ["smtp-transport", "pool", "native-tls", "hostname", "builder"]
builder = ["dep:httpdate", "dep:mime", "dep:fastrand", "dep:quoted_printable", "dep:email-encoding"]

View File

@@ -91,7 +91,7 @@ let mailer = SmtpTransport::relay("smtp.gmail.com")
// Send the email
match mailer.send(&email) {
Ok(_) => println!("Email sent successfully!"),
Err(e) => panic!("Could not send email: {:?}", e),
Err(e) => panic!("Could not send email: {e:?}"),
}
```

View File

@@ -0,0 +1,27 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use lettre::message::{Mailbox, Mailboxes};
fn bench_parse_single(mailbox: &str) {
assert!(mailbox.parse::<Mailbox>().is_ok());
}
fn bench_parse_multiple(mailboxes: &str) {
assert!(mailboxes.parse::<Mailboxes>().is_ok());
}
fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("parse single mailbox", |b| {
b.iter(|| bench_parse_single(black_box("\"Benchmark test\" <test@mail.local>")))
});
c.bench_function("parse multiple mailboxes", |b| {
b.iter(|| {
bench_parse_multiple(black_box(
"\"Benchmark test\" <test@mail.local>, Test <test@mail.local>, <test@mail.local>, test@mail.local",
))
})
});
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -227,6 +227,7 @@ fn check_address(val: &str) -> Result<usize, AddressError> {
}
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
#[non_exhaustive]
/// Errors in email addresses parsing
pub enum AddressError {
/// Missing domain or user
@@ -237,6 +238,8 @@ pub enum AddressError {
InvalidUser,
/// Invalid email domain
InvalidDomain,
/// Invalid input found
InvalidInput,
}
impl Error for AddressError {}
@@ -248,6 +251,7 @@ impl Display for AddressError {
AddressError::Unbalanced => f.write_str("Unbalanced angle bracket"),
AddressError::InvalidUser => f.write_str("Invalid email user"),
AddressError::InvalidDomain => f.write_str("Invalid email domain"),
AddressError::InvalidInput => f.write_str("Invalid input"),
}
}
}

View File

@@ -306,14 +306,30 @@ mod test {
#[test]
fn parse_multi_with_name_containing_comma() {
let from: Vec<Mailbox> = vec![
"Test, test <1@example.com>".parse().unwrap(),
"Test2, test2 <2@example.com>".parse().unwrap(),
"\"Test, test\" <1@example.com>".parse().unwrap(),
"\"Test2, test2\" <2@example.com>".parse().unwrap(),
];
let mut headers = Headers::new();
headers.insert_raw(HeaderValue::new(
HeaderName::new_from_ascii_str("From"),
"Test, test <1@example.com>, Test2, test2 <2@example.com>".to_owned(),
"\"Test, test\" <1@example.com>, \"Test2, test2\" <2@example.com>".to_owned(),
));
assert_eq!(headers.get::<From>(), Some(From(from.into())));
}
#[test]
fn parse_multi_with_name_containing_double_quotes() {
let from: Vec<Mailbox> = vec![
"\"Test, test\" <1@example.com>".parse().unwrap(),
"\"Test2, \"test2\"\" <2@example.com>".parse().unwrap(),
];
let mut headers = Headers::new();
headers.insert_raw(HeaderValue::new(
HeaderName::new_from_ascii_str("From"),
"\"Test, test\" <1@example.com>, \"Test2, \"test2\"\" <2@example.com>".to_owned(),
));
assert_eq!(headers.get::<From>(), Some(From(from.into())));
@@ -324,9 +340,20 @@ mod test {
let mut headers = Headers::new();
headers.insert_raw(HeaderValue::new(
HeaderName::new_from_ascii_str("From"),
"Test, test <1@example.com>, Test2, test2".to_owned(),
"\"Test, test\" <1@example.com>, \"Test2, test2\"".to_owned(),
));
assert_eq!(headers.get::<From>(), None);
}
#[test]
fn mailbox_format_address_with_angle_bracket() {
assert_eq!(
format!(
"{}",
Mailbox::new(Some("<3".into()), "i@love.example".parse().unwrap())
),
r#""<3" <i@love.example>"#
);
}
}

View File

@@ -1,3 +1,4 @@
mod parsers;
#[cfg(feature = "serde")]
mod serde;
mod types;

View File

@@ -0,0 +1,5 @@
mod rfc2234;
mod rfc2822;
mod rfc5336;
pub(crate) use rfc2822::{mailbox, mailbox_list};

View File

@@ -0,0 +1,32 @@
//! Partial parsers implementation of [RFC2234]: Augmented BNF for
//! Syntax Specifications: ABNF.
//!
//! [RFC2234]: https://datatracker.ietf.org/doc/html/rfc2234
use chumsky::{error::Cheap, prelude::*};
// 6.1 Core Rules
// https://datatracker.ietf.org/doc/html/rfc2234#section-6.1
// ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
pub(super) fn alpha() -> impl Parser<char, char, Error = Cheap<char>> {
filter(|c: &char| c.is_ascii_alphabetic())
}
// DIGIT = %x30-39
// ; 0-9
pub(super) fn digit() -> impl Parser<char, char, Error = Cheap<char>> {
filter(|c: &char| c.is_ascii_digit())
}
// DQUOTE = %x22
// ; " (Double Quote)
pub(super) fn dquote() -> impl Parser<char, char, Error = Cheap<char>> {
just('"')
}
// WSP = SP / HTAB
// ; white space
pub(super) fn wsp() -> impl Parser<char, char, Error = Cheap<char>> {
choice((just(' '), just('\t')))
}

View File

@@ -0,0 +1,248 @@
//! Partial parsers implementation of [RFC2822]: Internet Message
//! Format.
//!
//! [RFC2822]: https://datatracker.ietf.org/doc/html/rfc2822
use chumsky::{error::Cheap, prelude::*};
use super::{rfc2234, rfc5336};
// 3.2.1. Primitive Tokens
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.1
// NO-WS-CTL = %d1-8 / ; US-ASCII control characters
// %d11 / ; that do not include the
// %d12 / ; carriage return, line feed,
// %d14-31 / ; and white space characters
// %d127
fn no_ws_ctl() -> impl Parser<char, char, Error = Cheap<char>> {
filter(|c| matches!(u32::from(*c), 1..=8 | 11 | 12 | 14..=31 | 127))
}
// text = %d1-9 / ; Characters excluding CR and LF
// %d11 /
// %d12 /
// %d14-127 /
// obs-text
fn text() -> impl Parser<char, char, Error = Cheap<char>> {
filter(|c| matches!(u32::from(*c), 1..=9 | 11 | 12 | 14..=127))
}
// 3.2.2. Quoted characters
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.2
// quoted-pair = ("\" text) / obs-qp
fn quoted_pair() -> impl Parser<char, char, Error = Cheap<char>> {
just('\\').ignore_then(text())
}
// 3.2.3. Folding white space and comments
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.3
// FWS = ([*WSP CRLF] 1*WSP) / ; Folding white space
// obs-FWS
pub fn fws() -> impl Parser<char, Option<char>, Error = Cheap<char>> {
rfc2234::wsp()
.or_not()
.then_ignore(rfc2234::wsp().ignored().repeated())
}
// CFWS = *([FWS] comment) (([FWS] comment) / FWS)
pub fn cfws() -> impl Parser<char, Option<char>, Error = Cheap<char>> {
// TODO: comment are not currently supported, so for now a cfws is
// the same as a fws.
fws()
}
// 3.2.4. Atom
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.4
// atext = ALPHA / DIGIT / ; Any character except controls,
// "!" / "#" / ; SP, and specials.
// "$" / "%" / ; Used for atoms
// "&" / "'" /
// "*" / "+" /
// "-" / "/" /
// "=" / "?" /
// "^" / "_" /
// "`" / "{" /
// "|" / "}" /
// "~"
pub(super) fn atext() -> impl Parser<char, char, Error = Cheap<char>> {
choice((
rfc2234::alpha(),
rfc2234::digit(),
filter(|c| {
matches!(
*c,
'!' | '#'
| '$'
| '%'
| '&'
| '\''
| '*'
| '+'
| '-'
| '/'
| '='
| '?'
| '^'
| '_'
| '`'
| '{'
| '|'
| '}'
| '~'
)
}),
// also allow non ASCII UTF8 chars
rfc5336::utf8_non_ascii(),
))
}
// atom = [CFWS] 1*atext [CFWS]
pub(super) fn atom() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
cfws().chain(atext().repeated().at_least(1))
}
// dot-atom = [CFWS] dot-atom-text [CFWS]
pub fn dot_atom() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
cfws().chain(dot_atom_text())
}
// dot-atom-text = 1*atext *("." 1*atext)
pub fn dot_atom_text() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
atext().repeated().at_least(1).chain(
just('.')
.chain(atext().repeated().at_least(1))
.repeated()
.at_least(1)
.flatten(),
)
}
// 3.2.5. Quoted strings
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.5
// qtext = NO-WS-CTL / ; Non white space controls
//
// %d33 / ; The rest of the US-ASCII
// %d35-91 / ; characters not including "\"
// %d93-126 ; or the quote character
fn qtext() -> impl Parser<char, char, Error = Cheap<char>> {
choice((
filter(|c| matches!(u32::from(*c), 33 | 35..=91 | 93..=126)),
no_ws_ctl(),
))
}
// qcontent = qtext / quoted-pair
pub(super) fn qcontent() -> impl Parser<char, char, Error = Cheap<char>> {
choice((qtext(), quoted_pair(), rfc5336::utf8_non_ascii()))
}
// quoted-string = [CFWS]
// DQUOTE *([FWS] qcontent) [FWS] DQUOTE
// [CFWS]
fn quoted_string() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
rfc2234::dquote()
.ignore_then(fws().chain(qcontent()).repeated().flatten())
.then_ignore(text::whitespace())
.then_ignore(rfc2234::dquote())
}
// 3.2.6. Miscellaneous tokens
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.2.6
// word = atom / quoted-string
fn word() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
choice((quoted_string(), atom()))
}
// phrase = 1*word / obs-phrase
fn phrase() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
choice((obs_phrase(), word().repeated().at_least(1).flatten()))
}
// 3.4. Address Specification
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.4
// mailbox = name-addr / addr-spec
pub(crate) fn mailbox() -> impl Parser<char, (Option<String>, (String, String)), Error = Cheap<char>>
{
choice((name_addr(), addr_spec().map(|addr| (None, addr)))).then_ignore(end())
}
// name-addr = [display-name] angle-addr
fn name_addr() -> impl Parser<char, (Option<String>, (String, String)), Error = Cheap<char>> {
display_name().collect().or_not().then(angle_addr())
}
// angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
fn angle_addr() -> impl Parser<char, (String, String), Error = Cheap<char>> {
addr_spec()
.delimited_by(just('<').ignored(), just('>').ignored())
.padded()
}
// display-name = phrase
fn display_name() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
phrase()
}
// mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
pub(crate) fn mailbox_list(
) -> impl Parser<char, Vec<(Option<String>, (String, String))>, Error = Cheap<char>> {
choice((name_addr(), addr_spec().map(|addr| (None, addr))))
.separated_by(just(',').padded())
.then_ignore(end())
}
// 3.4.1. Addr-spec specification
// https://datatracker.ietf.org/doc/html/rfc2822#section-3.4.1
// addr-spec = local-part "@" domain
pub fn addr_spec() -> impl Parser<char, (String, String), Error = Cheap<char>> {
local_part()
.collect()
.then_ignore(just('@'))
.then(domain().collect())
}
// local-part = dot-atom / quoted-string / obs-local-part
pub fn local_part() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
choice((dot_atom(), quoted_string(), obs_local_part()))
}
// domain = dot-atom / domain-literal / obs-domain
pub fn domain() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
// NOTE: omitting domain-literal since it may never be used
choice((dot_atom(), obs_domain()))
}
// 4.1. Miscellaneous obsolete tokens
// https://datatracker.ietf.org/doc/html/rfc2822#section-4.1
// obs-phrase = word *(word / "." / CFWS)
fn obs_phrase() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
// NOTE: the CFWS is already captured by the word, no need to add
// it there.
word().chain(
choice((word(), just('.').repeated().exactly(1)))
.repeated()
.flatten(),
)
}
// 4.4. Obsolete Addressing
// https://datatracker.ietf.org/doc/html/rfc2822#section-4.4
// obs-local-part = word *("." word)
pub fn obs_local_part() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
word().chain(just('.').chain(word()).repeated().flatten())
}
// obs-domain = atom *("." atom)
pub fn obs_domain() -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
atom().chain(just('.').chain(atom()).repeated().flatten())
}

View File

@@ -0,0 +1,17 @@
//! Partial parsers implementation of [RFC5336]: SMTP Extension for
//! Internationalized Email Addresses.
//!
//! [RFC5336]: https://datatracker.ietf.org/doc/html/rfc5336
use chumsky::{error::Cheap, prelude::*};
// 3.3. Extended Mailbox Address Syntax
// https://datatracker.ietf.org/doc/html/rfc5336#section-3.3
// UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
// UTF8-2 = <See Section 4 of RFC 3629>
// UTF8-3 = <See Section 4 of RFC 3629>
// UTF8-4 = <See Section 4 of RFC 3629>
pub(super) fn utf8_non_ascii() -> impl Parser<char, char, Error = Cheap<char>> {
filter(|c: &char| c.len_utf8() > 1)
}

View File

@@ -198,7 +198,7 @@ mod test {
from_str(r#""yin@dtb.com, Hei <hei@dtb.com>, Kai <kayo@example.com>""#).unwrap();
assert_eq!(
m,
"<yin@dtb.com>, Hei <hei@dtb.com>, Kai <kayo@example.com>"
"yin@dtb.com, Hei <hei@dtb.com>, Kai <kayo@example.com>"
.parse()
.unwrap()
);
@@ -211,7 +211,7 @@ mod test {
.unwrap();
assert_eq!(
m,
"<yin@dtb.com>, Hei <hei@dtb.com>, Kai <kayo@example.com>"
"yin@dtb.com, Hei <hei@dtb.com>, Kai <kayo@example.com>"
.parse()
.unwrap()
);

View File

@@ -5,8 +5,10 @@ use std::{
str::FromStr,
};
use chumsky::prelude::*;
use email_encoding::headers::EmailWriter;
use super::parsers;
use crate::address::{Address, AddressError};
/// Represents an email address with an optional name for the sender/recipient.
@@ -108,40 +110,18 @@ impl<S: Into<String>, T: Into<String>> TryFrom<(S, T)> for Mailbox {
}
}
/*
impl<S: AsRef<&str>, T: AsRef<&str>> TryFrom<(S, T)> for Mailbox {
type Error = AddressError;
fn try_from(header: (S, T)) -> Result<Self, Self::Error> {
let (name, address) = header;
Ok(Mailbox::new(Some(name.as_ref()), address.as_ref().parse()?))
}
}*/
impl FromStr for Mailbox {
type Err = AddressError;
fn from_str(src: &str) -> Result<Mailbox, Self::Err> {
match (src.find('<'), src.find('>')) {
(Some(addr_open), Some(addr_close)) if addr_open < addr_close => {
let name = src.split_at(addr_open).0;
let addr_open = addr_open + 1;
let addr = src.split_at(addr_open).1.split_at(addr_close - addr_open).0;
let addr = addr.parse()?;
let name = name.trim();
let name = if name.is_empty() {
None
} else {
Some(name.into())
};
Ok(Mailbox::new(name, addr))
}
(Some(_), _) => Err(AddressError::Unbalanced),
_ => {
let addr = src.parse()?;
Ok(Mailbox::new(None, addr))
}
}
let (name, (user, domain)) = parsers::mailbox().parse(src).map_err(|_errs| {
// TODO: improve error management
AddressError::InvalidInput
})?;
let mailbox = Mailbox::new(name, Address::new(user, domain)?);
Ok(mailbox)
}
}
@@ -356,34 +336,16 @@ impl Display for Mailboxes {
impl FromStr for Mailboxes {
type Err = AddressError;
fn from_str(mut src: &str) -> Result<Self, Self::Err> {
fn from_str(src: &str) -> Result<Self, Self::Err> {
let mut mailboxes = Vec::new();
if !src.is_empty() {
// n-1 elements
let mut skip = 0;
while let Some(i) = src[skip..].find(',') {
let left = &src[..skip + i];
let parsed_mailboxes = parsers::mailbox_list().parse(src).map_err(|_errs| {
// TODO: improve error management
AddressError::InvalidInput
})?;
match left.trim().parse() {
Ok(mailbox) => {
mailboxes.push(mailbox);
src = &src[left.len() + ",".len()..];
skip = 0;
}
Err(AddressError::MissingParts) => {
skip = left.len() + ",".len();
}
Err(err) => {
return Err(err);
}
}
}
// last element
let mailbox = src.trim().parse()?;
mailboxes.push(mailbox);
for (name, (user, domain)) in parsed_mailboxes {
mailboxes.push(Mailbox::new(name, Address::new(user, domain)?))
}
Ok(Mailboxes(mailboxes))

View File

@@ -75,7 +75,7 @@
//! // Send the email
//! match mailer.send(&email) {
//! Ok(_) => println!("Email sent successfully!"),
//! Err(e) => panic!("Could not send email: {:?}", e),
//! Err(e) => panic!("Could not send email: {e:?}"),
//! }
//! # Ok(())
//! # }

View File

@@ -281,7 +281,7 @@ impl Display for RcptParameter {
RcptParameter::Other {
ref keyword,
value: Some(ref value),
} => write!(f, "{}={}", keyword, XText(value)),
} => write!(f, "{keyword}={}", XText(value)),
RcptParameter::Other {
ref keyword,
value: None,