Skip to content

Commit b6ad06e

Browse files
authored
Merge pull request #43 from robinst/check-domains
More strict parsing of hostname (authority) part of URLs
2 parents 9a6ce39 + 97152fa commit b6ad06e

File tree

10 files changed

+662
-236
lines changed

10 files changed

+662
-236
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
66
This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html),
77
with the exception that 0.x versions can break between minor versions.
88

9+
## [Unreleased]
10+
### Changed
11+
- More strict parsing of hostname (authority) part of URLs. Applies to
12+
emails, plain domain URLs (e.g. `example.com/foo`) and URLs with
13+
schemes where a host is expected (e.g. `https`).
14+
15+
This fixes a few problems that have been reported over time, namely:
16+
17+
- `https://www.example..com` is no longer parsed as a URL (#41)
18+
- `foo@v1.1.1` is no longer parsed as an email address (#29)
19+
- `https://*.example.org` is no longer parsed as a URL (#38)
20+
21+
It's a tricky change and hopefully this solves some problems while
22+
not introducing too many new ones. If anything unexpectedly changed
23+
for you, please let us know!
24+
925
## [0.8.1] - 2022-04-14
1026
### Changed
1127
- Skip parsing very short strings for URLs as a performance optimization
@@ -76,6 +92,7 @@ Initial release of linkify, a Rust library to find links such as URLs and email
7692
addresses in plain text, handling surrounding punctuation correctly.
7793

7894

95+
[Unreleased]: https://github.com/robinst/linkify/compare/0.8.1...HEAD
7996
[0.8.1]: https://github.com/robinst/linkify/compare/0.8.0...0.8.1
8097
[0.8.0]: https://github.com/robinst/linkify/compare/0.7.0...0.8.0
8198
[0.7.0]: https://github.com/robinst/linkify/compare/0.6.0...0.7.0

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ memchr = "2.0.1"
1818

1919
[dev-dependencies]
2020
criterion = "0.3"
21+
plotters-backend = "= 0.3.2" # 0.3.4 requires later Rust
2122
doc-comment = "0.3.3"
2223

2324

src/domains.rs

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
//! Domain name related scanning, used by both email and URL scanners.
2+
//!
3+
//! This is called domains for familiarity but it's about the authority part of URLs as defined in
4+
//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
5+
//!
6+
//! ```text
7+
//! authority = [ userinfo "@" ] host [ ":" port ]
8+
//!
9+
//!
10+
//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
11+
//!
12+
//! host = IP-literal / IPv4address / reg-name
13+
//!
14+
//! IP-literal = "[" ( IPv6address / IPvFuture ) "]"
15+
//!
16+
//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
17+
//!
18+
//! reg-name = *( unreserved / pct-encoded / sub-delims )
19+
//!
20+
//!
21+
//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
22+
//!
23+
//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
24+
//!
25+
//! pct-encoded = "%" HEXDIG HEXDIG
26+
//! ```
27+
28+
use std::char;
29+
30+
/// Scans `s` from its start and finds where the authority (userinfo, host and port) ends.
///
/// # Parameters
///
/// * `userinfo_allowed` - whether a single `@` separating userinfo from the host is accepted.
///   When it is `false`, any `@` makes the whole authority invalid.
/// * `require_host` - whether the scanned text has to look like a host name (or an all-numeric
///   dotted name such as an IPv4 address). When it is `false`, no host validation is done and
///   the scan only determines where the authority stops.
/// * `port_allowed` - whether a `:` (followed by a port) may appear when userinfo is not allowed.
///
/// # Returns
///
/// A tuple `(end, last_dot)`:
///
/// * `end` is the byte offset just past the last character that may terminate the authority.
///   It is `Some(0)` when no such character was found, i.e. the authority is empty.
/// * `last_dot` is the byte index of the last `.` that is followed by a letter or digit of the
///   host. Dots inside a port are not counted.
///
/// `(None, None)` is returned when the input can't be a valid authority: a second `@`, or,
/// with `require_host`, a host that has an empty label, a label starting with `-`, a `_` or
/// `~`, consists of a single number without dots, or has an implausible top-level domain.
pub(crate) fn find_authority_end(
    s: &str,
    mut userinfo_allowed: bool,
    require_host: bool,
    port_allowed: bool,
) -> (Option<usize>, Option<usize>) {
    let mut end = Some(0);

    // `maybe_last_dot` is the most recent dot seen; it is only promoted to `last_dot` once a
    // letter or digit follows it, so a trailing dot never counts as part of the host.
    let mut maybe_last_dot = None;
    let mut last_dot = None;
    let mut dot_allowed = false;
    let mut hyphen_allowed = false;
    let mut all_numeric = true;
    let mut maybe_host = true;
    let mut host_ended = false;

    for (i, c) in s.char_indices() {
        let can_be_last = match c {
            // ALPHA
            'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => {
                // Can start or end a domain label, but not numeric
                dot_allowed = true;
                hyphen_allowed = true;
                last_dot = maybe_last_dot;
                all_numeric = false;

                // More host-like characters after the host was already terminated means the
                // text as a whole is not a host.
                if host_ended {
                    maybe_host = false;
                }

                !require_host || !host_ended
            }
            // DIGIT
            '0'..='9' => {
                // Same as above, except numeric
                dot_allowed = true;
                hyphen_allowed = true;
                last_dot = maybe_last_dot;

                if host_ended {
                    maybe_host = false;
                }

                !require_host || !host_ended
            }
            // unreserved
            '-' => {
                // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com`
                if !hyphen_allowed {
                    maybe_host = false;
                }
                // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com`
                dot_allowed = false;
                all_numeric = false;

                !require_host
            }
            '.' => {
                if !dot_allowed {
                    // Label can't be empty, e.g. `.example.com` or `a..com`
                    host_ended = true;
                }
                dot_allowed = false;
                hyphen_allowed = false;
                maybe_last_dot = Some(i);

                false
            }
            '_' | '~' => {
                // Hostnames can't contain these and we don't want to treat them as delimiters.
                maybe_host = false;

                false
            }
            // sub-delims
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => {
                // Can't be in hostnames, but we treat them as delimiters
                host_ended = true;

                if !userinfo_allowed && require_host {
                    // We don't have to look further
                    break;
                }

                false
            }
            ':' => {
                // Could be in userinfo, or we're getting a port now.
                if !userinfo_allowed && !port_allowed {
                    break;
                }

                // Don't advance the last dot when we get to port numbers
                maybe_last_dot = last_dot;

                false
            }
            '@' => {
                if !userinfo_allowed {
                    // We already had userinfo, can't have another `@` in a valid authority.
                    return (None, None);
                }

                // Everything before this has been userinfo, so reset everything we assumed
                // about the host so far. `end` is left alone; it is advanced again by the
                // host characters that follow.
                userinfo_allowed = false;

                maybe_last_dot = None;
                last_dot = None;
                dot_allowed = false;
                hyphen_allowed = false;
                all_numeric = true;
                maybe_host = true;
                host_ended = false;

                false
            }
            '/' => {
                if !require_host {
                    // For schemes where we allow anything, we want to stop at delimiter
                    // characters except if we get a slash closing the URL, which happened here.
                    end = Some(i);
                }
                break;
            }
            _ => {
                // Anything else, this might be the end of the authority (can be empty).
                // Now let the rest of the code handle checking whether the end of the URL is
                // valid.
                break;
            }
        };

        if can_be_last {
            end = Some(i + c.len_utf8());
        }
    }

    // Without a host requirement there is nothing left to validate.
    if !require_host {
        return (end, last_dot);
    }

    if !maybe_host {
        return (None, None);
    }

    // Can't have just a number without dots as the authority
    if all_numeric && last_dot.is_none() && end != Some(0) {
        return (None, None);
    }

    // If we have something that is not just numeric (not an IP address), check that the TLD
    // looks reasonable. This is to avoid linking things like version numbers, e.g. a host
    // part such as `v1.1`.
    if !all_numeric {
        if let Some(last_dot) = last_dot {
            // `.` is a single byte, so `last_dot + 1` is always on a char boundary.
            if !valid_tld(&s[last_dot + 1..]) {
                return (None, None);
            }
        }
    }

    (end, last_dot)
}

/// Returns `true` if `tld` starts with at least two ASCII letters.
///
/// Only the leading run of ASCII letters is inspected; whatever follows it (a port, a path,
/// trailing punctuation) is ignored.
fn valid_tld(tld: &str) -> bool {
    tld.chars()
        .take_while(|c| c.is_ascii_alphabetic())
        .take(2)
        .count()
        >= 2
}

src/email.rs

Lines changed: 6 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::ops::Range;
22

3+
use crate::domains::find_authority_end;
34
use crate::scanner::Scanner;
45

56
/// Scan for email address starting from the trigger character "@".
@@ -40,6 +41,9 @@ impl EmailScanner {
4041
break;
4142
}
4243
atom_boundary = true;
44+
} else if c == '@' {
45+
// In `@me@a.com`, we don't want to extract `me@a.com`.
46+
return None;
4347
} else {
4448
break;
4549
}
@@ -49,40 +53,8 @@ impl EmailScanner {
4953

5054
// See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531
5155
fn find_end(&self, s: &str) -> Option<usize> {
52-
let mut first_in_sub_domain = true;
53-
let mut can_end_sub_domain = false;
54-
let mut first_dot = None;
55-
let mut end = None;
56-
57-
for (i, c) in s.char_indices() {
58-
if first_in_sub_domain {
59-
if Self::sub_domain_allowed(c) {
60-
end = Some(i + c.len_utf8());
61-
first_in_sub_domain = false;
62-
can_end_sub_domain = true;
63-
} else {
64-
break;
65-
}
66-
} else if c == '.' {
67-
if !can_end_sub_domain {
68-
break;
69-
}
70-
first_in_sub_domain = true;
71-
if first_dot.is_none() {
72-
first_dot = Some(i);
73-
}
74-
} else if c == '-' {
75-
can_end_sub_domain = false;
76-
} else if Self::sub_domain_allowed(c) {
77-
end = Some(i + c.len_utf8());
78-
can_end_sub_domain = true;
79-
} else {
80-
break;
81-
}
82-
}
83-
84-
if let Some(end) = end {
85-
if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) {
56+
if let (Some(end), last_dot) = find_authority_end(s, false, true, false) {
57+
if !self.domain_must_have_dot || last_dot.is_some() {
8658
Some(end)
8759
} else {
8860
None
@@ -120,27 +92,4 @@ impl EmailScanner {
12092
_ => c >= '\u{80}',
12193
}
12294
}
123-
124-
// See "sub-domain" in RFC 5321. Extension in RFC 6531 is simplified,
125-
// this can also match invalid domains.
126-
fn sub_domain_allowed(c: char) -> bool {
127-
match c {
128-
'a'..='z' | 'A'..='Z' | '0'..='9' => true,
129-
_ => c >= '\u{80}',
130-
}
131-
}
132-
}
133-
134-
/// Helper function to check if given string is considered an email address.
135-
#[inline]
136-
pub(crate) fn is_mail(input: &str) -> bool {
137-
input
138-
.char_indices()
139-
.filter(|(_, c)| *c == '@')
140-
.any(|(i, _)| {
141-
let scanner = EmailScanner {
142-
domain_must_have_dot: true,
143-
};
144-
scanner.scan(input, i).is_some()
145-
})
14695
}

src/finder.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use memchr::{memchr, memchr2, memchr3};
55

66
use crate::email::EmailScanner;
77
use crate::scanner::Scanner;
8-
use crate::url::UrlScanner;
8+
use crate::url::{DomainScanner, UrlScanner};
99

1010
/// A link found in the input text.
1111
#[derive(Debug)]
@@ -112,6 +112,7 @@ pub struct Links<'t> {
112112
trigger_finder: Box<dyn Fn(&[u8]) -> Option<usize>>,
113113
email_scanner: EmailScanner,
114114
url_scanner: UrlScanner,
115+
domain_scanner: DomainScanner,
115116
}
116117

117118
/// Iterator over spans.
@@ -213,6 +214,7 @@ impl<'t> Links<'t> {
213214
email_domain_must_have_dot: bool,
214215
) -> Links<'t> {
215216
let url_scanner = UrlScanner;
217+
let domain_scanner = DomainScanner;
216218
let email_scanner = EmailScanner {
217219
domain_must_have_dot: email_domain_must_have_dot,
218220
};
@@ -232,6 +234,7 @@ impl<'t> Links<'t> {
232234
trigger_finder,
233235
email_scanner,
234236
url_scanner,
237+
domain_scanner,
235238
}
236239
}
237240
}
@@ -246,7 +249,8 @@ impl<'t> Iterator for Links<'t> {
246249
while let Some(i) = (self.trigger_finder)(slice[find_from..].as_bytes()) {
247250
let trigger = slice.as_bytes()[find_from + i];
248251
let (scanner, kind): (&dyn Scanner, LinkKind) = match trigger {
249-
b':' | b'.' => (&self.url_scanner, LinkKind::Url),
252+
b':' => (&self.url_scanner, LinkKind::Url),
253+
b'.' => (&self.domain_scanner, LinkKind::Url),
250254
b'@' => (&self.email_scanner, LinkKind::Email),
251255
_ => unreachable!(),
252256
};

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
#![deny(missing_docs)]
121121
#![deny(missing_debug_implementations)]
122122

123+
mod domains;
123124
mod email;
124125
mod finder;
125126
mod scanner;

0 commit comments

Comments
 (0)