Skip to content

Commit cacf9de

Browse files
committed
Exclude more Unicode whitespace characters (#14)
1 parent ff07502 commit cacf9de

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99
### Changed
1010
- Don't autolink if authority is only "end" characters, e.g. `http://.` or
1111
`http://"` (#15)
12+
- Stop URLs at Unicode whitespace characters such as U+00A0 NO-BREAK SPACE,
13+
thanks @otopba!
1214

1315
## [0.6.0] - 2016-11-07
1416
### Added

src/main/java/org/nibor/autolink/internal/Scanners.java

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,31 @@ public static int findUrlEnd(CharSequence input, int beginIndex) {
9797
case '\u009D':
9898
case '\u009E':
9999
case '\u009F':
100-
case '\u00A0': //non-breaking space
101-
// These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987.
100+
// The above can never be part of a URL, so stop now. See RFC 3986 and RFC 3987.
102101
// Some characters are not in the above list, even though they are not in "unreserved" or "reserved":
103102
// '"', '\\', '^', '`', '{', '|', '}'
104103
// The reason for this is that other link detectors also allow them. Also see below, we require
105104
// the quote and the braces to be balanced.
105+
case '\u00A0': // no-break space
106+
case '\u2000': // en quad
107+
case '\u2001': // em quad
108+
case '\u2002': // en space
109+
case '\u2003': // em space
110+
case '\u2004': // three-per-em space
111+
case '\u2005': // four-per-em space
112+
case '\u2006': // six-per-em space
113+
case '\u2007': // figure space
114+
case '\u2008': // punctuation space
115+
case '\u2009': // thin space
116+
case '\u200A': // hair space
117+
case '\u2028': // line separator
118+
case '\u2029': // paragraph separator
119+
case '\u202F': // narrow no-break space
120+
case '\u205F': // medium mathematical space
121+
case '\u3000': // ideographic space
122+
// While these are allowed by RFC 3987, they are Unicode whitespace characters
123+
// that look like a space, so it would be confusing not to end URLs at them.
124+
// They are also excluded from IDNs by some browsers.
106125
break loop;
107126
case '?':
108127
case '!':

src/test/java/org/nibor/autolink/AutolinkUrlTest.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,33 @@ public void international() {
190190
assertLinked("http://example.org/\u00A2", "|http://example.org/\u00A2|");
191191
}
192192

193+
@Test
194+
public void unicodeWhitespace() {
195+
char[] whitespace = new char[] {
196+
'\u00A0', // no-break space
197+
'\u2000', // en quad
198+
'\u2001', // em quad
199+
'\u2002', // en space
200+
'\u2003', // em space
201+
'\u2004', // three-per-em space
202+
'\u2005', // four-per-em space
203+
'\u2006', // six-per-em space
204+
'\u2007', // figure space
205+
'\u2008', // punctuation space
206+
'\u2009', // thin space
207+
'\u200A', // hair space
208+
'\u2028', // line separator
209+
'\u2029', // paragraph separator
210+
'\u202F', // narrow no-break space
211+
'\u205F', // medium mathematical space
212+
'\u3000', // ideographic space
213+
};
214+
215+
for (char c : whitespace) {
216+
assertLinked("http://example.org" + c, "|http://example.org|" + c);
217+
}
218+
}
219+
193220
@Test
194221
public void replyLevel() {
195222
assertLinked(">http://example.org/", ">|http://example.org/|");

0 commit comments

Comments
 (0)