Skip to content

Commit 09659ad

Browse files
Support brackets in URL detection (#273)
1 parent 45c8dde commit 09659ad

File tree

2 files changed

+44
-2
lines changed

2 files changed

+44
-2
lines changed

src/main/scala/io/lambdaworks/detection/UrlDetector.scala

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ final class UrlDetector private (
3939
.detect()
4040
.asScala
4141
.toList
42-
.map(lUrl => AbsoluteUrl.parse(sanitize(lUrl.toString)))
42+
.map(url => AbsoluteUrl.parse(sanitize(cleanUrlForBracketMatch(content, url.toString))))
4343
.filter(url => allowedUrl(url) && notEmail(url) && validTopLevelDomain(url))
4444
.toSet
4545
}
@@ -86,6 +86,19 @@ final class UrlDetector private (
8686
/**
 * Decides whether `url` passes the configured host filters.
 *
 * `containsHost` has to hold for every entry of `allowedWithoutWww` and for no entry of
 * `deniedWithoutWww`; a filter without entries therefore never rejects a URL.
 *
 * @param url the parsed URL to check
 * @return `true` when the URL is accepted by both filters
 */
private def allowedUrl(url: AbsoluteUrl): Boolean = {
  def matchesUrl(hosts: NonEmptySet[Host]): Boolean = containsHost(hosts, url)

  allowedWithoutWww.forall(matchesUrl) && !deniedWithoutWww.exists(matchesUrl)
}
8888

89+
/**
 * Re-reads a detected URL from the original text so that bracketed segments are kept.
 *
 * The first occurrence of `url` in `content` is located and the text from that position is
 * consumed for as long as the characters are letters, digits or members of
 * `AllowedSpecialChars`. If the consumed text contains a match of `EmptyParensRegex`,
 * everything from the start of that match onwards is dropped.
 *
 * @param content the full text in which the URL was detected
 * @param url     the URL string reported by the detector
 * @return the re-read URL, or `url` unchanged when it does not occur in `content`
 */
private def cleanUrlForBracketMatch(content: String, url: String): String =
  content.indexOf(url) match {
    case start if start >= 0 =>
      // NOTE(review): indexOf always yields the FIRST occurrence, so a URL whose text also
      // appears earlier in `content` is re-read from that earlier position - confirm this is
      // acceptable for inputs that repeat a URL prefix.
      val candidate = content
        .drop(start)
        .takeWhile(c => c.isLetterOrDigit || AllowedSpecialChars.contains(c))

      EmptyParensRegex.findFirstMatchIn(candidate) match {
        case Some(emptyParens) => candidate.take(emptyParens.start)
        case None              => candidate
      }

    case _ =>
      // The detector output is not a literal substring of the content; keep it untouched.
      url
  }
101+
89102
/**
 * Checks whether any host in `hosts` matches `url`.
 *
 * A host that carries a subdomain must be equal to `url.host`. A host without a subdomain
 * matches when its apex domain is defined and equals the apex domain of `url`.
 *
 * @param hosts the hosts to compare against
 * @param url   the parsed URL being tested
 * @return `true` if at least one host matches
 */
private def containsHost(hosts: NonEmptySet[Host], url: AbsoluteUrl): Boolean =
  hosts.exists { host =>
    host.subdomain match {
      case Some(_) => host == url.host
      case None    => host.apexDomain.exists(apex => url.apexDomain.contains(apex))
    }
  }
91104

@@ -131,7 +144,12 @@ object UrlDetector {
131144
*/
132145
lazy val default: UrlDetector = UrlDetector(UrlDetectorOptions.Default)
133146

134-
private final val SanitizeRegex: Regex = "[,!-.`/]+$".r
147+
/**
 * Non-alphanumeric characters that are accepted as part of a URL when it is re-read from the
 * surrounding text.
 */
private final val AllowedSpecialChars: Set[Char] = "-._~:/?#[]@!$&'()*+,;=%".toSet

/** Matches an empty pair of parentheses followed by any run of non-parenthesis characters. */
private final val EmptyParensRegex: Regex = "\\(\\)[^()]*".r

/**
 * Matches a run of characters anchored to the end of the input, made of a comma, a backtick,
 * a slash, or any character in the range from '!' to '.'.
 *
 * Inside the character class "!-." is a range (U+0021 to U+002E), so the class also covers
 * characters such as '"', '#', '%', '&', '(', ')', '*' and '+'.
 */
private final val SanitizeRegex: Regex = "[,!-.`/]+$".r
135153

136154
implicit private[detection] val orderingHost: Ordering[Host] = orderHost.toOrdering
137155

src/test/scala/io/lambdaworks/detection/UrlDetectorSpec.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ final class UrlDetectorSpec extends AnyFlatSpec with Matchers {
4343
Url.parse("http://test.link/aaa")
4444
)
4545
),
46+
(
47+
"Parse https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN",
48+
Set(
49+
Url.parse(
50+
"https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"
51+
)
52+
)
53+
),
4654
(
4755
"192.168.1.3 255.255.1.34 1234.34.34.5 0.0.0.0 192.168.1.257 2.3.4.5",
4856
Set(
@@ -286,6 +294,22 @@ final class UrlDetectorSpec extends AnyFlatSpec with Matchers {
286294
Url.parse("http://test.link/HWRqhq"),
287295
Url.parse("http://test.link/KeKy")
288296
)
297+
),
298+
(
299+
"Parse https://site.com/(v=1.2)",
300+
Set(Url.parse("https://site.com/(v=1.2"))
301+
),
302+
(
303+
"Parse https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN " +
304+
"and http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product",
305+
Set(
306+
Url.parse(
307+
"https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"
308+
),
309+
Url.parse(
310+
"http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product"
311+
)
312+
)
289313
)
290314
)
291315

0 commit comments

Comments
 (0)