Skip to content

Commit a0ba679

Browse files
committed
Initial import
1 parent 2e29edb commit a0ba679

File tree

15 files changed

+927
-2
lines changed

15 files changed

+927
-2
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# IDEA
2+
*.iml
3+
/.idea/
4+
5+
# mvn
6+
target/

README.md

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,89 @@
1-
# autolink-java
2-
Java library to extract links (URLs, email addresses) from plain text; fast, small and hopefully smart
1+
autolink-java
2+
=============
3+
4+
Java library to extract links such as URLs and email addresses from plain text.
5+
Fast, small and tries to be smart with matching (text is hard).
6+
7+
Inspired by [Rinku](https://github.com/vmg/rinku). Similar to it, regular
8+
expressions are not used. Instead, the input text is parsed in one pass with
9+
limited backtracking.
10+
11+
Features
12+
--------
13+
14+
### URL extraction
15+
16+
Extracts URLs of the form `scheme://example` with any scheme. URIs such
17+
as `example:test` are not matched (may be added as an option in the future).
18+
If only certain schemes should be allowed, the result can be filtered.
19+
20+
Includes heuristics for not including trailing delimiters such as punctuation
21+
and unbalanced parentheses, see examples below.
22+
23+
Supports internationalized domain names (IDN). Note that they are not validated
24+
and as a result, invalid URLs may be matched.
25+
26+
Example input and the extracted link:
27+
28+
* `http://example.com.` → `http://example.com`
29+
* `http://example.com,` → `http://example.com`
30+
* `(http://example.com)` → `http://example.com`
31+
* `(... (see http://example.com))` → `http://example.com`
32+
* `https://en.wikipedia.org/wiki/Link_(The_Legend_of_Zelda)` →
33+
`https://en.wikipedia.org/wiki/Link_(The_Legend_of_Zelda)`
34+
* `http://üñîçøðé.com/` → `http://üñîçøðé.com/`
35+
36+
### Email address extraction
37+
38+
Extracts emails such as `foo@example.com`. Doesn't support quoted local parts
39+
such as `"this is sparta"@example.com`. Matches international email addresses,
40+
but doesn't verify the domain name (may match too much).
41+
42+
Examples:
43+
44+
* `foo@example.com` → `foo@example.com`
45+
* `foo@example.com.` → `foo@example.com`
46+
* `foo@example.com,` → `foo@example.com`
47+
* `üñîçøðé@üñîçøðé.com` → `üñîçøðé@üñîçøðé.com`
48+
49+
Usage
50+
-----
51+
52+
Extract links:
53+
54+
```java
55+
import org.nibor.autolink.*;
56+
57+
String input = "wow, so example: http://test.com";
58+
LinkExtractor linkExtractor = LinkExtractor.builder().build();
59+
List<Link> links = linkExtractor.getLinks(input);
60+
Link link = links.get(0);
61+
link.getType(); // LinkType.URL
62+
link.getBeginIndex(); // 17
63+
link.getEndIndex(); // 32
64+
input.substring(link.getBeginIndex(), link.getEndIndex()); // "http://test.com"
65+
```
66+
67+
Wrapping URLs in an `<a>` tag (doesn't handle escaping, uses Java 8):
68+
69+
```java
70+
import org.nibor.autolink.*;
71+
72+
String input = "wow http://test.com such linked";
73+
LinkExtractor linkExtractor = LinkExtractor.builder()
74+
.linkTypes(EnumSet.of(LinkType.URL)) // limit to URLs
75+
.build();
76+
String result = Autolink.renderLinks(input, linkExtractor, (link, sb) -> {
77+
sb.append("<a href=\"");
78+
sb.append(input, link.getBeginIndex(), link.getEndIndex());
79+
sb.append("\">");
80+
sb.append(input, link.getBeginIndex(), link.getEndIndex());
81+
sb.append("</a>");
82+
});
83+
result; // wow <a href="http://test.com">http://test.com</a> such linked
84+
```
85+
86+
License
87+
-------
88+
89+
MIT, see [LICENSE](LICENSE) file.

pom.xml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5+
<modelVersion>4.0.0</modelVersion>
6+
7+
<groupId>org.nibor.autolink</groupId>
8+
<artifactId>autolink</artifactId>
9+
<version>0.1.0-SNAPSHOT</version>
10+
11+
<name>autolink-java</name>
12+
<description>
13+
Java library to extract links (URLs, email addresses) from plain text; fast, small and hopefully smart
14+
</description>
15+
<url>https://github.com/robinst/autolink-java</url>
16+
17+
<properties>
18+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
19+
</properties>
20+
21+
<build>
22+
<plugins>
23+
<plugin>
24+
<groupId>org.apache.maven.plugins</groupId>
25+
<artifactId>maven-compiler-plugin</artifactId>
26+
<version>3.3</version>
27+
<configuration>
28+
<source>7</source>
29+
<target>7</target>
30+
</configuration>
31+
</plugin>
32+
</plugins>
33+
</build>
34+
35+
<dependencies>
36+
<dependency>
37+
<groupId>junit</groupId>
38+
<artifactId>junit</artifactId>
39+
<version>4.12</version>
40+
<scope>test</scope>
41+
</dependency>
42+
</dependencies>
43+
44+
<developers>
45+
<developer>
46+
<name>Robin Stocker</name>
47+
<email>robin@nibor.org</email>
48+
<url>https://github.com/robinst/</url>
49+
</developer>
50+
</developers>
51+
52+
<licenses>
53+
<license>
54+
<name>MIT License</name>
55+
<url>http://www.opensource.org/licenses/mit-license.php</url>
56+
</license>
57+
</licenses>
58+
59+
<scm>
60+
<connection>scm:git:git@github.com:robinst/autolink-java.git</connection>
61+
<developerConnection>scm:git:git@github.com:robinst/autolink-java.git</developerConnection>
62+
<url>https://github.com/robinst/autolink-java</url>
63+
</scm>
64+
65+
</project>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package org.nibor.autolink;
2+
3+
import java.util.List;
4+
5+
public class Autolink {
6+
7+
public static String renderLinks(CharSequence input, LinkExtractor linkExtractor, LinkRenderer linkRenderer) {
8+
List<Link> links = linkExtractor.getLinks(input);
9+
StringBuilder sb = new StringBuilder(input.length() + 16);
10+
int lastIndex = 0;
11+
for (Link link : links) {
12+
sb.append(input, lastIndex, link.getBeginIndex());
13+
linkRenderer.render(link, sb);
14+
lastIndex = link.getEndIndex();
15+
}
16+
if (lastIndex < input.length()) {
17+
sb.append(input, lastIndex, input.length());
18+
}
19+
return sb.toString();
20+
}
21+
22+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package org.nibor.autolink;
2+
3+
/**
4+
* Information for an extracted link.
5+
*/
6+
public interface Link {
7+
8+
/**
9+
* @return the type of link
10+
*/
11+
LinkType getType();
12+
13+
/**
14+
* @return begin index (inclusive) in the original input that this link starts at
15+
*/
16+
int getBeginIndex();
17+
18+
/**
19+
* @return end index (exclusive) in the original input that this link ends at; in other words, index of first
20+
* character after link
21+
*/
22+
int getEndIndex();
23+
24+
}
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
package org.nibor.autolink;
2+
3+
import org.nibor.autolink.internal.EmailScanner;
4+
import org.nibor.autolink.internal.Scanner;
5+
import org.nibor.autolink.internal.UrlScanner;
6+
7+
import java.util.*;
8+
9+
/**
10+
* Extracts links from input.
11+
* <p>
12+
* Create and configure an extractor using {@link #builder()}.
13+
*/
14+
public class LinkExtractor {
15+
16+
private static Scanner URL_SCANNER = new UrlScanner();
17+
private static Scanner EMAIL_SCANNER = new EmailScanner();
18+
19+
private final Set<LinkType> linkTypes;
20+
21+
private LinkExtractor(Set<LinkType> linkTypes) {
22+
this.linkTypes = linkTypes;
23+
}
24+
25+
public static Builder builder() {
26+
return new Builder();
27+
}
28+
29+
/**
30+
* Extract the links from the input. Can be called multiple times with different inputs, is thread-safe.
31+
*
32+
* @param input the input text, must not be {@code null}
33+
* @return the links, in order that they appear in the input, never {@code null}
34+
*/
35+
public List<Link> getLinks(CharSequence input) {
36+
List<Link> links = new ArrayList<>();
37+
38+
int rewindIndex = 0;
39+
int[] result = new int[2];
40+
41+
int i = 0;
42+
int length = input.length();
43+
while (i < length) {
44+
Scanner scanner = trigger(input.charAt(i));
45+
if (scanner != null) {
46+
boolean found = scanner.scan(input, i, rewindIndex, result);
47+
if (found) {
48+
links.add(new LinkImpl(scanner.getLinkType(), result[0], result[1]));
49+
rewindIndex = result[1];
50+
i = result[1];
51+
} else {
52+
i++;
53+
}
54+
} else {
55+
i++;
56+
}
57+
}
58+
return links;
59+
}
60+
61+
private Scanner trigger(char c) {
62+
switch (c) {
63+
case ':':
64+
if (linkTypes.contains(LinkType.URL)) {
65+
return URL_SCANNER;
66+
}
67+
break;
68+
case '@':
69+
if (linkTypes.contains(LinkType.EMAIL)) {
70+
return EMAIL_SCANNER;
71+
}
72+
}
73+
return null;
74+
}
75+
76+
/**
77+
* Builder for configuring link extractor.
78+
*/
79+
public static class Builder {
80+
81+
private Set<LinkType> linkTypes = EnumSet.allOf(LinkType.class);
82+
83+
private Builder() {
84+
}
85+
86+
/**
87+
* @param linkTypes the link types that should be extracted (by default, all types are extracted)
88+
* @return this builder
89+
*/
90+
public Builder linkTypes(Set<LinkType> linkTypes) {
91+
this.linkTypes = Objects.requireNonNull(linkTypes, "linkTypes must not be null");
92+
return this;
93+
}
94+
95+
/**
96+
* @return the configured link extractor
97+
*/
98+
public LinkExtractor build() {
99+
return new LinkExtractor(linkTypes);
100+
}
101+
}
102+
103+
private static class LinkImpl implements Link {
104+
105+
private final LinkType linkType;
106+
private final int beginIndex;
107+
private final int endIndex;
108+
109+
private LinkImpl(LinkType linkType, int beginIndex, int endIndex) {
110+
this.linkType = linkType;
111+
this.beginIndex = beginIndex;
112+
this.endIndex = endIndex;
113+
}
114+
115+
@Override
116+
public LinkType getType() {
117+
return linkType;
118+
}
119+
120+
@Override
121+
public int getBeginIndex() {
122+
return beginIndex;
123+
}
124+
125+
@Override
126+
public int getEndIndex() {
127+
return endIndex;
128+
}
129+
130+
@Override
131+
public String toString() {
132+
return "Link{type=" + getType() + ", beginIndex=" + beginIndex + ", endIndex=" + endIndex + "}";
133+
}
134+
}
135+
136+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package org.nibor.autolink;
2+
3+
public interface LinkRenderer {
4+
5+
void render(Link link, StringBuilder sb);
6+
7+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
package org.nibor.autolink;
2+
3+
/**
 * The kinds of links that can be extracted.
 */
public enum LinkType {

    /**
     * URL of the form {@code scheme://example}.
     */
    URL,

    /**
     * Email address such as {@code foo@example.com}.
     */
    EMAIL
}

0 commit comments

Comments
 (0)