Skip to content

Commit d8f2c1d

Browse files
authored
Merge pull request #4 from audriga/separate-sml-extraction
Separate SML extraction and scanner from main code
2 parents 34b7156 + f32a156 commit d8f2c1d

34 files changed

+204
-161
lines changed

README.md

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,29 +24,28 @@ The goal of this library is to support and showcase multiple possible approaches
2424
To create structured email messages, simply use the generator to create a MIME message with structured data included in the HTML body via `<script>` tag:
2525

2626
```java
27-
import com.audriga.jakarta.sml.model.StructuredData;
28-
import com.audriga.jakarta.sml.mime.StructuredMimeMessageWrapper;
29-
import com.audriga.jakarta.sml.mime.InlineHtmlMessageBuilder;
27+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
28+
import com.audriga.jakarta.sml.extension.mime.StructuredMimeMessageWrapper;
29+
import com.audriga.jakarta.sml.extension.mime.InlineHtmlMessageBuilder;
3030
import jakarta.mail.MessagingException;
3131

3232
import java.util.ArrayList;
33-
import java.util.Collections;
3433

3534
public class Example {
3635

3736
public static void main(String[] args) throws MessagingException {
38-
37+
3938
// Comment email content elements
4039
String emailSubject = "My first structured email";
4140
String textEmailBody = "This is a test email";
4241
String htmlEmailBody = "<html><body>This is a <b>test email</b></body></html>";
43-
42+
4443
// Structured data
4544
String jsonLd = "{\r\n \"@context\": \"http://schema.org\",\r\n \"@type\": \"EventReservation\",\r\n \"reservationId\": \"MBE12345\",\r\n \"underName\": {\r\n \"@type\": \"Person\",\r\n \"name\": \"Noah Baumbach\"\r\n },\r\n \"reservationFor\": {\r\n \"@type\": \"Event\",\r\n \"name\": \"Make Better Email 2024\",\r\n \"startDate\": \"2024-10-15\",\r\n \"organizer\": {\r\n \"@type\": \"Organization\",\r\n \"name\": \"Fastmail Pty Ltd.\",\r\n \"logo\": \"https://www.fastmail.com/assets/images/FM-Logo-RGB-IiFj8alCx1-3073.webp\"\r\n },\r\n \"location\": {\r\n \"@type\": \"Place\",\r\n \"name\": \"Isode Ltd\",\r\n \"address\": {\r\n \"@type\": \"PostalAddress\",\r\n \"streetAddress\": \"14 Castle Mews\",\r\n \"addressLocality\": \"Hampton\",\r\n \"addressRegion\": \"Greater London\",\r\n \"postalCode\": \"TW12 2NP\",\r\n \"addressCountry\": \"UK\"\r\n }\r\n }\r\n }\r\n}";
4645

4746
List<StructuredData> structuredDataList = new ArrayList<>();
4847
structuredDataList.add(new StructuredData(jsonLd));
49-
48+
5049
StructuredMimeMessageWrapper message = new InlineHtmlMessageBuilder()
5150
.subject(emailSubject)
5251
.textBody(textEmailBody)
@@ -64,19 +63,17 @@ public class Example {
6463
To parse structured email messages, you can use the provided classes and methods to extract structured data from the email content.
6564

6665
```java
67-
import com.audriga.jakarta.sml.mime.StructuredMimeMessageWrapper;
66+
import com.audriga.jakarta.sml.extension.mime.StructuredMimeMessageWrapper;
6867
import com.audriga.jakarta.sml.parser.StructuredEmailParser;
69-
import com.audriga.jakarta.sml.model.StructuredData;
68+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
7069
import jakarta.mail.internet.MimeMessage;
7170

72-
import java.util.List;
73-
7471
public class Example {
7572

7673
public static void main(String[] args) throws Exception {
77-
74+
7875
MimeMessage message = ... // obtain a MimeMessage instance
79-
76+
8077
StructuredMimeMessageWrapper structuredMessage = new StructuredMimeParser().parseMessage(message);
8178

8279
for (StructuredData data : structuredMessage.getStructuredData()) {
@@ -88,7 +85,7 @@ public class Example {
8885

8986
### Further examples
9087

91-
For more complete examples, see the [MailProcessingTest](test/com/audriga/jakarta/sml/test/MailProcessingTest.java) class, which demonstrates parsing and creating mails.
88+
For more complete examples, see the [MailProcessingTest](test/com/audriga/jakarta/sml/extension/MailProcessingTest.java) class, which demonstrates parsing and creating mails.
9289

9390
## Building
9491

@@ -115,6 +112,11 @@ This project contains an IMAP account scanner command line tool, which can be us
115112

116113
The folder `test/resources/eml` contains several example files generated with this library. Refer to the [example files documentation](docs/example-files.md) for more information.
117114

115+
### H2LJ
116+
117+
Part of the source code of this project can be built as a separate JAR. This can be used to extract structured data
118+
from HTML input. It is called Html2JSONLD (Java). See the [H2LJ documentation](docs/h2lj.md) for details.
119+
118120
## Contributing
119121

120122
Contributions are welcome! Please open new issues or pull requests on GitHub.

build.xml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
</copy>
2828
</target>
2929

30-
<target name="jar" depends="resolve,compile" description="Create a jar file">
30+
<target name="jar-scanner" depends="resolve,compile" description="Create a jar file">
3131
<mkdir dir="${dist-dir}"/>
3232
<jar jarfile="${dist-dir}/__temp.jar">
3333
<zipgroupfileset dir="${lib-source}">
@@ -42,6 +42,20 @@
4242
</jar>
4343
</target>
4444

45+
<!-- Builds the stand-alone H2LJ jar (h2lj.jar in ${dist-dir}) after the resolve and compile targets have run. -->
<!-- NOTE(review): the jar task below sets basedir="${bin}" and also nests a fileset on ${bin}; per the Ant jar task, basedir is itself an implicit fileset, so the archive presumably contains everything under ${bin}, not only the h2lj classes. Confirm against the produced jar. -->
<!-- NOTE(review): the includes pattern is resolved relative to ${bin} and starts with "src/"; verify that compiled classes really live under ${bin}/src/. -->
<target name="jar-h2lj" depends="resolve,compile" description="Create a jar file">
46+
<mkdir dir="${dist-dir}"/>
47+
<jar jarfile="${dist-dir}/h2lj.jar" basedir="${bin}">
48+
<fileset dir="${bin}" includes="src/com/audriga/jakarta/sml/h2lj/**/*.class" />
49+
</jar>
50+
</target>
51+
52+
<!-- Builds the main library jar (jakarta-structured-email.jar in ${dist-dir}) from the h2lj and extension packages, after the resolve and compile targets have run. -->
<!-- NOTE(review): basedir="${bin}" on the jar task is an implicit fileset over the whole ${bin} tree, so the two nested filesets presumably do not restrict the archive contents. Confirm whether that is intended. -->
<!-- NOTE(review): both includes patterns start with "src/" relative to ${bin}; verify that this matches the compile target's output layout. -->
<target name="jar" depends="resolve,compile" description="Create a jar file">
53+
<mkdir dir="${dist-dir}"/>
54+
<jar jarfile="${dist-dir}/jakarta-structured-email.jar" basedir="${bin}">
55+
<fileset dir="${bin}" includes="src/com/audriga/jakarta/sml/h2lj/**/*.class" />
56+
<fileset dir="${bin}" includes="src/com/audriga/jakarta/sml/extension/**/*.class" />
57+
</jar>
58+
</target>
4559

4660
<target name="clean" description="Cleans this project">
4761
<delete dir="${bin}" failonerror="false" />

docs/h2lj.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# H2LJ
2+
3+
## Overview
4+
5+
H2LJ is a Java-based library designed to extract structured data from HTML.
6+
7+
## Building the Project
8+
9+
To compile the source code and create the `h2lj.jar` file, run:
10+
```sh
11+
ant jar-h2lj
12+
```

docs/scanner.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@ This is a command line tool for scanning an IMAP account.
44

55
Its goal is to **find existing messages which contain Schema.org markup** (JSON-LD or Microdata) and to optionally dump the findings as JSON-LD.
66

7-
**Please consider donating test data** (in anonymized/pseudonomized form) to the [schema-org-examples dataset](https://github.com/audriga/schema-org-examples/).
7+
**Please consider donating test data** (in anonymized/pseudonymized form) to the [schema-org-examples dataset](https://github.com/audriga/schema-org-examples/).
88

99
## Building
1010

11-
See build instructions in the [main README](../README.md#Building)
11+
To build the project, use the following command to create the `sml-account-scan.jar` file under `dist/`:
12+
13+
```shell
14+
ant jar-scanner
15+
```
1216

1317
## Running
1418

@@ -37,7 +41,7 @@ See also additional config options below.
3741

3842
### Using with FastMail/Gmail/Microsoft accounts
3943

40-
Some email providers, such as FastMail, Google, and Microsoft recommend OAuth as the default authentication mechnanism. Since this scanner currently does not support OAuth, you can alternatively set up a so-called "app-specific passwords" for those providers.
44+
Some email providers, such as FastMail, Google, and Microsoft, recommend OAuth as the default authentication mechanism. Since this scanner currently does not support OAuth, you can alternatively set up so-called "app-specific passwords" for those providers.
4145

4246
Please see the corresponding provider documentation for details:
4347

src/com/audriga/jakarta/sml/mime/AbstractMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/AbstractMessageBuilder.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
5-
import com.audriga.jakarta.sml.model.StructuredSyntax;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
5+
import com.audriga.jakarta.sml.h2lj.model.StructuredSyntax;
66
import jakarta.activation.DataHandler;
77
import jakarta.mail.Address;
88
import jakarta.mail.MessagingException;

src/com/audriga/jakarta/sml/mime/GenericStructuredMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/GenericStructuredMessageBuilder.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
44
import jakarta.mail.Message;
55
import jakarta.mail.MessagingException;
66
import jakarta.mail.Multipart;

src/com/audriga/jakarta/sml/mime/HtmlOnlyMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/HtmlOnlyMessageBuilder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.Message;
66
import jakarta.mail.MessagingException;
77

src/com/audriga/jakarta/sml/mime/InlineHtmlMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/InlineHtmlMessageBuilder.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.Message;
66
import jakarta.mail.MessagingException;
7-
import jakarta.mail.Session;
8-
9-
import java.util.Properties;
107

118
public class InlineHtmlMessageBuilder extends AbstractMessageBuilder<InlineHtmlMessageBuilder> {
129

src/com/audriga/jakarta/sml/mime/MimeMultipartBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/MimeMultipartBuilder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.activation.DataHandler;
66
import jakarta.mail.MessagingException;
77
import jakarta.mail.internet.MimeBodyPart;

src/com/audriga/jakarta/sml/mime/MultipartAlternativeMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/MultipartAlternativeMessageBuilder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.Message;
66
import jakarta.mail.MessagingException;
77

src/com/audriga/jakarta/sml/mime/MultipartRelatedMessageBuilder.java renamed to src/com/audriga/jakarta/sml/extension/mime/MultipartRelatedMessageBuilder.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.Message;
66
import jakarta.mail.MessagingException;
77
import jakarta.mail.internet.MimeMultipart;

src/com/audriga/jakarta/sml/mime/StructuredDataContentHandler.java renamed to src/com/audriga/jakarta/sml/extension/mime/StructuredDataContentHandler.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
44
import jakarta.activation.ActivationDataFlavor;
55
import jakarta.activation.DataSource;
66
import org.eclipse.angus.mail.handlers.text_plain;

src/com/audriga/jakarta/sml/mime/StructuredDataContentHandlerFactory.java renamed to src/com/audriga/jakarta/sml/extension/mime/StructuredDataContentHandlerFactory.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
44
import jakarta.activation.DataContentHandler;
55
import jakarta.activation.DataContentHandlerFactory;
66

src/com/audriga/jakarta/sml/mime/StructuredMimeMessageWrapper.java renamed to src/com/audriga/jakarta/sml/extension/mime/StructuredMimeMessageWrapper.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.mime;
1+
package com.audriga.jakarta.sml.extension.mime;
22

3-
import com.audriga.jakarta.sml.model.MimeTextContent;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.*;
66
import jakarta.mail.internet.InternetAddress;
77
import jakarta.mail.internet.MimeMessage;

src/com/audriga/jakarta/sml/model/MimeTextContent.java renamed to src/com/audriga/jakarta/sml/extension/model/MimeTextContent.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.audriga.jakarta.sml.model;
1+
package com.audriga.jakarta.sml.extension.model;
22

33
public class MimeTextContent {
44
private String text;

src/com/audriga/jakarta/sml/parser/JakartaMailUtils.java renamed to src/com/audriga/jakarta/sml/extension/parser/JakartaMailUtils.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
package com.audriga.jakarta.sml.parser;
1+
package com.audriga.jakarta.sml.extension.parser;
22

3-
import com.audriga.jakarta.sml.mime.StructuredMimeMessageWrapper;
3+
import com.audriga.jakarta.sml.extension.sender.StructuredMimeParseUtils;
4+
import com.audriga.jakarta.sml.h2lj.parser.StructuredDataExtractionUtils;
5+
import com.audriga.jakarta.sml.extension.mime.StructuredMimeMessageWrapper;
46
import jakarta.mail.Session;
57
import jakarta.mail.internet.MimeMessage;
68

src/com/audriga/jakarta/sml/sender/EmailSender.java renamed to src/com/audriga/jakarta/sml/extension/sender/EmailSender.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
package com.audriga.jakarta.sml.sender;
1+
package com.audriga.jakarta.sml.extension.sender;
22

3-
import com.audriga.jakarta.sml.mime.*;
4-
import com.audriga.jakarta.sml.model.StructuredData;
3+
import com.audriga.jakarta.sml.extension.mime.*;
4+
import com.audriga.jakarta.sml.h2lj.model.StructuredData;
55
import jakarta.mail.*;
66

77
import java.io.IOException;
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package com.audriga.jakarta.sml.extension.sender;
2+
3+
import com.audriga.jakarta.sml.extension.mime.StructuredMimeMessageWrapper;
4+
import com.audriga.jakarta.sml.extension.model.MimeTextContent;
5+
import jakarta.mail.BodyPart;
6+
import jakarta.mail.MessagingException;
7+
import jakarta.mail.internet.ContentType;
8+
import jakarta.mail.internet.MimeMessage;
9+
import jakarta.mail.internet.MimeMultipart;
10+
11+
import java.io.IOException;
12+
import java.util.Arrays;
13+
import java.util.Collections;
14+
import java.util.List;
15+
16+
/**
 * Static helpers that wrap a {@link MimeMessage} in a {@link StructuredMimeMessageWrapper}
 * and locate its HTML and plain-text bodies, descending into multipart containers
 * where necessary.
 */
public class StructuredMimeParseUtils {
17+
// MIME types probed when searching for the plain-text body (TEXT, TEXT_PLAIN, TEXT_ASCII)
// and the HTML body (TEXT_HTML).
protected static final String TEXT = "text";
18+
protected static final String TEXT_PLAIN = "text/plain";
19+
protected static final String TEXT_ASCII = "text/ascii";
20+
protected static final String TEXT_HTML = "text/html";
21+
22+
/**
 * Wraps {@code message} and populates the wrapper's HTML and text bodies.
 *
 * <p>The HTML body is searched for as {@code text/html}; the text body is searched for
 * as any of {@code text}, {@code text/plain} or {@code text/ascii}. A body is set to
 * {@code null} when {@link #parseBody} finds no matching part.
 *
 * @param message the message to wrap and inspect
 * @return the wrapper with both bodies set (either may be {@code null})
 * @throws MessagingException if a called Jakarta Mail operation fails
 * @throws IOException if reading the message content fails
 */
public static StructuredMimeMessageWrapper parseMessage(MimeMessage message) throws MessagingException, IOException {
23+
StructuredMimeMessageWrapper smw = new StructuredMimeMessageWrapper(message);
24+
MimeTextContent htmlContent = parseBody(message, Collections.singletonList(TEXT_HTML));
25+
smw.setHtmlBody(htmlContent);
26+
smw.setTextBody(parseBody(message, Arrays.asList(TEXT, TEXT_PLAIN, TEXT_ASCII)));
27+
28+
return smw;
29+
}
30+
31+
/**
 * Finds the first body whose MIME type matches one of {@code mimeTypes}.
 *
 * <p>If the message itself matches, its content is returned directly. Otherwise, if the
 * message is {@code multipart/*}, its parts are searched recursively.
 *
 * @param message the message to inspect
 * @param mimeTypes the MIME types to accept, checked in list order against the message itself
 * @return the matching content, or {@code null} if nothing matches
 * @throws MessagingException if a called Jakarta Mail operation fails
 * @throws IOException if reading the message content fails
 */
public static MimeTextContent parseBody(MimeMessage message, List<String> mimeTypes) throws MessagingException, IOException {
32+
for (String mimeType : mimeTypes) {
33+
if (message.isMimeType(mimeType)) {
34+
// NOTE(review): this path passes message.getEncoding() (the Content-Transfer-Encoding
// per the Jakarta Mail API) as the second argument, whereas the multipart path passes
// the Content-Type "charset" parameter. Confirm which one MimeTextContent expects.
// Assumes getContent() yields a String for a matching text type. TODO confirm.
return new MimeTextContent((String) message.getContent(), message.getEncoding());
35+
}
36+
}
37+
if (message.isMimeType("multipart/*")) {
38+
MimeMultipart mimeMultipart = (MimeMultipart) message.getContent();
39+
return getBodyFromMultipart(mimeMultipart, mimeTypes);
40+
}
41+
return null;
42+
}
43+
44+
/**
 * Walks the parts of {@code mimeMultipart} in index order and returns the first part
 * matching one of {@code mimeTypes}; nested {@code multipart/*} parts are searched
 * recursively before moving on to the next sibling.
 *
 * @param mimeMultipart the multipart container to search
 * @param mimeTypes the MIME types to accept
 * @return the matching content with its {@code charset} parameter, or {@code null} if no part matches
 * @throws MessagingException if a called Jakarta Mail operation fails
 * @throws IOException if reading part content fails
 */
private static MimeTextContent getBodyFromMultipart(MimeMultipart mimeMultipart, List<String> mimeTypes) throws MessagingException, IOException {
45+
for (int i = 0; i < mimeMultipart.getCount(); i++) {
46+
BodyPart bodyPart = mimeMultipart.getBodyPart(i);
47+
for (String mimeType : mimeTypes) {
48+
if (bodyPart.isMimeType(mimeType)) {
49+
// The whole Content-Type header is lower-cased before parsing, so the charset
// value read below is lower-case as well.
String contentType = bodyPart.getContentType().toLowerCase();
50+
ContentType contentTypeObject = new ContentType(contentType);
51+
// Presumably null when the header carries no charset parameter. Verify downstream handling.
String charset = contentTypeObject.getParameter("charset");
52+
return new MimeTextContent((String) bodyPart.getContent(), charset);
53+
}
54+
}
55+
if (bodyPart.isMimeType("multipart/*")) {
56+
MimeMultipart nestedMultipart = (MimeMultipart) bodyPart.getContent();
57+
MimeTextContent body = getBodyFromMultipart(nestedMultipart, mimeTypes);
58+
// Only stop on a hit; otherwise keep scanning the remaining sibling parts.
if (body != null) {
59+
return body;
60+
}
61+
}
62+
}
63+
return null;
64+
}
65+
}

src/com/audriga/jakarta/sml/model/StructuredContext.java renamed to src/com/audriga/jakarta/sml/h2lj/model/StructuredContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.audriga.jakarta.sml.model;
1+
package com.audriga.jakarta.sml.h2lj.model;
22

33
public enum StructuredContext {
44
SCHEMA_ORG("https://schema.org");

src/com/audriga/jakarta/sml/model/StructuredData.java renamed to src/com/audriga/jakarta/sml/h2lj/model/StructuredData.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.audriga.jakarta.sml.model;
1+
package com.audriga.jakarta.sml.h2lj.model;
22

33
import org.json.JSONObject;
44
import org.jspecify.annotations.NonNull;

src/com/audriga/jakarta/sml/model/StructuredSyntax.java renamed to src/com/audriga/jakarta/sml/h2lj/model/StructuredSyntax.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.audriga.jakarta.sml.model;
1+
package com.audriga.jakarta.sml.h2lj.model;
22

33
public enum StructuredSyntax {
44
JSON_LD,

0 commit comments

Comments
 (0)