Skip to content

Commit 1a53723

Browse files
authored
Merge pull request #362 from marklogic/feature/marklogic-spark-api
Refactor: Extracted marklogic-spark-api
2 parents 48fc0ba + af95421 commit 1a53723

File tree

20 files changed

+228
-138
lines changed

20 files changed

+228
-138
lines changed

build.gradle

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,32 @@
1+
plugins {
2+
id "java-library"
3+
id "jacoco-report-aggregation"
4+
id "org.sonarqube" version "5.1.0.4882"
5+
}
6+
7+
// See https://docs.gradle.org/current/samples/sample_jvm_multi_project_with_code_coverage_standalone.html .
8+
// This isn't picking up jacoco coverage of all the subprojects. Will figure that out soon.
9+
dependencies {
10+
jacocoAggregation project(':marklogic-langchain4j')
11+
jacocoAggregation project(':marklogic-spark-api')
12+
jacocoAggregation project(':marklogic-spark-langchain4j')
13+
jacocoAggregation project(':marklogic-spark-connector')
14+
}
15+
16+
sonar {
17+
properties {
18+
property "sonar.projectKey", "marklogic-spark"
19+
property "sonar.host.url", "http://localhost:9000"
20+
}
21+
}
22+
123
subprojects {
224
apply plugin: "java-library"
25+
apply plugin: "jacoco"
26+
apply plugin: "org.sonarqube"
327

428
group = "com.marklogic"
5-
version '2.5-SNAPSHOT'
29+
version "2.5-SNAPSHOT"
630

731
java {
832
sourceCompatibility = 11
@@ -19,9 +43,18 @@ subprojects {
1943

2044
test {
2145
useJUnitPlatform()
46+
finalizedBy jacocoTestReport
2247
testLogging {
2348
events 'started', 'passed', 'skipped', 'failed'
2449
exceptionFormat 'full'
2550
}
2651
}
52+
53+
// See https://docs.gradle.org/current/userguide/jacoco_plugin.html .
54+
jacocoTestReport {
55+
dependsOn test
56+
reports {
57+
xml.required = true
58+
}
59+
}
2760
}

gradle/wrapper/gradle-wrapper.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
distributionBase=GRADLE_USER_HOME
22
distributionPath=wrapper/dists
3-
distributionUrl=https\://services.gradle.org/distributions/gradle-8.9-bin.zip
3+
distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
44
networkTimeout=10000
55
validateDistributionUrl=true
66
zipStoreBase=GRADLE_USER_HOME

marklogic-spark-api/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This module defines the public API that marklogic-spark-langchain4j can depend on without depending on Spark or the
2+
connector itself.

marklogic-spark-api/build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
dependencies {
2+
implementation "org.slf4j:jcl-over-slf4j:2.0.13"
3+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark;
5+
6+
import java.io.Serializable;
7+
import java.util.Map;
8+
import java.util.stream.Stream;
9+
10+
public abstract class Context implements Serializable {
11+
12+
protected final Map<String, String> properties;
13+
14+
protected Context(Map<String, String> properties) {
15+
this.properties = properties;
16+
}
17+
18+
public final boolean hasOption(String... options) {
19+
return Stream.of(options)
20+
.anyMatch(option -> properties.get(option) != null && properties.get(option).trim().length() > 0);
21+
}
22+
23+
public final String getStringOption(String option) {
24+
return getStringOption(option, null);
25+
}
26+
27+
public final String getStringOption(String option, String defaultValue) {
28+
return hasOption(option) ? properties.get(option).trim() : defaultValue;
29+
}
30+
31+
public final int getIntOption(String optionName, int defaultValue, int minimumValue) {
32+
return (int) getNumericOption(optionName, defaultValue, minimumValue);
33+
}
34+
35+
public final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
36+
try {
37+
long value = this.getProperties().containsKey(optionName) ?
38+
Long.parseLong(this.getProperties().get(optionName)) :
39+
defaultValue;
40+
if (value != defaultValue && value < minimumValue) {
41+
throw new ConnectorException(String.format("The value of '%s' must be %d or greater.", getOptionNameForMessage(optionName), minimumValue));
42+
}
43+
return value;
44+
} catch (NumberFormatException ex) {
45+
throw new ConnectorException(String.format("The value of '%s' must be numeric.", getOptionNameForMessage(optionName)), ex);
46+
}
47+
}
48+
49+
public final boolean getBooleanOption(String option, boolean defaultValue) {
50+
return hasOption(option) ? Boolean.parseBoolean(getStringOption(option)) : defaultValue;
51+
}
52+
53+
public final String getOptionNameForMessage(String option) {
54+
return Util.getOptionNameForErrorMessage(option);
55+
}
56+
57+
public Map<String, String> getProperties() {
58+
return properties;
59+
}
60+
}

marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java renamed to marklogic-spark-api/src/main/java/com/marklogic/spark/Util.java

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
*/
44
package com.marklogic.spark;
55

6-
import com.marklogic.client.document.DocumentManager;
76
import org.slf4j.Logger;
87
import org.slf4j.LoggerFactory;
98

@@ -73,21 +72,4 @@ static String getOptionNameForErrorMessage(String option) {
7372
String optionName = bundle.getString(option);
7473
return optionName != null && optionName.trim().length() > 0 ? optionName.trim() : option;
7574
}
76-
77-
static Set<DocumentManager.Metadata> getRequestedMetadata(ContextSupport context) {
78-
Set<DocumentManager.Metadata> set = new HashSet<>();
79-
if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
80-
for (String category : context.getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
81-
if ("content".equalsIgnoreCase(category)) {
82-
continue;
83-
}
84-
if ("metadata".equalsIgnoreCase(category)) {
85-
set.add(DocumentManager.Metadata.ALL);
86-
} else {
87-
set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
88-
}
89-
}
90-
}
91-
return set;
92-
}
9375
}

marklogic-spark-connector/build.gradle

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ plugins {
33
id 'com.gradleup.shadow' version '8.3.3'
44
id "com.marklogic.ml-gradle" version "5.0.0"
55
id 'maven-publish'
6-
id "jacoco"
7-
id "org.sonarqube" version "5.1.0.4882"
86
}
97

108
configurations {
@@ -29,12 +27,14 @@ configurations.all {
2927

3028
dependencies {
3129
// This is compileOnly as any environment this is used in will provide the Spark dependencies itself.
32-
compileOnly ('org.apache.spark:spark-sql_2.12:' + sparkVersion) {
30+
compileOnly('org.apache.spark:spark-sql_2.12:' + sparkVersion) {
3331
// Excluded from our ETL tool for size reasons, so excluded here as well to ensure we don't need it.
3432
exclude module: "rocksdbjni"
3533
}
3634

37-
shadowDependencies (project(":marklogic-langchain4j")) {
35+
shadowDependencies project(":marklogic-spark-api")
36+
37+
shadowDependencies("com.marklogic:marklogic-client-api:7.0.0") {
3838
// The Java Client uses Jackson 2.15.2; Scala 3.4.x does not yet support that and will throw the following error:
3939
// Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
4040
// So the 4 Jackson modules are excluded to allow for Spark's to be used.
@@ -43,7 +43,7 @@ dependencies {
4343
}
4444

4545
// Required for converting JSON to XML. Using 2.15.2 to align with Spark 3.5.3.
46-
shadowDependencies ("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.15.2") {
46+
shadowDependencies("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.15.2") {
4747
// Not needed, as the modules in this group that this dependency depends on are all provided by Spark.
4848
exclude group: "com.fasterxml.jackson.core"
4949
}
@@ -52,26 +52,30 @@ dependencies {
5252
shadowDependencies 'com.squareup.okhttp3:okhttp:4.12.0'
5353

5454
// Supports reading and writing RDF data.
55-
shadowDependencies ("org.apache.jena:jena-arq:4.10.0") {
55+
shadowDependencies("org.apache.jena:jena-arq:4.10.0") {
5656
exclude group: "com.fasterxml.jackson.core"
5757
exclude group: "com.fasterxml.jackson.dataformat"
5858
}
5959

60+
// Allows us to test the langchain4j-specific features without including them in the connector by default.
61+
testImplementation project(":marklogic-langchain4j")
62+
testImplementation project(":marklogic-spark-langchain4j")
63+
6064
// Supports testing the embedder feature.
6165
testImplementation "dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:0.35.0"
6266

6367
// Needed for some XML operations that are far easier with JDOM2 than with DOM.
6468
shadowDependencies "org.jdom:jdom2:2.0.6.1"
6569

66-
testImplementation ('com.marklogic:ml-app-deployer:5.0.0') {
70+
testImplementation('com.marklogic:ml-app-deployer:5.0.0') {
6771
exclude group: "com.fasterxml.jackson.core"
6872
exclude group: "com.fasterxml.jackson.dataformat"
6973

7074
// Use the Java Client declared above.
7175
exclude module: "marklogic-client-api"
7276
}
7377

74-
testImplementation ('com.marklogic:marklogic-junit5:1.5.0') {
78+
testImplementation('com.marklogic:marklogic-junit5:1.5.0') {
7579
exclude group: "com.fasterxml.jackson.core"
7680
exclude group: "com.fasterxml.jackson.dataformat"
7781

@@ -85,26 +89,10 @@ dependencies {
8589
}
8690

8791
test {
88-
finalizedBy jacocoTestReport
8992
// Allows mlHost to override the value in gradle.properties, which the test plumbing will default to.
9093
environment "mlHost", mlHost
9194
}
9295

93-
// See https://docs.gradle.org/current/userguide/jacoco_plugin.html .
94-
jacocoTestReport {
95-
dependsOn test
96-
reports {
97-
xml.required = true
98-
}
99-
}
100-
101-
sonar {
102-
properties {
103-
property "sonar.projectKey", "marklogic-spark"
104-
property "sonar.host.url", "http://localhost:9000"
105-
}
106-
}
107-
10896
task reloadTestData(type: com.marklogic.gradle.task.MarkLogicTask) {
10997
description = "Convenience task for clearing the test database and reloading the test data; only intended for a connector developer to use."
11098
doLast {

marklogic-spark-connector/src/main/java/com/marklogic/spark/ContextSupport.java

Lines changed: 23 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,22 @@
55

66
import com.marklogic.client.DatabaseClient;
77
import com.marklogic.client.DatabaseClientFactory;
8+
import com.marklogic.client.document.DocumentManager;
89
import com.marklogic.client.extra.okhttpclient.OkHttpClientConfigurator;
910
import org.slf4j.Logger;
1011
import org.slf4j.LoggerFactory;
1112

1213
import java.io.Serializable;
1314
import java.util.HashMap;
15+
import java.util.HashSet;
1416
import java.util.Map;
17+
import java.util.Set;
1518
import java.util.concurrent.TimeUnit;
19+
import java.util.stream.Stream;
1620

17-
public class ContextSupport implements Serializable {
21+
public class ContextSupport extends Context implements Serializable {
1822

1923
protected static final Logger logger = LoggerFactory.getLogger(ContextSupport.class);
20-
private final Map<String, String> properties;
2124
private final boolean configuratorWasAdded;
2225

2326
// Java Client 6.5.0 has a bug in it (to be fixed in 6.5.1 or 6.6.0) where multiple threads that use a configurator
@@ -26,7 +29,7 @@ public class ContextSupport implements Serializable {
2629
private static final Object CLIENT_LOCK = new Object();
2730

2831
public ContextSupport(Map<String, String> properties) {
29-
this.properties = properties;
32+
super(properties);
3033
this.configuratorWasAdded = addOkHttpConfiguratorIfNecessary();
3134
}
3235

@@ -92,10 +95,6 @@ public final boolean optionExists(String option) {
9295
return value != null && value.trim().length() > 0;
9396
}
9497

95-
public final String getOptionNameForMessage(String option) {
96-
return Util.getOptionNameForErrorMessage(option);
97-
}
98-
9998
private void parseConnectionString(String value, Map<String, String> connectionProps) {
10099
ConnectionString connectionString = new ConnectionString(value, getOptionNameForMessage("spark.marklogic.client.uri"));
101100
connectionProps.put(Options.CLIENT_USERNAME, connectionString.getUsername());
@@ -109,24 +108,6 @@ private void parseConnectionString(String value, Map<String, String> connectionP
109108
}
110109
}
111110

112-
public final int getIntOption(String optionName, int defaultValue, int minimumValue) {
113-
return (int) getNumericOption(optionName, defaultValue, minimumValue);
114-
}
115-
116-
public final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
117-
try {
118-
long value = this.getProperties().containsKey(optionName) ?
119-
Long.parseLong(this.getProperties().get(optionName)) :
120-
defaultValue;
121-
if (value != defaultValue && value < minimumValue) {
122-
throw new ConnectorException(String.format("The value of '%s' must be %d or greater.", getOptionNameForMessage(optionName), minimumValue));
123-
}
124-
return value;
125-
} catch (NumberFormatException ex) {
126-
throw new ConnectorException(String.format("The value of '%s' must be numeric.", getOptionNameForMessage(optionName)), ex);
127-
}
128-
}
129-
130111
/**
131112
* Only intended for "write" use cases; an error on "read" is always expected to be propagated to the user.
132113
*
@@ -141,30 +122,10 @@ public final boolean isDirectConnection() {
141122
return value != null && value.equalsIgnoreCase(DatabaseClient.ConnectionType.DIRECT.name());
142123
}
143124

144-
public final boolean hasOption(String... options) {
145-
return Util.hasOption(this.properties, options);
146-
}
147-
148-
public final String getStringOption(String option) {
149-
return getStringOption(option, null);
150-
}
151-
152-
public final String getStringOption(String option, String defaultValue) {
153-
return hasOption(option) ? properties.get(option).trim() : defaultValue;
154-
}
155-
156-
public final boolean getBooleanOption(String option, boolean defaultValue) {
157-
return hasOption(option) ? Boolean.parseBoolean(getStringOption(option)) : defaultValue;
158-
}
159-
160125
public final boolean isStreamingFiles() {
161126
return "true".equalsIgnoreCase(getStringOption(Options.STREAM_FILES));
162127
}
163128

164-
public Map<String, String> getProperties() {
165-
return properties;
166-
}
167-
168129
/**
169130
* @return true if a configurator was added
170131
*/
@@ -197,4 +158,21 @@ private boolean addOkHttpConfiguratorIfNecessary() {
197158
}
198159
return false;
199160
}
161+
162+
public static Set<DocumentManager.Metadata> getRequestedMetadata(ContextSupport context) {
163+
Set<DocumentManager.Metadata> set = new HashSet<>();
164+
if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
165+
for (String category : context.getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
166+
if ("content".equalsIgnoreCase(category)) {
167+
continue;
168+
}
169+
if ("metadata".equalsIgnoreCase(category)) {
170+
set.add(DocumentManager.Metadata.ALL);
171+
} else {
172+
set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
173+
}
174+
}
175+
}
176+
return set;
177+
}
200178
}

0 commit comments

Comments
 (0)