
Commit af95421

Refactor: Extracted marklogic-spark-api
We now have loose coupling between the connector and the langchain4j modules. For the 2.5.0 release, it'll all be combined, but this prepares us for upgrading langchain4j in the near future. Haven't gotten jacoco working across multiple subprojects yet; that's an issue in Flux too, and will be figured out soon.
1 parent 48fc0ba

File tree

20 files changed (+228, -138 lines)


build.gradle

Lines changed: 34 additions & 1 deletion
@@ -1,8 +1,32 @@
+plugins {
+    id "java-library"
+    id "jacoco-report-aggregation"
+    id "org.sonarqube" version "5.1.0.4882"
+}
+
+// See https://docs.gradle.org/current/samples/sample_jvm_multi_project_with_code_coverage_standalone.html .
+// This isn't picking up jacoco coverage of all the subprojects. Will figure that out soon.
+dependencies {
+    jacocoAggregation project(':marklogic-langchain4j')
+    jacocoAggregation project(':marklogic-spark-api')
+    jacocoAggregation project(':marklogic-spark-langchain4j')
+    jacocoAggregation project(':marklogic-spark-connector')
+}
+
+sonar {
+    properties {
+        property "sonar.projectKey", "marklogic-spark"
+        property "sonar.host.url", "http://localhost:9000"
+    }
+}
+
 subprojects {
     apply plugin: "java-library"
+    apply plugin: "jacoco"
+    apply plugin: "org.sonarqube"
 
     group = "com.marklogic"
-    version '2.5-SNAPSHOT'
+    version "2.5-SNAPSHOT"
 
     java {
         sourceCompatibility = 11
@@ -19,9 +43,18 @@ subprojects {
 
     test {
         useJUnitPlatform()
+        finalizedBy jacocoTestReport
         testLogging {
            events 'started', 'passed', 'skipped', 'failed'
            exceptionFormat 'full'
         }
     }
+
+    // See https://docs.gradle.org/current/userguide/jacoco_plugin.html .
+    jacocoTestReport {
+        dependsOn test
+        reports {
+            xml.required = true
+        }
+    }
 }

gradle/wrapper/gradle-wrapper.properties

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.9-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME

marklogic-spark-api/README.md

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+This module defines the public API that marklogic-spark-langchain4j can depend on without depending on Spark or the
+connector itself.
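
As an illustration of that boundary, here is a hypothetical consumer (not part of this commit; the package, class name, and option name are invented for the example) that reads connector options through the new Context abstraction while importing nothing from Spark or the connector:

package com.marklogic.langchain4j.example; // hypothetical package

import com.marklogic.spark.Context;

// Depends only on marklogic-spark-api types, not on Spark or the connector itself.
public class EmbedderSettings {

    private final int batchSize;

    public EmbedderSettings(Context context) {
        // The option name is illustrative; default of 10, minimum of 1.
        this.batchSize = context.getIntOption("spark.marklogic.write.embedder.batchSize", 10, 1);
    }

    public int getBatchSize() {
        return batchSize;
    }
}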

marklogic-spark-api/build.gradle

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+dependencies {
+    implementation "org.slf4j:jcl-over-slf4j:2.0.13"
+}
marklogic-spark-api/src/main/java/com/marklogic/spark/Context.java

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.stream.Stream;
+
+public abstract class Context implements Serializable {
+
+    protected final Map<String, String> properties;
+
+    protected Context(Map<String, String> properties) {
+        this.properties = properties;
+    }
+
+    public final boolean hasOption(String... options) {
+        return Stream.of(options)
+            .anyMatch(option -> properties.get(option) != null && properties.get(option).trim().length() > 0);
+    }
+
+    public final String getStringOption(String option) {
+        return getStringOption(option, null);
+    }
+
+    public final String getStringOption(String option, String defaultValue) {
+        return hasOption(option) ? properties.get(option).trim() : defaultValue;
+    }
+
+    public final int getIntOption(String optionName, int defaultValue, int minimumValue) {
+        return (int) getNumericOption(optionName, defaultValue, minimumValue);
+    }
+
+    public final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
+        try {
+            long value = this.getProperties().containsKey(optionName) ?
+                Long.parseLong(this.getProperties().get(optionName)) :
+                defaultValue;
+            if (value != defaultValue && value < minimumValue) {
+                throw new ConnectorException(String.format("The value of '%s' must be %d or greater.", getOptionNameForMessage(optionName), minimumValue));
+            }
+            return value;
+        } catch (NumberFormatException ex) {
+            throw new ConnectorException(String.format("The value of '%s' must be numeric.", getOptionNameForMessage(optionName)), ex);
+        }
+    }
+
+    public final boolean getBooleanOption(String option, boolean defaultValue) {
+        return hasOption(option) ? Boolean.parseBoolean(getStringOption(option)) : defaultValue;
+    }
+
+    public final String getOptionNameForMessage(String option) {
+        return Util.getOptionNameForErrorMessage(option);
+    }
+
+    public Map<String, String> getProperties() {
+        return properties;
+    }
+}
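
The option helpers above centralize trimming, defaulting, and validation. A minimal usage sketch (hypothetical test code, not part of this commit; the option names and the anonymous subclass are invented for illustration):

package com.marklogic.spark; // same package, so the protected constructor is accessible

import java.util.Map;

public class ContextUsageSketch {

    public static void main(String[] args) {
        // Context is abstract, so instantiate an anonymous subclass.
        Context context = new Context(Map.of(
            "spark.marklogic.read.batchSize", "50",    // illustrative option names
            "spark.marklogic.read.logProgress", "true"
        )) {
        };

        // hasOption treats null or blank values as "not set".
        boolean hasBatchSize = context.hasOption("spark.marklogic.read.batchSize"); // true

        // The minimum is enforced only when the value differs from the default.
        int batchSize = context.getIntOption("spark.marklogic.read.batchSize", 100, 1); // 50

        boolean logProgress = context.getBooleanOption("spark.marklogic.read.logProgress", false); // true

        System.out.println(hasBatchSize + " " + batchSize + " " + logProgress);
    }
}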

marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java renamed to marklogic-spark-api/src/main/java/com/marklogic/spark/Util.java

Lines changed: 0 additions & 18 deletions
@@ -3,7 +3,6 @@
  */
 package com.marklogic.spark;
 
-import com.marklogic.client.document.DocumentManager;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -73,21 +72,4 @@ static String getOptionNameForErrorMessage(String option) {
         String optionName = bundle.getString(option);
         return optionName != null && optionName.trim().length() > 0 ? optionName.trim() : option;
     }
-
-    static Set<DocumentManager.Metadata> getRequestedMetadata(ContextSupport context) {
-        Set<DocumentManager.Metadata> set = new HashSet<>();
-        if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
-            for (String category : context.getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
-                if ("content".equalsIgnoreCase(category)) {
-                    continue;
-                }
-                if ("metadata".equalsIgnoreCase(category)) {
-                    set.add(DocumentManager.Metadata.ALL);
-                } else {
-                    set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
-                }
-            }
-        }
-        return set;
-    }
 }

marklogic-spark-connector/build.gradle

Lines changed: 12 additions & 24 deletions
@@ -3,8 +3,6 @@ plugins {
     id 'com.gradleup.shadow' version '8.3.3'
     id "com.marklogic.ml-gradle" version "5.0.0"
     id 'maven-publish'
-    id "jacoco"
-    id "org.sonarqube" version "5.1.0.4882"
 }
 
 configurations {
@@ -29,12 +27,14 @@ configurations.all {
 
 dependencies {
     // This is compileOnly as any environment this is used in will provide the Spark dependencies itself.
-    compileOnly ('org.apache.spark:spark-sql_2.12:' + sparkVersion) {
+    compileOnly('org.apache.spark:spark-sql_2.12:' + sparkVersion) {
         // Excluded from our ETL tool for size reasons, so excluded here as well to ensure we don't need it.
         exclude module: "rocksdbjni"
     }
 
-    shadowDependencies (project(":marklogic-langchain4j")) {
+    shadowDependencies project(":marklogic-spark-api")
+
+    shadowDependencies("com.marklogic:marklogic-client-api:7.0.0") {
         // The Java Client uses Jackson 2.15.2; Scala 3.4.x does not yet support that and will throw the following error:
         // Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
         // So the 4 Jackson modules are excluded to allow for Spark's to be used.
@@ -43,7 +43,7 @@ dependencies {
     }
 
     // Required for converting JSON to XML. Using 2.15.2 to align with Spark 3.5.3.
-    shadowDependencies ("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.15.2") {
+    shadowDependencies("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.15.2") {
         // Not needed, as the modules in this group that this dependency depends on are all provided by Spark.
         exclude group: "com.fasterxml.jackson.core"
     }
@@ -52,26 +52,30 @@
     shadowDependencies 'com.squareup.okhttp3:okhttp:4.12.0'
 
     // Supports reading and writing RDF data.
-    shadowDependencies ("org.apache.jena:jena-arq:4.10.0") {
+    shadowDependencies("org.apache.jena:jena-arq:4.10.0") {
         exclude group: "com.fasterxml.jackson.core"
         exclude group: "com.fasterxml.jackson.dataformat"
     }
 
+    // Allows us to test the langchain4j-specific features without including them in the connector by default.
+    testImplementation project(":marklogic-langchain4j")
+    testImplementation project(":marklogic-spark-langchain4j")
+
     // Supports testing the embedder feature.
     testImplementation "dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:0.35.0"
 
     // Needed for some XML operations that are far easier with JDOM2 than with DOM.
     shadowDependencies "org.jdom:jdom2:2.0.6.1"
 
-    testImplementation ('com.marklogic:ml-app-deployer:5.0.0') {
+    testImplementation('com.marklogic:ml-app-deployer:5.0.0') {
         exclude group: "com.fasterxml.jackson.core"
         exclude group: "com.fasterxml.jackson.dataformat"
 
         // Use the Java Client declared above.
         exclude module: "marklogic-client-api"
     }
 
-    testImplementation ('com.marklogic:marklogic-junit5:1.5.0') {
+    testImplementation('com.marklogic:marklogic-junit5:1.5.0') {
         exclude group: "com.fasterxml.jackson.core"
         exclude group: "com.fasterxml.jackson.dataformat"
 
@@ -85,26 +89,10 @@
 }
 
 test {
-    finalizedBy jacocoTestReport
     // Allows mlHost to override the value in gradle.properties, which the test plumbing will default to.
    environment "mlHost", mlHost
 }
 
-// See https://docs.gradle.org/current/userguide/jacoco_plugin.html .
-jacocoTestReport {
-    dependsOn test
-    reports {
-        xml.required = true
-    }
-}
-
-sonar {
-    properties {
-        property "sonar.projectKey", "marklogic-spark"
-        property "sonar.host.url", "http://localhost:9000"
-    }
-}
-
 task reloadTestData(type: com.marklogic.gradle.task.MarkLogicTask) {
     description = "Convenience task for clearing the test database and reloading the test data; only intended for a connector developer to use."
     doLast {

marklogic-spark-connector/src/main/java/com/marklogic/spark/ContextSupport.java

Lines changed: 23 additions & 45 deletions
@@ -5,19 +5,22 @@
 
 import com.marklogic.client.DatabaseClient;
 import com.marklogic.client.DatabaseClientFactory;
+import com.marklogic.client.document.DocumentManager;
 import com.marklogic.client.extra.okhttpclient.OkHttpClientConfigurator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.Serializable;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
 
-public class ContextSupport implements Serializable {
+public class ContextSupport extends Context implements Serializable {
 
     protected static final Logger logger = LoggerFactory.getLogger(ContextSupport.class);
-    private final Map<String, String> properties;
     private final boolean configuratorWasAdded;
 
     // Java Client 6.5.0 has a bug in it (to be fixed in 6.5.1 or 6.6.0) where multiple threads that use a configurator
@@ -26,7 +29,7 @@ public class ContextSupport implements Serializable {
     private static final Object CLIENT_LOCK = new Object();
 
     public ContextSupport(Map<String, String> properties) {
-        this.properties = properties;
+        super(properties);
         this.configuratorWasAdded = addOkHttpConfiguratorIfNecessary();
     }
 
@@ -92,10 +95,6 @@ public final boolean optionExists(String option) {
         return value != null && value.trim().length() > 0;
     }
 
-    public final String getOptionNameForMessage(String option) {
-        return Util.getOptionNameForErrorMessage(option);
-    }
-
     private void parseConnectionString(String value, Map<String, String> connectionProps) {
         ConnectionString connectionString = new ConnectionString(value, getOptionNameForMessage("spark.marklogic.client.uri"));
         connectionProps.put(Options.CLIENT_USERNAME, connectionString.getUsername());
@@ -109,24 +108,6 @@ private void parseConnectionString(String value, Map<String, String> connectionP
         }
     }
 
-    public final int getIntOption(String optionName, int defaultValue, int minimumValue) {
-        return (int) getNumericOption(optionName, defaultValue, minimumValue);
-    }
-
-    public final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
-        try {
-            long value = this.getProperties().containsKey(optionName) ?
-                Long.parseLong(this.getProperties().get(optionName)) :
-                defaultValue;
-            if (value != defaultValue && value < minimumValue) {
-                throw new ConnectorException(String.format("The value of '%s' must be %d or greater.", getOptionNameForMessage(optionName), minimumValue));
-            }
-            return value;
-        } catch (NumberFormatException ex) {
-            throw new ConnectorException(String.format("The value of '%s' must be numeric.", getOptionNameForMessage(optionName)), ex);
-        }
-    }
-
     /**
      * Only intended for "write" use cases; an error on "read" is always expected to be propagated to the user.
      *
@@ -141,30 +122,10 @@ public final boolean isDirectConnection() {
         return value != null && value.equalsIgnoreCase(DatabaseClient.ConnectionType.DIRECT.name());
     }
 
-    public final boolean hasOption(String... options) {
-        return Util.hasOption(this.properties, options);
-    }
-
-    public final String getStringOption(String option) {
-        return getStringOption(option, null);
-    }
-
-    public final String getStringOption(String option, String defaultValue) {
-        return hasOption(option) ? properties.get(option).trim() : defaultValue;
-    }
-
-    public final boolean getBooleanOption(String option, boolean defaultValue) {
-        return hasOption(option) ? Boolean.parseBoolean(getStringOption(option)) : defaultValue;
-    }
-
     public final boolean isStreamingFiles() {
         return "true".equalsIgnoreCase(getStringOption(Options.STREAM_FILES));
     }
 
-    public Map<String, String> getProperties() {
-        return properties;
-    }
-
     /**
      * @return true if a configurator was added
     */
@@ -197,4 +158,21 @@ private boolean addOkHttpConfiguratorIfNecessary() {
         }
         return false;
     }
+
+    public static Set<DocumentManager.Metadata> getRequestedMetadata(ContextSupport context) {
+        Set<DocumentManager.Metadata> set = new HashSet<>();
+        if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
+            for (String category : context.getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
+                if ("content".equalsIgnoreCase(category)) {
+                    continue;
+                }
+                if ("metadata".equalsIgnoreCase(category)) {
+                    set.add(DocumentManager.Metadata.ALL);
+                } else {
+                    set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
+                }
+            }
+        }
+        return set;
+    }
 }
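
The relocated getRequestedMetadata method maps the comma-separated categories option onto the Java Client's DocumentManager.Metadata enum, skipping "content" and expanding "metadata" to ALL. A hypothetical illustration (not part of this commit; the inline option value is invented for the example):

import com.marklogic.client.document.DocumentManager;
import com.marklogic.spark.ContextSupport;
import com.marklogic.spark.Options;

import java.util.Map;
import java.util.Set;

public class MetadataSketch {

    public static void main(String[] args) {
        // Assumes no client connection options are needed just to construct the context.
        ContextSupport context = new ContextSupport(Map.of(
            Options.READ_DOCUMENTS_CATEGORIES, "content,collections,permissions"
        ));

        Set<DocumentManager.Metadata> metadata = ContextSupport.getRequestedMetadata(context);
        // "content" is skipped, so the set holds COLLECTIONS and PERMISSIONS.
        System.out.println(metadata);
    }
}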
