
Commit 411bd57

feat(spark-k8s): Add the Spark HBase connector (#878)
* feat(spark-k8s): Add the Spark HBase connector
* Copy modules from Hadoop and HBase builder images instead of Nexus
* Update changelog
* Fix linter warnings
* Use AWS_JAVA_SDK_BUNDLE instead of a wildcard
* Add mvn parameters

Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com>
1 parent 1c9741f commit 411bd57

File tree: 4 files changed, +241 −33 lines

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file.
 - hbase: Add hadoop-azure.jar to the lib directory to support the Azure Blob Filesystem and
   the Azure Data Lake Storage ([#853]).
 - kafka: Add cyrus-sasl-gssapi package for kerberos ([#874]).
+- spark: Add HBase connector ([#878]).

 ### Changed

@@ -68,6 +69,7 @@ All notable changes to this project will be documented in this file.
 [#868]: https://github.com/stackabletech/docker-images/pull/868
 [#874]: https://github.com/stackabletech/docker-images/pull/874
 [#877]: https://github.com/stackabletech/docker-images/pull/877
+[#878]: https://github.com/stackabletech/docker-images/pull/878
 [#879]: https://github.com/stackabletech/docker-images/pull/879

 ## [24.7.0] - 2024-07-24

spark-k8s/Dockerfile

Lines changed: 182 additions & 31 deletions
@@ -1,18 +1,16 @@
 # syntax=docker/dockerfile:1.8.1@sha256:e87caa74dcb7d46cd820352bfea12591f3dba3ddc4285e19c7dcd13359f7cefd

-FROM stackable/image/java-devel as builder
+# hadoop-builder: Provides Hadoop libraries
+FROM stackable/image/hadoop AS hadoop-builder
+
+# hbase-builder: Provides HBase libraries
+FROM stackable/image/hbase AS hbase-builder
+
+# spark-source-builder: Download the Spark source code into
+# /stackable/spark and apply the patches
+FROM stackable/image/java-devel as spark-source-builder

 ARG PRODUCT
-ARG HADOOP_LONG_VERSION
-ARG AWS_JAVA_SDK_BUNDLE
-ARG AZURE_STORAGE
-ARG AZURE_KEYVAULT_CORE
-ARG JACKSON_DATAFORMAT_XML
-ARG STAX2_API
-ARG WOODSTOX_CORE
-ARG JMX_EXPORTER
-ARG TARGETARCH
-ARG TINI

 RUN <<EOF
 microdnf update
@@ -27,8 +25,135 @@ EOF

 WORKDIR /stackable

-COPY --chown=stackable:stackable spark-k8s/stackable/patches/apply_patches.sh /stackable/spark-${PRODUCT}/patches/apply_patches.sh
-COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackable/spark-${PRODUCT}/patches/${PRODUCT}
+RUN <<EOF
+curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz \
+    | tar xz
+ln -s spark-${PRODUCT} spark
+EOF
+
+WORKDIR /stackable/spark
+
+COPY --chown=stackable:stackable \
+    spark-k8s/stackable/patches/apply_patches.sh \
+    patches/apply_patches.sh
+COPY --chown=stackable:stackable \
+    spark-k8s/stackable/patches/${PRODUCT} \
+    patches/${PRODUCT}
+
+RUN patches/apply_patches.sh ${PRODUCT}
+
+
+# hbase-connectors-builder: Build the Spark HBase connector and copy
+# required JARs into /stackable/spark/jars
+FROM stackable/image/java-devel as hbase-connectors-builder
+
+ARG PRODUCT
+ARG HADOOP
+ARG HBASE
+ARG HBASE_CONNECTOR
+
+WORKDIR /stackable
+
+# Download the hbase-connectors source code
+RUN <<EOF
+curl https://repo.stackable.tech/repository/packages/hbase-connectors/hbase-connectors_${HBASE_CONNECTOR}.tar.gz \
+    | tar xz
+ln -s hbase-connectors-rel-${HBASE_CONNECTOR} hbase-connectors
+EOF
+
+# Copy the pom.xml file from the patched Spark source code to read the
+# versions used by Spark. The pom.xml defines child modules which are
+# not required and not copied, therefore mvn must be called with the
+# parameter --non-recursive.
+COPY --chown=stackable:stackable --from=spark-source-builder \
+    /stackable/spark/pom.xml \
+    spark/
+
+WORKDIR /stackable/hbase-connectors/spark
+
+RUN <<EOF
+# Building the hbase-connectors with JDK 17 is not yet supported, see
+# https://github.com/apache/hbase-connectors/pull/132.
+# As there are no JDK profiles, access to the non-public elements must
+# be enabled with --add-opens, see https://openjdk.org/jeps/403 and
+# https://openjdk.org/jeps/261#Breaking-encapsulation.
+export JDK_JAVA_OPTIONS="\
+    --add-opens java.base/java.lang=ALL-UNNAMED \
+    --add-opens java.base/java.util=ALL-UNNAMED"
+
+# Get the Scala version used by Spark
+SCALA_VERSION=$( \
+    mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
+        org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
+        -DforceStdout \
+        -Dexpression='project.properties(scala.version)')
+
+# Get the Scala binary version used by Spark
+SCALA_BINARY_VERSION=$( \
+    mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
+        org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
+        -DforceStdout \
+        -Dexpression='project.properties(scala.binary.version)')
+
+# Build the Spark HBase connector
+# Skip the tests because the MiniHBaseCluster does not get ready for
+# whatever reason:
+# Caused by: java.lang.RuntimeException: Master not active after 30000ms
+#   at org.apache.hadoop.hbase.util.JVMClusterUtil.waitForEvent(JVMClusterUtil.java:221)
+#   at org.apache.hadoop.hbase.util.JVMClusterUtil.startup(JVMClusterUtil.java:177)
+#   at org.apache.hadoop.hbase.LocalHBaseCluster.startup(LocalHBaseCluster.java:407)
+#   at org.apache.hadoop.hbase.MiniHBaseCluster.init(MiniHBaseCluster.java:250)
+mvn \
+    --batch-mode \
+    --no-transfer-progress \
+    --define spark.version="${PRODUCT}" \
+    --define scala.version="${SCALA_VERSION}" \
+    --define scala.binary.version="${SCALA_BINARY_VERSION}" \
+    --define hadoop-three.version="${HADOOP}" \
+    --define hbase.version="${HBASE}" \
+    --define skipTests \
+    clean package
+EOF
+
+WORKDIR /stackable/spark/jars
+
+RUN <<EOF
+ln -s /stackable/hbase-connectors/spark/hbase-spark/target/hbase-spark-${HBASE_CONNECTOR}.jar
+
+# Download log4j-slf4j-impl-x.x.x.jar containing the StaticLoggerBinder
+# which is required by the connector.
+# Spark contains only log4j-slf4j2-impl-x.x.x.jar but not
+# log4j-slf4j-impl-x.x.x.jar. It is okay to have both JARs in the
+# classpath as long as they have the same version.
+mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
+    dependency:copy \
+    -Dartifact=org.apache.logging.log4j:log4j-slf4j-impl:'${log4j.version}' \
+    -DoutputDirectory=.
+EOF
+
+
+# spark-builder: Build Spark into /stackable/spark-${PRODUCT}/dist,
+# download additional JARs and perform checks, like log4shell check.
+FROM stackable/image/java-devel as spark-builder
+
+ARG PRODUCT
+ARG HADOOP
+ARG HBASE
+ARG AWS_JAVA_SDK_BUNDLE
+ARG AZURE_STORAGE
+ARG AZURE_KEYVAULT_CORE
+ARG JACKSON_DATAFORMAT_XML
+ARG STAX2_API
+ARG WOODSTOX_CORE
+ARG JMX_EXPORTER
+ARG TARGETARCH
+ARG TINI
+
+WORKDIR /stackable/spark-${PRODUCT}
+
+COPY --chown=stackable:stackable --from=spark-source-builder \
+    /stackable/spark/ \
+    ./

 # >>> Build spark
 # Compiling the tests takes a lot of time, so we skip them
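
The maven-help-plugin invocation above is useful on its own: it reads version properties straight out of Spark's pom.xml without building anything. A minimal sketch of the same trick outside the Dockerfile, assuming a Spark source checkout at ./spark (the path and the printed value are illustrative):

    # Print the Scala binary version this Spark source tree builds against.
    # --non-recursive keeps Maven from resolving the pom's child modules.
    mvn --quiet --non-recursive --file ./spark/pom.xml \
        org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
        -DforceStdout \
        -Dexpression='project.properties(scala.binary.version)'
    # Expected to print something like "2.12" for Spark 3.5.x.
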
@@ -37,12 +162,9 @@ COPY --chown=stackable:stackable spark-k8s/stackable/patches/${PRODUCT} /stackab
 #
 # This will download its own version of maven because the UBI version is too old:
 # 134.0 [ERROR] Detected Maven Version: 3.6.3 is not in the allowed range [3.8.8,)
-RUN curl https://repo.stackable.tech/repository/packages/spark/spark-${PRODUCT}.tgz | tar -xzf - \
-    && cd spark-${PRODUCT} \
-    && ./patches/apply_patches.sh ${PRODUCT} \
-    && export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
+RUN export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" \
     && ./dev/make-distribution.sh \
-    -Dhadoop.version="$HADOOP_LONG_VERSION" \
+    -Dhadoop.version="$HADOOP" \
     -Dmaven.test.skip=true \
     -DskipTests \
     -P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
@@ -55,12 +177,40 @@ RUN curl -o /usr/bin/tini "https://repo.stackable.tech/repository/packages/tini/
 # We download these under dist so that log4shell checks them
 WORKDIR /stackable/spark-${PRODUCT}/dist/jars

-# Download various modules for Hadoop (e.g. support for s3a:// and abfs://)
-RUN curl -O https://repo.stackable.tech/repository/packages/aws/hadoop-aws-${HADOOP_LONG_VERSION}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/aws/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/azure/hadoop-azure-${HADOOP_LONG_VERSION}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/azure/azure-storage-${AZURE_STORAGE}.jar \
-    && curl -O https://repo.stackable.tech/repository/packages/azure/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar
+# Copy modules required for s3a://
+COPY --from=hadoop-builder --chown=stackable:stackable \
+    /stackable/hadoop/share/hadoop/tools/lib/hadoop-aws-${HADOOP}.jar \
+    /stackable/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_JAVA_SDK_BUNDLE}.jar \
+    ./
+
+# Copy modules required for abfs://
+COPY --from=hadoop-builder --chown=stackable:stackable \
+    /stackable/hadoop/share/hadoop/tools/lib/hadoop-azure-${HADOOP}.jar \
+    /stackable/hadoop/share/hadoop/tools/lib/azure-storage-${AZURE_STORAGE}.jar \
+    /stackable/hadoop/share/hadoop/tools/lib/azure-keyvault-core-${AZURE_KEYVAULT_CORE}.jar \
+    ./
+
+# Copy the HBase connector including required modules
+COPY --from=hbase-connectors-builder --chown=stackable:stackable \
+    /stackable/spark/jars/* \
+    ./
+
+# Copy modules required to access HBase
+COPY --from=hbase-builder --chown=stackable:stackable \
+    /stackable/hbase/lib/shaded-clients/hbase-shaded-client-byo-hadoop-${HBASE}.jar \
+    /stackable/hbase/lib/shaded-clients/hbase-shaded-mapreduce-${HBASE}.jar \
+    ./
+# Copy modules required to access HBase if $HBASE == 2.4.x
+COPY --from=hbase-builder --chown=stackable:stackable \
+    /stackable/hbase/lib/client-facing-thirdparty/htrace-core4-*-incubating.jar \
+    /stackable/hbase/lib/client-facing-thirdparty/slf4j-reload4j-*.jar \
+    ./
+# Copy modules required to access HBase if $HBASE == 2.6.x
+COPY --from=hbase-builder --chown=stackable:stackable \
+    /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-api-*.jar \
+    /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-context-*.jar \
+    /stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
+    ./

 WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars
@@ -93,6 +243,7 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner
 RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist
 # ===

+
 FROM stackable/image/java-base as final

 ARG PRODUCT
@@ -115,12 +266,12 @@ RUN microdnf update && \
     hostname \
     # required for spark startup scripts
     procps \
-    python${PYTHON} \
-    python${PYTHON}-pip \
+    "python${PYTHON}" \
+    "python${PYTHON}-pip" \
     zip \
     # This is needed by the Spark UI to display process information using jps and jmap
     # Copying the binaries from the builder stage failed.
-    java-${JAVA_VERSION}-openjdk-devel \
+    "java-${JAVA_VERSION}-openjdk-devel" \
     && microdnf clean all \
     && rm -rf /var/cache/yum
@@ -134,10 +285,10 @@ ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/b
 ENV PYSPARK_PYTHON=/usr/bin/python
 ENV PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH

-COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/dist /stackable/spark
-COPY --chown=stackable:stackable --from=builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
-COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx
-COPY --from=builder /usr/bin/tini /usr/bin/tini
+COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/dist /stackable/spark
+COPY --chown=stackable:stackable --from=spark-builder /stackable/spark-${PRODUCT}/assembly/target/bom.json /stackable/spark/spark-${PRODUCT}.cdx.json
+COPY --chown=stackable:stackable --from=spark-builder /stackable/jmx /stackable/jmx
+COPY --from=spark-builder /usr/bin/tini /usr/bin/tini

 RUN ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar \
     # Symlink example jar, so that we can easily use it in tests
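
With the COPY steps above in place, the connector and its supporting JARs land on Spark's default classpath: they are staged under dist/jars and end up in /stackable/spark/jars in the final image. A quick way to verify this in a built image — a sketch only, the image tag is hypothetical:

    docker run --rm --entrypoint bash \
        docker.stackable.tech/stackable/spark-k8s:3.5.2-stackable0.0.0-dev \
        -c 'ls /stackable/spark/jars | grep -E "hbase-spark|hbase-shaded|log4j-slf4j-impl"'
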
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+VERSION=${1:?"Missing version number argument (arg 1)"}
+NEXUS_USER=${2:?"Missing Nexus username argument (arg 2)"}
+
+read -r -s -p "Nexus Password: " NEXUS_PASSWORD
+echo ""
+
+# https://stackoverflow.com/questions/4632028/how-to-create-a-temporary-directory
+# Find the directory name of the script
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+# the temp directory used, within $DIR
+WORK_DIR=$(mktemp -d -p "$DIR")
+
+# check if tmp dir was created
+if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
+  echo "Could not create temp dir"
+  exit 1
+fi
+
+# deletes the temp directory
+function cleanup {
+  rm -rf "$WORK_DIR"
+}
+
+# register the cleanup function to be called on the EXIT signal
+trap cleanup EXIT
+
+cd "$WORK_DIR" || exit
+
+download_url="https://github.com/apache/hbase-connectors/archive/refs/tags/rel/${VERSION}.tar.gz"
+
+tar_gz_file="hbase-connectors_${VERSION}.tar.gz"
+
+echo "Downloading hbase-connectors source from ${download_url}"
+curl --fail -L -o "${tar_gz_file}" "${download_url}"
+
+echo "Uploading hbase-connectors source to Nexus"
+EXIT_STATUS=0
+curl --fail -u "$NEXUS_USER:$NEXUS_PASSWORD" --upload-file "${tar_gz_file}" 'https://repo.stackable.tech/repository/packages/hbase-connectors/' || EXIT_STATUS=$?
+
+if [ $EXIT_STATUS -ne 0 ]; then
+  echo "ERROR: Upload failed"
+  exit 1
+fi
+
+echo "Successfully uploaded version $VERSION of hbase-connectors to Nexus"
+echo "https://repo.stackable.tech/service/rest/repository/browse/packages/hbase-connectors/"

spark-k8s/versions.py

Lines changed: 6 additions & 2 deletions
@@ -4,7 +4,8 @@
     "java-base": "17",
     "java-devel": "17",
    "python": "3.11",
-    "hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
+    "hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
+    "hbase": "2.4.18", # current Stackable LTS version
     "aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
     "azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
     "azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -14,13 +15,15 @@
     "vector": "0.41.1",
     "jmx_exporter": "1.0.1",
     "tini": "0.19.0",
+    "hbase_connector": "1.0.1",
 },
 {
     "product": "3.5.2",
     "java-base": "17",
     "java-devel": "17",
     "python": "3.11",
-    "hadoop_long_version": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
+    "hadoop": "3.3.4", # https://github.com/apache/spark/blob/6a5747d66e53ed0d934cdd9ca5c9bd9fde6868e6/pom.xml#L125
+    "hbase": "2.4.18", # current Stackable LTS version
     "aws_java_sdk_bundle": "1.12.262", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4
     "azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
     "azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
@@ -30,5 +33,6 @@
     "vector": "0.41.1",
     "jmx_exporter": "1.0.1",
     "tini": "0.19.0",
+    "hbase_connector": "1.0.1",
 },
]
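
The keys in versions.py line up with the ARG names in the Dockerfile, upper-cased (e.g. "hbase_connector" becomes HBASE_CONNECTOR). A sketch of an equivalent manual build invocation — illustrative only, since the stackable/image/* base images are normally resolved by the project's own image build tooling rather than a bare docker build:

    docker build --file spark-k8s/Dockerfile \
        --build-arg PRODUCT=3.5.2 \
        --build-arg HADOOP=3.3.4 \
        --build-arg HBASE=2.4.18 \
        --build-arg HBASE_CONNECTOR=1.0.1 \
        --tag spark-k8s:3.5.2 .
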
