
Commit ad9bb0f: WIP
1 parent: 71868da

36 files changed: +327086 -135 lines

hadoop/Dockerfile

Lines changed: 25 additions & 135 deletions
@@ -5,7 +5,7 @@
 # hadolint global ignore=DL3038,DL4006

 # hadolint ignore=DL3006
-FROM stackable/image/java-devel AS builder
+FROM stackable/image/java-devel AS hadoop-builder

 ARG PRODUCT
 ARG ASYNC_PROFILER
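
Note that renaming the builder stage means any later stage that copies artifacts out of it has to reference the new name. A minimal sketch, for illustration only, using the final stage and paths that appear further down in this diff:

# Sketch, not part of this commit: copying from the renamed builder stage
FROM stackable/image/java-base AS final
COPY --chown=stackable:stackable --from=hadoop-builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/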
@@ -25,25 +25,31 @@ COPY hadoop/stackable/fuse_dfs_wrapper /stackable/fuse_dfs_wrapper
 # At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar.
 # After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar)
 # And then we can also remove the symlink to 0.16.1 from this Dockerfile.
-RUN curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-    chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-    ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \
-    ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
+RUN <<EOF
+curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
+ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
+#TODO: Can the symlink go?

-RUN ARCH="${TARGETARCH/amd64/x64}" && \
-    curl --fail -L "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC . && \
-    ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
+ARCH="${TARGETARCH/amd64/x64}"
+curl --fail -L "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC .
+ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler

 # This Protobuf version is the exact version as used in the Hadoop Dockerfile
 # See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
 # (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version in Github)
-WORKDIR /opt/protobuf-src
-RUN curl --fail -L -s -S https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz && \
-    tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner && \
-    ./configure --prefix=/opt/protobuf && \
-    make "-j$(nproc)" && \
-    make install && \
-    rm -rf /opt/protobuf-src
+# At the time of writing we could save around ~350MB if we included this in the later RUN statement and deleted it afterwards
+mkdir /opt/protobuf-src
+cd /opt/protobuf-src
+curl --fail -L -s -S https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz
+tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner
+./configure --prefix=/opt/protobuf
+make "-j$(nproc)"
+make install
+rm -rf /opt/protobuf-src
+rm -f /opt/protobuf.tar.gz
+EOF

 ENV PROTOBUF_HOME=/opt/protobuf
 ENV PATH="${PATH}:/opt/protobuf/bin"
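
The RUN heredocs introduced above require BuildKit (Dockerfile syntax 1.4 or newer). One caveat, which depends on the base image's default shell and is not addressed by this diff: unlike the old `cmd && \` chaining, a heredoc body runs as a plain shell script, so a failing command in the middle may not abort the build unless errexit is enabled. A minimal sketch of a fail-fast form (the URL is a placeholder, not from this repo):

# Sketch only, assuming the default shell is POSIX sh without -e
RUN <<EOF
set -e
curl --fail "https://example.invalid/artifact.jar" -o /tmp/artifact.jar
chmod -x /tmp/artifact.jar
EOF
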
@@ -56,6 +62,7 @@ RUN microdnf update && \
     microdnf clean all && \
     rm -rf /var/cache/yum

+USER stackable
 WORKDIR /stackable

 COPY hadoop/stackable/patches /stackable/patches
@@ -65,123 +72,6 @@ COPY hadoop/stackable/patches /stackable/patches
 # Also skip building the yarn, mapreduce and minicluster modules: this will result in the modules being excluded but not all
 # jar files will be stripped if they are needed elsewhere e.g. share/hadoop/yarn will not be part of the build, but yarn jars
 # will still exist in share/hadoop/tools as they would be needed by the resource estimator tool. Such jars are removed in a later step.
-RUN curl --fail -L "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC . && \
-    patches/apply_patches.sh ${PRODUCT} && \
-    cd hadoop-${PRODUCT}-src && \
-    mvn clean package -Pdist,native -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' -Drequire.fuse=true -DskipTests -Dmaven.javadoc.skip=true && \
-    cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT} && \
-    # HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
-    cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin && \
-    rm -rf /stackable/hadoop-${PRODUCT}-src
-
-# For earlier versions this script removes the .class file that contains the
-# vulnerable code.
-# TODO: This can be restricted to target only versions which do not honor the environment
-# varible that has been set above but this has not currently been implemented
-COPY shared/log4shell.sh /bin
-RUN /bin/log4shell.sh "/stackable/hadoop-${PRODUCT}"
-
-# Ensure no vulnerable files are left over
-# This will currently report vulnerable files being present, as it also alerts on
-# SocketNode.class, which we do not remove with our scripts.
-# Further investigation will be needed whether this should also be removed.
-COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
-COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
-COPY shared/log4shell_scanner /bin/log4shell_scanner
-RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}"
-# ===
-
-FROM stackable/image/java-devel AS hdfs-utils-builder
-
-ARG HDFS_UTILS
-ARG PRODUCT
-
-WORKDIR /stackable
-
-# The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
-# The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
-# labels to build a rackID from.
-# Starting with hdfs-utils version 0.3.0 the topology provider is not a standalone jar anymore and included in hdfs-utils.
-
-RUN curl --fail -L "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC . && \
-    cd hdfs-utils-${HDFS_UTILS} && \
-    mvn clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \
-    mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \
-    cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \
-    rm -rf /stackable/hdfs-utils-main
-
-FROM stackable/image/java-base AS final
-
-ARG PRODUCT
-ARG RELEASE
-ARG HDFS_UTILS
-
-LABEL name="Apache Hadoop" \
-    maintainer="info@stackable.tech" \
-    vendor="Stackable GmbH" \
-    version="${PRODUCT}" \
-    release="${RELEASE}" \
-    summary="The Stackable image for Apache Hadoop." \
-    description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."
-
-# fuse is required for fusermount (called by fuse_dfs)
-# fuse-libs is required for fuse_dfs (not included in fuse)
-# openssl -> not sure
-RUN microdnf update && \
-    microdnf install \
-    fuse \
-    fuse-libs \
-    # tar is required for `kubectl cp` which can be used to copy the log files
-    # or profiler flamegraph from the Pod
-    tar && \
-    microdnf clean all && \
-    rm -rf /var/cache/yum
-
-COPY hadoop/licenses /licenses
-
-# Without this fuse_dfs does not work
-# It is so non-root users (as we are) can mount a FUSE device and let other users access it
-RUN echo "user_allow_other" > /etc/fuse.conf
-
-USER stackable
-WORKDIR /stackable
-
-COPY --chown=stackable:stackable --from=builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/
-COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx/
-COPY --chown=stackable:stackable --from=builder /stackable/async-profiler /stackable/async-profiler/
-COPY --chown=stackable:stackable --from=hdfs-utils-builder /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
-RUN ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
-
-COPY hadoop/stackable/fuse_dfs_wrapper /stackable/hadoop/bin
-
-ENV HOME=/stackable
-ENV LD_LIBRARY_PATH=/stackable/hadoop/lib/native:/usr/lib/jvm/jre/lib/server
-ENV PATH="${PATH}":/stackable/hadoop/bin
-ENV HADOOP_HOME=/stackable/hadoop
-ENV HADOOP_CONF_DIR=/stackable/config
-ENV ASYNC_PROFILER_HOME=/stackable/async-profiler
-# The following 2 env-vars are required for common scripts even if the respective libraries are never used.
-# HADOOP_HOME is often used internally if HADOOP_YARN_HOME/HADOOP_MAPRED_HOME are not set, although
-# a subdirectory is also required in (at least)
-# hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh
-# if HADOOP_YARN_HOME does not exist at all, so we set it here to a sensible default.
-ENV HADOOP_YARN_HOME=/stackable/hadoop
-ENV HADOOP_MAPRED_HOME=/stackable/hadoop
-
-# Remove unneeded binaries:
-# - code sources
-# - mapreduce/yarn binaries that were built as cross-project dependencies
-# - minicluster (only used for testing) and test .jars
-# - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
-RUN rm -rf /stackable/hadoop/share/hadoop/common/sources/ && \
-    rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/ && \
-    rm -rf /stackable/hadoop/share/hadoop/tools/sources/ && \
-    rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar && \
-    rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar && \
-    rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar && \
-    find . -name 'hadoop-minicluster-*.jar' -type f -delete && \
-    find . -name 'hadoop-client-minicluster-*.jar' -type f -delete && \
-    find . -name 'hadoop-*tests.jar' -type f -delete
-
-WORKDIR /stackable/hadoop
-CMD ["echo", "This image is not meant to be 'run' directly."]
+RUN <<EOF
+curl --fail -L "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC .
+EOF
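
So far the new heredoc only downloads the Hadoop sources, which matches the WIP commit message. A sketch of how the rest of the removed build step could be folded into it, using only the commands deleted above:

# Sketch only, not part of this commit; flags are copied verbatim from the removed RUN above
RUN <<EOF
curl --fail -L "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC .
patches/apply_patches.sh ${PRODUCT}
cd hadoop-${PRODUCT}-src
mvn clean package -Pdist,native -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' -Drequire.fuse=true -DskipTests -Dmaven.javadoc.skip=true
cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}
# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin
rm -rf /stackable/hadoop-${PRODUCT}-src
EOF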
