Commit 896d81c

Simplify spark flavor
1 parent 5477cfb commit 896d81c

5 files changed: +89 -938 lines changed

spark-flavor/Dockerfile

Lines changed: 48 additions & 40 deletions
@@ -1,4 +1,4 @@
-ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace:latest"
+ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace-r:latest"
 # Build from full flavor of workspace with same version
 FROM $ARG_WORKSPACE_BASE_IMAGE
 
@@ -24,14 +24,52 @@ RUN \
     # Cleanup
     clean-layer.sh
 
+# Install Hadoop
+RUN \
+    /bin/bash $RESOURCES_PATH/tools/hadoop-local-cluster.sh --install && \
+    # Cleanup
+    clean-layer.sh
+
+# Needs to be set separately, otherwise it does not exist yet
+ENV HADOOP_HOME="/opt/hadoop"
+
+ENV \
+    HADOOP_INSTALL=$HADOOP_HOME \
+    HADOOP_MAPRED_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_HOME=$HADOOP_HOME \
+    HADOOP_HDFS_HOME=$HADOOP_HOME \
+    HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
+    # HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/* \
+    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
+    HADOOP_OPTS="-Djava.library.path=$HADOOP_COMMON_LIB_NATIVE_DIR" \
+    HDFS_NAMENODE_USER=$NB_USER \
+    HDFS_DATANODE_USER=$NB_USER \
+    HDFS_SECONDARYNAMENODE_USER=$NB_USER \
+    YARN_HOME=$HADOOP_HOME \
+    YARN_RESOURCEMANAGER_USER=$NB_USER \
+    YARN_NODEMANAGER_USER=$NB_USER \
+    PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
+
 # Install Spark
 RUN \
     /bin/bash $RESOURCES_PATH/tools/spark-local-cluster.sh --install && \
     # Cleanup
     clean-layer.sh
 
 # Configure Spark
-ENV SPARK_HOME=/opt/spark \
+ENV SPARK_HOME="/opt/spark"
+
+ENV \
+    # PYSPARK_DRIVER_PYTHON="jupyter"
+    # PYSPARK_DRIVER_PYTHON_OPTS='notebook'
+    # https://zeppelin.apache.org/docs/latest/interpreter/spark.html
+    # export SPARK_DIST_CLASSPATH=`hadoop classpath`
+    PYSPARK_PYTHON=$CONDA_ROOT/bin/python \
+    PYSPARK_DRIVER_PYTHON=$CONDA_ROOT/bin/python \
+    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+    PYTHONHASHSEED=0 \
+    PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
     PATH=$PATH:$SPARK_HOME/bin
 
 # Install Zeppelin
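
Taken together, the added ENV blocks put the Hadoop and Spark binaries on PATH and point PySpark at the workspace's conda interpreter. A minimal smoke test, as a sketch (assumes the built image is running and that the py4j zip on PYTHONPATH matches the installed Spark version):

    import os
    import subprocess

    # hadoop resolves via $HADOOP_HOME/bin, which the ENV block appends to PATH
    subprocess.run(["hadoop", "version"], check=True)
    print(os.environ["HADOOP_CONF_DIR"])  # /opt/hadoop/etc/hadoop

    # PYTHONPATH already includes $SPARK_HOME/python, so pyspark imports without pip
    from pyspark.sql import SparkSession

    spark = (SparkSession.builder
             .master("local[2]")  # plain local mode; no standalone cluster required
             .appName("spark-flavor-smoke-test")
             .getOrCreate())
    print(spark.range(100).count())  # expected: 100
    spark.stop()
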
@@ -40,45 +78,15 @@ RUN \
     # Cleanup
     clean-layer.sh
 
-### CONFIGURATION ###
+RUN \
+    # Install almond jupyter scala kernel: https://almond.sh/
+    # TODO: The installation in scala-utils does not seem to work currently
+    curl -Lo coursier https://git.io/coursier-cli && \
+    chmod +x coursier && \
+    ./coursier launch --fork almond -- --install --force && \
+    rm -f coursier
 
-ENV \
-    PYSPARK_PYTHON="python" \
-    PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
-    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
-    # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
-    PYTHONHASHSEED=0
-
-# Todo: Add additional spark configuration:
-# https://spark.apache.org/docs/latest/configuration.html
-# https://zeppelin.apache.org/docs/latest/interpreter/spark.html
-
-# PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS / HADOOP_HOME / HADOOP_CLASSPATH / SPARK_DIST_CLASSPATH
-# export HADOOP_HOME=~/hadoop-2.7.0 export PATH=$HADOOP_HOME/bin:$PATH export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-# export HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/*
-# export SPARK_DIST_CLASSPATH=`hadoop classpath`
-# export PYSPARK_DRIVER_PYTHON="jupyter"
-# export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
-# HADOOP_CONF_DIR=/usr/lib/hadoop
-
-# TODO start spark master?
-# https://medium.com/@marcovillarreal_40011/creating-a-spark-standalone-cluster-with-docker-and-docker-compose-ba9d743a157f
-# ENV SPARK_MASTER_PORT 7077
-# ENV SPARK_MASTER_WEBUI_PORT 8080
-# ENV SPARK_WORKER_WEBUI_PORT 8081
-# ENV SPARK_MASTER_LOG /spark/logs
-# ENV SPARK_WORKER_LOG /spark/logs
-# CMD ["/bin/bash", "/start-master.sh"]
-# export SPARK_MASTER_HOST=`hostname`
-# SPARK_WORKER_CORES=1
-# SPARK_WORKER_MEMORY=1G
-# SPARK_DRIVER_MEMORY=128m
-# SPARK_EXECUTOR_MEMORY=256m
-
-# TODO configure spark ui to be proxied with base path:
-# https://stackoverflow.com/questions/45971127/wrong-css-location-of-spark-application-ui
-# https://github.com/jupyterhub/jupyter-server-proxy/issues/57
-# https://github.com/yuvipanda/jupyter-sparkui-proxy/blob/master/jupyter_sparkui_proxy/__init__.py
+### CONFIGURATION ###
 
 # Add supervisor config to start zeppelin on port 8072
 COPY resources/zeppelin-service.conf /etc/supervisor/conf.d/
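
The coursier bootstrap above installs the almond Scala kernel directly, sidestepping the scala-utils install noted in the TODO. A quick check that the kernel registered, as a sketch (assumes jupyter_client is available and that almond kept its default kernel name, scala):

    from jupyter_client.kernelspec import KernelSpecManager

    specs = KernelSpecManager().get_all_specs()
    print(sorted(specs))  # expected to include 'scala' alongside 'python3'
    assert "scala" in specs, "almond kernel not registered"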

spark-flavor/build.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@
 except Exception:
     pass
 
-base_image = "ml-workspace:" + VERSION
+base_image = "ml-workspace-r:" + VERSION
 if args.get(build_utils.FLAG_RELEASE):
     base_image = docker_image_prefix + base_image
 
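
A toy illustration of what the changed line yields (hypothetical values; VERSION and docker_image_prefix are defined elsewhere in the build script):

    # Hypothetical example values, for illustration only
    VERSION = "0.12.1"
    docker_image_prefix = "mltooling/"

    base_image = "ml-workspace-r:" + VERSION  # new default: build on the R flavor
    release = True  # stands in for args.get(build_utils.FLAG_RELEASE)
    if release:
        base_image = docker_image_prefix + base_image

    print(base_image)  # mltooling/ml-workspace-r:0.12.1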

spark-flavor/resources/tutorials/spark-monitor-tutorial.ipynb

Lines changed: 0 additions & 215 deletions
This file was deleted.
