- ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace:latest"
+ ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace-r:latest"
# Build from full flavor of workspace with same version
FROM $ARG_WORKSPACE_BASE_IMAGE

@@ -24,14 +24,52 @@ RUN \
    # Cleanup
    clean-layer.sh

+ # Install Hadoop
+ RUN \
+     /bin/bash $RESOURCES_PATH/tools/hadoop-local-cluster.sh --install && \
+     # Cleanup
+     clean-layer.sh
+
+ # Needs to be separated, otherwise it does not exist yet
+ ENV HADOOP_HOME="/opt/hadoop"
+
+ ENV \
+     HADOOP_INSTALL=$HADOOP_HOME \
+     HADOOP_MAPRED_HOME=$HADOOP_HOME \
+     HADOOP_COMMON_HOME=$HADOOP_HOME \
+     HADOOP_HDFS_HOME=$HADOOP_HOME \
+     HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
+     # HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/* \
+     HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
+     # variables set earlier in this same ENV instruction are not expanded, so spell out the native lib path
+     HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native" \
+     HDFS_NAMENODE_USER=$NB_USER \
+     HDFS_DATANODE_USER=$NB_USER \
+     HDFS_SECONDARYNAMENODE_USER=$NB_USER \
+     YARN_HOME=$HADOOP_HOME \
+     YARN_RESOURCEMANAGER_USER=$NB_USER \
+     YARN_NODEMANAGER_USER=$NB_USER \
+     PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
+
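Once the image is built, these settings can be smoke-tested by bringing up the single-node daemons that ship with Hadoop. A minimal sketch, assuming hadoop-local-cluster.sh leaves a pseudo-distributed configuration in $HADOOP_CONF_DIR and that the start scripts can reach localhost over SSH:

    # format the local namenode once, then start HDFS and YARN as $NB_USER
    hdfs namenode -format -nonInteractive
    $HADOOP_HOME/sbin/start-dfs.sh
    $HADOOP_HOME/sbin/start-yarn.sh
    # verify the daemons are up and the filesystem is usable
    jps
    hdfs dfs -mkdir -p /user/$NB_USER
    hdfs dfs -ls /
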
# Install Spark
RUN \
    /bin/bash $RESOURCES_PATH/tools/spark-local-cluster.sh --install && \
    # Cleanup
    clean-layer.sh

# Configure Spark
- ENV SPARK_HOME=/opt/spark \
+ ENV SPARK_HOME="/opt/spark"
+
+ ENV \
+     # PYSPARK_DRIVER_PYTHON="jupyter"
+     # PYSPARK_DRIVER_PYTHON_OPTS='notebook'
+     # https://zeppelin.apache.org/docs/latest/interpreter/spark.html
+     # export SPARK_DIST_CLASSPATH=`hadoop classpath`
+     PYSPARK_PYTHON=$CONDA_ROOT/bin/python \
+     PYSPARK_DRIVER_PYTHON=$CONDA_ROOT/bin/python \
+     SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+     # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+     PYTHONHASHSEED=0 \
+     PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
    PATH=$PATH:$SPARK_HOME/bin

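With SPARK_HOME and the PySpark interpreter variables in place, the local install can be checked by running one of the bundled examples. A minimal sketch, assuming the Spark distribution's examples directory is present under $SPARK_HOME:

    # run the bundled Pi example on a local master; workers use $PYSPARK_PYTHON
    spark-submit --master local[2] $SPARK_HOME/examples/src/main/python/pi.py 10
    # print the resolved Spark version as a basic health check
    spark-submit --version
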
# Install Zeppelin
@@ -40,45 +78,15 @@ RUN \
    # Cleanup
    clean-layer.sh

- # ## CONFIGURATION ###
+ RUN \
+     # Install almond jupyter scala kernel: https://almond.sh/
+     # TODO: The installation in scala-utils does not seem to work currently
+     curl -Lo coursier https://git.io/coursier-cli && \
+     chmod +x coursier && \
+     ./coursier launch --fork almond -- --install --force && \
+     rm -f coursier

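If the coursier launch succeeds, almond registers itself as a Jupyter kernel (by default under the id scala; --force replaces an existing kernel of that id). A quick check, sketched under that assumption:

    # list installed Jupyter kernels; a scala entry should appear after the almond install
    jupyter kernelspec list
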
- ENV \
-     PYSPARK_PYTHON="python" \
-     PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
-     SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
-     # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
-     PYTHONHASHSEED=0
-
- # Todo: Add additional spark configuration:
- # https://spark.apache.org/docs/latest/configuration.html
- # https://zeppelin.apache.org/docs/latest/interpreter/spark.html
-
- # PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS / HADOOP_HOME / HADOOP_CLASSPATH / SPARK_DIST_CLASSPATH
- # export HADOOP_HOME=~/hadoop-2.7.0 export PATH=$HADOOP_HOME/bin:$PATH export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
- # export HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/*
- # export SPARK_DIST_CLASSPATH=`hadoop classpath`
- # export PYSPARK_DRIVER_PYTHON="jupyter"
- # export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
- # HADOOP_CONF_DIR=/usr/lib/hadoop
-
- # TODO start spark master?
- # https://medium.com/@marcovillarreal_40011/creating-a-spark-standalone-cluster-with-docker-and-docker-compose-ba9d743a157f
- # ENV SPARK_MASTER_PORT 7077
- # ENV SPARK_MASTER_WEBUI_PORT 8080
- # ENV SPARK_WORKER_WEBUI_PORT 8081
- # ENV SPARK_MASTER_LOG /spark/logs
- # ENV SPARK_WORKER_LOG /spark/logs
- # CMD ["/bin/bash", "/start-master.sh"]
- # export SPARK_MASTER_HOST=`hostname`
- # SPARK_WORKER_CORES=1
- # SPARK_WORKER_MEMORY=1G
- # SPARK_DRIVER_MEMORY=128m
- # SPARK_EXECUTOR_MEMORY=256m
-
- # TODO configure spark ui to be proxied with base path:
- # https://stackoverflow.com/questions/45971127/wrong-css-location-of-spark-application-ui
- # https://github.com/jupyterhub/jupyter-server-proxy/issues/57
- # https://github.com/yuvipanda/jupyter-sparkui-proxy/blob/master/jupyter_sparkui_proxy/__init__.py
+ # ## CONFIGURATION ###

# Add supervisor config to start zeppelin on port 8072
COPY resources/zeppelin-service.conf /etc/supervisor/conf.d/
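The contents of resources/zeppelin-service.conf are not shown in this diff, so the service name it registers is an assumption; once the container is running, the supervised service and the port can still be probed generically from a shell inside the workspace:

    # show all supervisor-managed programs (the Zeppelin entry's name depends on the conf file)
    supervisorctl status
    # probe the Zeppelin UI port configured above
    curl -sI http://localhost:8072 | head -n 1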