Commit d3bcb49

Airflow (#5)

* [feat] add airflow
* [fix] move dependencies inside services
* minor changes + fixes
* minor update
* init: add S3 bucket (MinIO)
* init: add HashiCorp Vault
* feat: add Apache Spark + Airflow

1 parent 0de3523 commit d3bcb49

15 files changed: +307 -0 lines changed

apache-airflow/.env.template

Lines changed: 10 additions & 0 deletions
HOSTNAME=airflow
# Postgres
POSTGRES_DB=airflow
POSTGRES_PORT=5432
POSTGRES_USER=
POSTGRES_PASSWORD=
# Airflow
AIRFLOW_UID=0
AIRFLOW_WEBSERVER_PORT=8085
AIRFLOW_SCHEDULER_PORT=8793

apache-airflow/Dockerfile

Lines changed: 31 additions & 0 deletions
FROM apache/airflow:2.6.0rc5-python3.10

USER root

# Install OpenJDK-11
RUN apt-get update
RUN apt-get install -y openjdk-11-jdk
RUN apt-get install -y ant
RUN apt-get install -y curl
RUN apt-get clean

# Set JAVA_HOME (ENV persists across layers; a RUN-level export would not)
ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/
RUN mkdir -p /opt/airflow/spark/jars

# Trino JDBC driver
# RUN curl https://repo1.maven.org/maven2/io/trino/trino-jdbc/396/trino-jdbc-396.jar \
#     --output /opt/airflow/spark/jars/trino-jdbc-396.jar

# PostgreSQL JDBC driver
RUN curl https://jdbc.postgresql.org/download/postgresql-42.5.0.jar \
    --output /opt/airflow/spark/jars/postgresql-42.5.0.jar

# Must use 'airflow' user to install pip packages
USER airflow

# Install requirements
COPY requirements.txt /
RUN --mount=type=cache,target=/root/.cache \
    pip install -r /requirements.txt

apache-airflow/build.sh

Lines changed: 3 additions & 0 deletions
#!/bin/bash
export DOCKER_BUILDKIT=1
docker-compose -f docker-compose.yml --env-file .env up -d --build

apache-airflow/docker-compose.yml

Lines changed: 94 additions & 0 deletions
version: '3.9'

x-common:
  &common
  build:
    context: .
    dockerfile: Dockerfile
  user: "${AIRFLOW_UID:-0}:0"
  logging:
    driver: "json-file"
    options:
      max-size: "100m"
      max-file: "3"
  # deploy:
  #   resources:
  #     limits:
  #       cpus: '0.5'
  #       memory: 1g
  environment:
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-airflow}"
  env_file:
    - .env
  volumes:
    - ./volume/airflow/dags:/opt/airflow/dags
    - ./volume/airflow/logs:/opt/airflow/logs
    - ./volume/airflow/plugins:/opt/airflow/plugins
    - /var/run/docker.sock:/var/run/docker.sock
  networks:
    - private
    - nginx
    - public

x-depends-on:
  &depends-on
  depends_on:
    airflow-init:
      condition: service_completed_successfully

services:
  # Airflow Scheduler
  airflow-scheduler:
    # merge both anchors in one key; duplicate '<<' keys are invalid YAML
    <<: [*common, *depends-on]
    container_name: airflow-scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    command: scheduler
    restart: on-failure
    ports:
      - "${AIRFLOW_SCHEDULER_PORT}:8793"

  # Airflow Webserver
  airflow-webserver:
    <<: [*common, *depends-on]
    container_name: airflow-webserver
    restart: always
    command: webserver
    ports:
      - "${AIRFLOW_WEBSERVER_PORT:-8080}:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 3

  # Airflow Initializer
  airflow-init:
    <<: *common
    container_name: airflow-init
    entrypoint: /bin/bash
    command:
      - -c
      - |
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version

networks:
  private:
    name: private
    external: true
  nginx:
    name: nginx
    internal: true
  public:
    name: public
    external: true
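
Note that the private and public networks are declared external, so they must already exist or the stack will fail to start. A minimal helper sketch using the Docker SDK for Python, equivalent to running docker network create by hand; the script name and use of the SDK are illustrative assumptions, not part of this commit:

# create_networks.py - illustrative helper, not part of this commit.
# The compose file marks 'private' and 'public' as external, so they
# must exist before build.sh brings the stack up.
import docker

client = docker.from_env()
for name in ("private", "public"):
    if not client.networks.list(names=[name]):  # skip if it already exists
        client.networks.create(name, driver="bridge")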

apache-airflow/requirements.txt

Lines changed: 3 additions & 0 deletions
# Airflow Provider Packages
apache-airflow-providers-apache-spark==4.0.0
apache-airflow-providers-cncf-kubernetes==5.0.0
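
The Spark provider pinned above supplies SparkSubmitOperator, which is presumably how DAGs in this stack hand work to the Spark cluster. A minimal sketch of such a DAG, assuming a spark_default connection pointing at spark://spark-master:7077 and a hypothetical job script; none of these names come from this commit:

# spark_submit_example.py - hypothetical DAG, not part of this commit.
from datetime import datetime

from airflow import DAG
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

with DAG(
    dag_id="spark_submit_example",
    start_date=datetime(2023, 1, 1),
    schedule=None,  # trigger manually
    catchup=False,
) as dag:
    submit_job = SparkSubmitOperator(
        task_id="submit_job",
        application="/opt/airflow/dags/jobs/example_job.py",  # hypothetical script
        conn_id="spark_default",  # assumed to point at spark://spark-master:7077
        # PostgreSQL JDBC driver baked into the image by the Dockerfile above
        jars="/opt/airflow/spark/jars/postgresql-42.5.0.jar",
    )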

apache-spark/Dockerfile

Lines changed: 14 additions & 0 deletions
FROM bitnami/spark:3.3

# Install curl
USER root
RUN install_packages curl
USER 1001

# Trino JDBC driver
RUN curl https://repo1.maven.org/maven2/io/trino/trino-jdbc/396/trino-jdbc-396.jar \
    --output /opt/bitnami/spark/jars/trino-jdbc-396.jar

# PostgreSQL JDBC driver
RUN curl https://jdbc.postgresql.org/download/postgresql-42.5.0.jar \
    --output /opt/bitnami/spark/jars/postgresql-42.5.0.jar
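
The bundled PostgreSQL driver exists so Spark jobs can read from and write to Postgres over JDBC. A hedged sketch of such a read; the connection URL, table name, and credentials are placeholder assumptions, not values from this commit:

# jdbc_read_example.py - illustrative only, not part of this commit.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("jdbc-read").getOrCreate()
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/airflow")  # assumed host/db
    .option("dbtable", "some_table")                           # placeholder table
    .option("user", "postgres")                                # placeholder creds
    .option("password", "postgres")
    .option("driver", "org.postgresql.Driver")
    .load()
)
df.show()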

apache-spark/build.sh

Lines changed: 3 additions & 0 deletions
#!/bin/bash
export DOCKER_BUILDKIT=1
docker-compose -f docker-compose.yml --env-file ../.env up -d --build

apache-spark/docker-compose.yml

Lines changed: 45 additions & 0 deletions
version: '3.9'

services:
  spark-master:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=yes
      - SPARK_SSL_ENABLED=no
    ports:
      - 8080:8080 # Master UI
      - 7077:7077 # Master
    networks:
      - services
      - spark

  spark-worker:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: spark-worker
    environment:
      - SPARK_MODE=worker
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=yes
      - SPARK_SSL_ENABLED=no
    networks:
      - spark
      - services

networks:
  services:
    name: services
    external: true
  spark:
    name: spark
    internal: true
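
Once the external services network exists and the pair is up, a quick way to confirm the master accepts applications is a PySpark smoke test from the Docker host, where port 7077 is published. This is an illustrative check, not part of the commit:

# spark_smoke_test.py - illustrative only, not part of this commit.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://localhost:7077")  # master port published by compose
    .appName("smoke-test")
    .getOrCreate()
)
print(spark.range(1000).count())  # expect 1000
spark.stop()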

minIO/.env.template

Lines changed: 2 additions & 0 deletions
MINIO_ROOT_USER=
MINIO_ROOT_PASSWORD=

minIO/build.sh

Lines changed: 2 additions & 0 deletions
#!/bin/bash
docker-compose --env-file .env up -d --build

minIO/docker-compose.yml

Lines changed: 37 additions & 0 deletions
version: '3.9'

services:
  minio:
    container_name: minio
    image: minio/minio
    restart: always
    ports:
      - "${MINIO_API_PORT:-9000}:9000" # API port
      - "${MINIO_CONSOLE_PORT:-9001}:9001" # Console port
    environment:
      MINIO_ROOT_USER: ${MINIO_ROOT_USER}
      MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 10s
      retries: 3
    command: server --console-address ":9001" /data
    volumes:
      - ./volume/minio:/data
    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "3"
    networks:
      - private
      - nginx

networks:
  private:
    name: private
    internal: true
  nginx:
    name: nginx
    internal: true
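
With the stack up, the published API port can be exercised with the minio Python client. The bucket name and the placeholder credentials below are assumptions, not values from this commit:

# minio_smoke_test.py - illustrative only, not part of this commit.
from minio import Minio

client = Minio(
    "localhost:9000",
    access_key="<MINIO_ROOT_USER>",      # value from .env
    secret_key="<MINIO_ROOT_PASSWORD>",  # value from .env
    secure=False,  # the raw API port serves plain HTTP in this setup
)
if not client.bucket_exists("scratch"):  # 'scratch' is a placeholder name
    client.make_bucket("scratch")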

vault/.env.template

Lines changed: 3 additions & 0 deletions
DOMAIN=
EMAIL=
VAULT_PORT=8200

vault/build.sh

Lines changed: 2 additions & 0 deletions
#!/bin/bash
docker-compose --env-file .env up -d --build

vault/docker-compose.yml

Lines changed: 42 additions & 0 deletions
version: '3.7'

x-common:
  &common
  logging:
    driver: "json-file"
    options:
      max-size: "100m"
      max-file: "3"

services:

  # Vault
  vault:
    <<: *common
    image: hashicorp/vault:1.13.0-rc1
    container_name: vault
    restart: always
    entrypoint: vault server -config vault/config/config.hcl
    ports:
      - "${VAULT_PORT}:8200"
    cap_add:
      - IPC_LOCK
    volumes:
      - ./volume/logs:/vault/logs
      - ./volume/file:/vault/file
      - ./volume/config:/vault/config
      - ./volume/policies:/vault/policies
      - ./vault-config.hcl:/vault/config/config.hcl
      - /etc/letsencrypt/live/${DOMAIN}/fullchain.pem:/certs/fullchain.pem
      - /etc/letsencrypt/live/${DOMAIN}/privkey.pem:/certs/privkey.pem
    networks:
      - nginx
      - private

networks:
  nginx:
    name: nginx
    internal: true
  private:
    name: private
    internal: true

vault/vault-config.hcl

Lines changed: 16 additions & 0 deletions
storage "file" {
  path = "/vault/file"
}

listener "tcp" {
  address = "0.0.0.0:8200"
  # While tls_disable is true, Vault ignores the cert/key settings below;
  # they take effect only once TLS is switched back on.
  tls_disable = "true"
  tls_cert_file = "/certs/fullchain.pem"
  tls_key_file = "/certs/privkey.pem"
}

ui = true
api_addr = "http://0.0.0.0:8200"
default_lease_ttl = "48h"
max_lease_ttl = "168h"
cluster_name = "Primary"
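
A Vault server with file storage starts sealed and must be initialized and unsealed once before use. A hedged sketch using the hvac client; the share counts are arbitrary and the script is not part of this commit:

# vault_bootstrap.py - illustrative only, not part of this commit.
import hvac

client = hvac.Client(url="http://localhost:8200")  # matches api_addr above
if not client.sys.is_initialized():
    result = client.sys.initialize(secret_shares=5, secret_threshold=3)
    unseal_keys, root_token = result["keys"], result["root_token"]
    # In real use, store these securely instead of keeping them in memory.
    for key in unseal_keys[:3]:  # threshold of 3 keys unseals the server
        client.sys.submit_unseal_key(key)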
