From a71bc7948611928328bb4ee4d017b4ac3c0cd366 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Wed, 22 Jul 2020 20:37:35 +0530 Subject: [PATCH 1/8] rewrite to use official airflow image --- .gitignore | 3 + Dockerfile | 89 ++-------------- Dockerfile.old | 86 ++++++++++++++++ Makefile | 57 +++++++++++ config/airflow.cfg | 107 ++++++++++++++++---- docker-compose-LocalExecutor.yml | 67 ++++++++---- docker-compose.airflow.yml | 22 ++++ script/{entrypoint.sh => entrypoint.sh.old} | 0 script/entrypoint_wrapper.sh | 25 +++++ 9 files changed, 333 insertions(+), 123 deletions(-) create mode 100644 Dockerfile.old create mode 100644 Makefile create mode 100644 docker-compose.airflow.yml rename script/{entrypoint.sh => entrypoint.sh.old} (100%) create mode 100755 script/entrypoint_wrapper.sh diff --git a/.gitignore b/.gitignore index 991a0fb0..85fcfcc2 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ sftp-config.json # Python __pycache__ + +# IDE +.idea/ diff --git a/Dockerfile b/Dockerfile index 02782d0c..350d23e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,86 +1,13 @@ -# VERSION 1.10.9 -# AUTHOR: Matthieu "Puckel_" Roisil -# DESCRIPTION: Basic Airflow container -# BUILD: docker build --rm -t puckel/docker-airflow . -# SOURCE: https://github.com/puckel/docker-airflow +# Custom Dockerfile +FROM apache/airflow:1.10.11 -FROM python:3.7-slim-buster -LABEL maintainer="Puckel_" +# Install mssql support & dag dependencies +USER root -# Never prompt the user for choices on installation/configuration of packages -ENV DEBIAN_FRONTEND noninteractive -ENV TERM linux +COPY script/entrypoint_wrapper.sh /entrypoint_wrapper.sh +COPY config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg -# Airflow -ARG AIRFLOW_VERSION=1.10.9 -ARG AIRFLOW_USER_HOME=/usr/local/airflow -ARG AIRFLOW_DEPS="" -ARG PYTHON_DEPS="" -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} - -# Define en_US. 
-ENV LANGUAGE en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 -ENV LC_CTYPE en_US.UTF-8 -ENV LC_MESSAGES en_US.UTF-8 - -# Disable noisy "Handling signal" log messages: -# ENV GUNICORN_CMD_ARGS --log-level WARNING - -RUN set -ex \ - && buildDeps=' \ - freetds-dev \ - libkrb5-dev \ - libsasl2-dev \ - libssl-dev \ - libffi-dev \ - libpq-dev \ - git \ - ' \ - && apt-get update -yqq \ - && apt-get upgrade -yqq \ - && apt-get install -yqq --no-install-recommends \ - $buildDeps \ - freetds-bin \ - build-essential \ - default-libmysqlclient-dev \ - apt-utils \ - curl \ - rsync \ - netcat \ - locales \ - && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ - && locale-gen \ - && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ - && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ - && pip install -U pip setuptools wheel \ - && pip install pytz \ - && pip install pyOpenSSL \ - && pip install ndg-httpsclient \ - && pip install pyasn1 \ - && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ - && pip install 'redis==3.2' \ - && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ - && apt-get purge --auto-remove -yqq $buildDeps \ - && apt-get autoremove -yqq --purge \ - && apt-get clean \ - && rm -rf \ - /var/lib/apt/lists/* \ - /tmp/* \ - /var/tmp/* \ - /usr/share/man \ - /usr/share/doc \ - /usr/share/doc-base - -COPY script/entrypoint.sh /entrypoint.sh -COPY config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg - -RUN chown -R airflow: ${AIRFLOW_USER_HOME} - -EXPOSE 8080 5555 8793 +#EXPOSE 5555 USER airflow -WORKDIR ${AIRFLOW_USER_HOME} -ENTRYPOINT ["/entrypoint.sh"] -CMD ["webserver"] +ENTRYPOINT ["/entrypoint_wrapper.sh"] diff --git a/Dockerfile.old b/Dockerfile.old new file mode 100644 index 00000000..02782d0c --- /dev/null +++ b/Dockerfile.old @@ -0,0 +1,86 @@ +# VERSION 1.10.9 +# AUTHOR: Matthieu "Puckel_" Roisil +# DESCRIPTION: Basic Airflow container +# BUILD: docker build --rm -t puckel/docker-airflow . +# SOURCE: https://github.com/puckel/docker-airflow + +FROM python:3.7-slim-buster +LABEL maintainer="Puckel_" + +# Never prompt the user for choices on installation/configuration of packages +ENV DEBIAN_FRONTEND noninteractive +ENV TERM linux + +# Airflow +ARG AIRFLOW_VERSION=1.10.9 +ARG AIRFLOW_USER_HOME=/usr/local/airflow +ARG AIRFLOW_DEPS="" +ARG PYTHON_DEPS="" +ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} + +# Define en_US. 
+ENV LANGUAGE en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LC_ALL en_US.UTF-8 +ENV LC_CTYPE en_US.UTF-8 +ENV LC_MESSAGES en_US.UTF-8 + +# Disable noisy "Handling signal" log messages: +# ENV GUNICORN_CMD_ARGS --log-level WARNING + +RUN set -ex \ + && buildDeps=' \ + freetds-dev \ + libkrb5-dev \ + libsasl2-dev \ + libssl-dev \ + libffi-dev \ + libpq-dev \ + git \ + ' \ + && apt-get update -yqq \ + && apt-get upgrade -yqq \ + && apt-get install -yqq --no-install-recommends \ + $buildDeps \ + freetds-bin \ + build-essential \ + default-libmysqlclient-dev \ + apt-utils \ + curl \ + rsync \ + netcat \ + locales \ + && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ + && locale-gen \ + && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ + && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ + && pip install -U pip setuptools wheel \ + && pip install pytz \ + && pip install pyOpenSSL \ + && pip install ndg-httpsclient \ + && pip install pyasn1 \ + && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ + && pip install 'redis==3.2' \ + && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ + && apt-get purge --auto-remove -yqq $buildDeps \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf \ + /var/lib/apt/lists/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/man \ + /usr/share/doc \ + /usr/share/doc-base + +COPY script/entrypoint.sh /entrypoint.sh +COPY config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg + +RUN chown -R airflow: ${AIRFLOW_USER_HOME} + +EXPOSE 8080 5555 8793 + +USER airflow +WORKDIR ${AIRFLOW_USER_HOME} +ENTRYPOINT ["/entrypoint.sh"] +CMD ["webserver"] diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..70912627 --- /dev/null +++ b/Makefile @@ -0,0 +1,57 @@ +SERVICE = "scheduler" +TITLE = "Airflow Containers" +FILE = "docker-compose-LocalExecutor.yml" + +.PHONY: run + +build: + docker-compose -f $(FILE) build + +up: + @echo "Starting $(TITLE)" + docker-compose -f $(FILE) up -d + +upf: + @echo "Starting $(TITLE)" + docker-compose -f $(FILE) up + +down: + @echo "Stopping $(TITLE)" + docker-compose -f $(FILE) down + +restart: + @echo "Restarting $(TITLE)" + docker-compose -f $(FILE) restart + +downup: down print-newline up + +run: + docker-compose -f $(FILE) run --rm --entrypoint='' $(SERVICE) bash + +runr: + docker-compose -f $(FILE) run --rm --entrypoint='' -u root $(SERVICE) bash + +bash: + docker-compose -f $(FILE) exec $(SERVICE) bash + +bashr: + docker-compose -f $(FILE) exec -u root $(SERVICE) bash + +logs: + docker-compose -f $(FILE) logs --tail 50 --follow $(SERVICE) + +conf: + docker-compose -f $(FILE) config + +initdb: + docker-compose -f $(FILE) run --rm $(SERVICE) initdb + +upgradedb: + docker-compose -f $(FILE) run --rm $(SERVICE) upgradedb + +resetdb: + docker-compose -f $(FILE) run --rm $(SERVICE) resetdb + +print-newline: + @echo "" + @echo "" diff --git a/config/airflow.cfg b/config/airflow.cfg index 9e4d5229..71c6d947 100644 --- a/config/airflow.cfg +++ b/config/airflow.cfg @@ -1,11 +1,11 @@ [core] # The folder where your airflow pipelines live, most likely a # subfolder in a code repository. This path must be absolute. 
-dags_folder = /usr/local/airflow/dags +dags_folder = {AIRFLOW_HOME}/dags # The folder where airflow should store its log files # This path must be absolute -base_log_folder = /usr/local/airflow/logs +base_log_folder = {AIRFLOW_HOME}/logs # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. # Set this to True if you want to enable remote logging. @@ -34,17 +34,17 @@ logging_config_class = colored_console_log = True # Log format for when Colored logs is enabled -colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s +colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter # Format of Log line -log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s +log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s # Log filename format log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log log_processor_filename_template = {{ filename }}.log -dag_processor_manager_log_location = /usr/local/airflow/logs/dag_processor_manager/dag_processor_manager.log +dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log # Name of handler to read task instance logs. # Default to use task handler. @@ -71,7 +71,7 @@ executor = SequentialExecutor # The SqlAlchemy connection string to the metadata database. # SqlAlchemy supports many different database engine, more information # their website -# sql_alchemy_conn = sqlite:////tmp/airflow.db +sql_alchemy_conn = sqlite:///{AIRFLOW_HOME}/airflow.db # The encoding for the databases sql_engine_encoding = utf-8 @@ -110,6 +110,12 @@ sql_alchemy_pool_pre_ping = True # SqlAlchemy supports databases with the concept of multiple schemas. sql_alchemy_schema = +# Import path for connect args in SqlAlchemy. Default to an empty dict. +# This is useful when you want to configure db engine args that SqlAlchemy won't parse +# in connection string. +# See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args +# sql_alchemy_connect_args = + # The amount of parallelism as a setting to the executor. This defines # the max number of task instances that should run simultaneously # on this airflow installation @@ -124,16 +130,21 @@ dags_are_paused_at_creation = True # The maximum number of active DAG runs per DAG max_active_runs_per_dag = 16 -# Whether to load the examples that ship with Airflow. It's good to +# Whether to load the DAG examples that ship with Airflow. It's good to # get started, but you probably want to set this to False in a production # environment load_examples = True +# Whether to load the default connections that ship with Airflow. 
It's good to +# get started, but you probably want to set this to False in a production +# environment +load_default_connections = True + # Where your Airflow plugins are stored -plugins_folder = /usr/local/airflow/plugins +plugins_folder = {AIRFLOW_HOME}/plugins # Secret key to save connection passwords in the db -fernet_key = $FERNET_KEY +fernet_key = {FERNET_KEY} # Whether to disable pickling dags donot_pickle = False @@ -184,7 +195,7 @@ dag_discovery_safe_mode = True # The number of retries each task is going to have by default. Can be overridden at dag or task level. default_task_retries = 0 -# Whether to serialises DAGs and persist them in DB. +# Whether to serialise DAGs and persist them in DB. # If set to True, Webserver reads from DB instead of parsing DAG files # More details: https://airflow.apache.org/docs/stable/dag-serialization.html store_serialized_dags = False @@ -192,9 +203,35 @@ store_serialized_dags = False # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. min_serialized_dag_update_interval = 30 +# Whether to persist DAG files code in DB. +# If set to True, Webserver reads file contents from DB instead of +# trying to access files in a DAG folder. Defaults to same as the +# ``store_serialized_dags`` setting. +# Example: store_dag_code = False +# store_dag_code = + +# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store +# in the Database. +# When Dag Serialization is enabled (``store_serialized_dags=True``), all the template_fields +# for each of Task Instance are stored in the Database. +# Keeping this number small may cause an error when you try to view ``Rendered`` tab in +# TaskInstance view for older tasks. +max_num_rendered_ti_fields_per_task = 30 + # On each dagrun check against defined SLAs check_slas = True +[secrets] +# Full class name of secrets backend to enable (will precede env vars and metastore in search path) +# Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend +backend = + +# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. +# See documentation for the secrets backend you are using. JSON is expected. +# Example for AWS Systems Manager ParameterStore: +# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` +backend_kwargs = + [cli] # In what way should the cli access the API. The LocalClient will use the # database directly, while the json_client will use the api running on the @@ -212,8 +249,10 @@ endpoint_url = http://localhost:8080 fail_fast = False [api] -# How to authenticate users of the API -auth_backend = airflow.api.auth.backend.default +# How to authenticate users of the API. See +# https://airflow.apache.org/docs/stable/security.html for possible values. +# ("airflow.api.auth.backend.default" allows all requests for historic reasons) +auth_backend = airflow.api.auth.backend.deny_all [lineage] # what lineage backend to use @@ -245,6 +284,12 @@ default_hive_mapred_queue = # airflow sends to point links to the right web server base_url = http://localhost:8080 +# Default timezone to display all dates in the RBAC UI, can be UTC, system, or +# any IANA timezone string (e.g. Europe/Amsterdam). 
If left empty the +# default value of core/default_timezone will be used +# Example: default_ui_timezone = America/New_York +default_ui_timezone = UTC + # The ip specified when starting the web server web_server_host = 0.0.0.0 @@ -273,6 +318,10 @@ worker_refresh_batch_size = 1 # Number of seconds to wait before refreshing a batch of workers. worker_refresh_interval = 30 +# If set to True, Airflow will track files in plugins_folder directory. When it detects changes, +# then reload the gunicorn. +reload_on_plugin_change = False + # Secret key used to run your flask app # It should be as random as possible secret_key = temporary_key @@ -291,7 +340,7 @@ access_logfile = - error_logfile = - # Expose the configuration file in the web server -expose_config = True +expose_config = False # Expose hostname in the web server expose_hostname = True @@ -446,7 +495,7 @@ worker_concurrency = 16 # If autoscale option is available, worker_concurrency will be ignored. # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale # Example: worker_autoscale = 16,12 -worker_autoscale = 16,12 +# worker_autoscale = # When you start an airflow worker, airflow starts a tiny web server # subprocess to serve the workers local log files to the airflow main @@ -459,7 +508,7 @@ worker_log_server_port = 8793 # a sqlalchemy database. Refer to the Celery documentation for more # information. # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings -broker_url = redis://redis:6379/1 +broker_url = sqla+mysql://airflow:airflow@localhost:3306/airflow # The Celery result_backend. When a job finishes, it needs to update the # metadata of the job. Therefore it will post a message on a message bus, @@ -467,7 +516,7 @@ broker_url = redis://redis:6379/1 # This status is used by the scheduler to update the state of the task # The use of a database is highly recommended # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings -result_backend = db+postgresql://airflow:airflow@postgres/airflow +result_backend = db+mysql://airflow:airflow@localhost:3306/airflow # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start # it ``airflow flower``. This defines the IP that Celery Flower runs on @@ -574,7 +623,7 @@ print_stats_interval = 30 # ago (in seconds), scheduler is considered unhealthy. # This is used by the health check in the "/health" endpoint scheduler_health_check_threshold = 30 -child_process_log_directory = /usr/local/airflow/logs/scheduler +child_process_log_directory = {AIRFLOW_HOME}/logs/scheduler # Local task jobs periodically heartbeat to the DB. If the job has # not heartbeat in this many seconds, the scheduler will mark the @@ -708,7 +757,7 @@ hide_sensitive_variable_fields = True host = # Format of the log_id, which is used to query for a given tasks logs -log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}} +log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} # Used to mark the end of a log stream for a task end_of_log_mark = end_of_log @@ -734,12 +783,19 @@ verify_certs = True [kubernetes] # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run worker_container_repository = + +# Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. 
+pod_template_file = worker_container_tag = worker_container_image_pull_policy = IfNotPresent -# If True (default), worker pods will be deleted upon termination +# If True, all worker pods will be deleted upon termination delete_worker_pods = True +# If False (and delete_worker_pods is True), +# failed worker pods will not be deleted so users can investigate them. +delete_worker_pods_on_failure = False + # Number of Kubernetes Worker Pod creation calls per scheduler loop worker_pods_creation_batch_size = 1 @@ -782,6 +838,9 @@ dags_in_image = False # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs dags_volume_subpath = +# For either git sync or volume mounted DAGs, the worker will mount the volume in this path +dags_volume_mount_point = + # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) dags_volume_claim = @@ -810,6 +869,10 @@ env_from_secret_ref = # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) git_repo = git_branch = + +# Use a shallow clone with a history truncated to the specified number of commits. +# 0 - do not use shallow clone. +git_sync_depth = 1 git_subpath = # The specific rev or hash the git_sync init container will checkout @@ -824,7 +887,7 @@ git_sync_root = /git git_sync_dest = repo # Mount point of the volume if git-sync is being used. -# i.e. /usr/local/airflow/dags +# i.e. {AIRFLOW_HOME}/dags git_dags_folder_mount_point = # To get Git-sync SSH authentication set up follow this format @@ -931,10 +994,10 @@ tolerations = # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely # for kubernetes api responses, which will cause the scheduler to hang. # The timeout is specified as [connect timeout, read timeout] -kube_client_request_args = {{"_request_timeout" : [60,60] }} +kube_client_request_args = # Specifies the uid to run the first process of the worker pods containers as -run_as_user = +run_as_user = 50000 # Specifies a gid to associate with all containers in the worker pods # if using a git_ssh_key_secret_name use an fs_group diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml index 26e9e92e..6798156b 100644 --- a/docker-compose-LocalExecutor.yml +++ b/docker-compose-LocalExecutor.yml @@ -1,36 +1,63 @@ -version: '3.7' +version: '2.1' services: postgres: image: postgres:9.6 + container_name: af_postgres environment: - POSTGRES_USER=airflow - POSTGRES_PASSWORD=airflow - POSTGRES_DB=airflow - logging: - options: - max-size: 10m - max-file: "3" +# ports: +# - 5432:5432 webserver: - image: puckel/docker-airflow:1.10.9 - restart: always + extends: + file: docker-compose.airflow.yml + service: airflow + container_name: af_webserver + command: webserver depends_on: - postgres - environment: - - LOAD_EX=n - - EXECUTOR=Local - logging: - options: - max-size: 10m - max-file: "3" - volumes: - - ./dags:/usr/local/airflow/dags - # - ./plugins:/usr/local/airflow/plugins ports: - - "8080:8080" - command: webserver + - 8080:8080 +# networks: +# - proxy +# - default +# environment: +# # Web Server Config +# - AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW=graph +# - AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT=true +# - AIRFLOW__WEBSERVER__RBAC=true +# +# # Web Server Performance tweaks +# # 2 * NUM_CPU_CORES + 1 +# - AIRFLOW__WEBSERVER__WORKERS=5 +# # Restart workers every 30min instead of 30seconds +# - AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=1800 +# labels: +# - "traefik.enable=true" +# - 
"traefik.http.routers.airflow.rule=Host(`af.example.com`)" +# - "traefik.http.routers.airflow.middlewares=admin-auth@file" healthcheck: - test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] + test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] interval: 30s timeout: 30s retries: 3 + + scheduler: + extends: + file: docker-compose.airflow.yml + service: airflow + container_name: af_scheduler + command: scheduler + depends_on: + - postgres +# environment: +# # Performance Tweaks +# # Reduce how often DAGs are reloaded to dramatically reduce CPU use +# - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} +# - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} +# +#networks: +# proxy: +# external: true diff --git a/docker-compose.airflow.yml b/docker-compose.airflow.yml new file mode 100644 index 00000000..81904a56 --- /dev/null +++ b/docker-compose.airflow.yml @@ -0,0 +1,22 @@ +version: '2.1' +services: + airflow: +# image: apache/airflow:1.10.11 + build: + context: . +# args: +# - DOCKER_UID=${DOCKER_UID-1000} + dockerfile: Dockerfile + restart: always + environment: + - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow + - AIRFLOW__CORE__FERNET_KEY=KPr56n1SCB9uoco1cjT0Nyr-dKopjaRsNK0K_bBYpZQ= + - AIRFLOW__CORE__EXECUTOR=LocalExecutor + - AIRFLOW__CORE__DEFAULT_TIMEZONE=system + - AIRFLOW__CORE__LOAD_EXAMPLES=True + - AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=True + volumes: + - ./dags:/opt/airflow/dags:z +# - ./plugins:/opt/airflow/plugins:z +# - ./volumes/airflow_data_dump:/opt/airflow/data_dump:z +# - ./volumes/airflow_logs:/opt/airflow/logs:z diff --git a/script/entrypoint.sh b/script/entrypoint.sh.old similarity index 100% rename from script/entrypoint.sh rename to script/entrypoint.sh.old diff --git a/script/entrypoint_wrapper.sh b/script/entrypoint_wrapper.sh new file mode 100755 index 00000000..b0bf1fe5 --- /dev/null +++ b/script/entrypoint_wrapper.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Install custom python package if requirements.txt is present +if [[ -e "/requirements.txt" ]]; then + $(command -v pip) install --user -r /requirements.txt +fi + +case "$1" in + webserver|worker|flower) + # Give the scheduler time to run initdb. + sleep 10 + exec /entrypoint "$@" + ;; + scheduler) + airflow upgradedb + exec /entrypoint "$@" + ;; + bash|python) + exec /entrypoint "$@" + ;; + *) + # The command is something like bash, not an airflow subcommand. Just run it in the right environment. 
+ exec /entrypoint "$@" + ;; +esac From 796477601f5c6b64b8c2a42b155426ca74df0e82 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Fri, 24 Jul 2020 21:26:49 +0530 Subject: [PATCH 2/8] Celery config --- Dockerfile | 3 - README.md | 4 +- docker-compose-CeleryExecutor.yml | 121 +++++++++--------- docker-compose-LocalExecutor.yml | 6 +- docker-compose.airflow.celery.yml | 21 +++ ...ow.yml => docker-compose.airflow.local.yml | 0 script/entrypoint_wrapper.sh | 6 +- 7 files changed, 92 insertions(+), 69 deletions(-) create mode 100644 docker-compose.airflow.celery.yml rename docker-compose.airflow.yml => docker-compose.airflow.local.yml (100%) diff --git a/Dockerfile b/Dockerfile index 350d23e2..89ee74aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,6 @@ FROM apache/airflow:1.10.11 USER root COPY script/entrypoint_wrapper.sh /entrypoint_wrapper.sh -COPY config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg - -#EXPOSE 5555 USER airflow ENTRYPOINT ["/entrypoint_wrapper.sh"] diff --git a/README.md b/README.md index 922e51a7..68de0a13 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This repository contains **Dockerfile** of [apache-airflow](https://github.com/a ## Informations -* Based on Python (3.7-slim-buster) official Image [python:3.7-slim-buster](https://hub.docker.com/_/python/) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [Redis](https://hub.docker.com/_/redis/) as queue +* Based on Python (3.7-slim-buster) official Image [apache/airflow:1.10.11](https://hub.docker.com/r/apache/airflow) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [Redis](https://hub.docker.com/_/redis/) as queue * Install [Docker](https://www.docker.com/) * Install [Docker Compose](https://docs.docker.com/compose/install/) * Following the Airflow release from [Python Package Index](https://pypi.python.org/pypi/apache-airflow) @@ -163,7 +163,7 @@ it explicitly: | `REDIS_PROTO` | `redis://` | Protocol | | `REDIS_HOST` | `redis` | Redis server host | | `REDIS_PORT` | `6379` | Redis server port | -| `REDIS_PASSWORD` | empty | If Redis is password protected | +| `REDIS_PASSWORD` | `redispass` | If Redis is password protected | | `REDIS_DBNUM` | `1` | Database number | You can also use those variables to adapt your compose file to match an existing Redis instance managed elsewhere. diff --git a/docker-compose-CeleryExecutor.yml b/docker-compose-CeleryExecutor.yml index de4f5dac..a8360c46 100644 --- a/docker-compose-CeleryExecutor.yml +++ b/docker-compose-CeleryExecutor.yml @@ -2,91 +2,94 @@ version: '2.1' services: redis: image: 'redis:5.0.5' - # command: redis-server --requirepass redispass + container_name: af_redis + command: redis-server --requirepass redispass postgres: image: postgres:9.6 + container_name: af_postgres environment: - POSTGRES_USER=airflow - POSTGRES_PASSWORD=airflow - POSTGRES_DB=airflow - # Uncomment these lines to persist data on the local filesystem. - # - PGDATA=/var/lib/postgresql/data/pgdata - # volumes: - # - ./pgdata:/var/lib/postgresql/data/pgdata +# ports: +# - 5432:5432 +# Uncomment these lines to persist data on the local filesystem. 
+# - PGDATA=/var/lib/postgresql/data/pgdata +# volumes: +# - ./pgdata:/var/lib/postgresql/data/pgdata webserver: - image: puckel/docker-airflow:1.10.9 - restart: always + extends: + file: docker-compose.airflow.celery.yml + service: airflow + container_name: af_webserver + command: webserver depends_on: - postgres - - redis - environment: - - LOAD_EX=n - - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= - - EXECUTOR=Celery - # - POSTGRES_USER=airflow - # - POSTGRES_PASSWORD=airflow - # - POSTGRES_DB=airflow - # - REDIS_PASSWORD=redispass - volumes: - - ./dags:/usr/local/airflow/dags - # Uncomment to include custom plugins - # - ./plugins:/usr/local/airflow/plugins ports: - - "8080:8080" - command: webserver + - 8080:8080 +# networks: +# - proxy +# - default +# environment: +# # Web Server Config +# - AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW=graph +# - AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT=true +# - AIRFLOW__WEBSERVER__RBAC=true +# +# # Web Server Performance tweaks +# # 2 * NUM_CPU_CORES + 1 +# - AIRFLOW__WEBSERVER__WORKERS=5 +# # Restart workers every 30min instead of 30seconds +# - AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=1800 +# labels: +# - "traefik.enable=true" +# - "traefik.http.routers.airflow.rule=Host(`af.example.com`)" +# - "traefik.http.routers.airflow.middlewares=admin-auth@file" healthcheck: - test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] + test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"] interval: 30s timeout: 30s retries: 3 flower: - image: puckel/docker-airflow:1.10.9 + extends: + file: docker-compose.airflow.celery.yml + service: airflow + container_name: af_flower restart: always depends_on: - redis - environment: - - EXECUTOR=Celery - # - REDIS_PASSWORD=redispass ports: - - "5555:5555" + - 5555:5555 command: flower scheduler: - image: puckel/docker-airflow:1.10.9 - restart: always - depends_on: - - webserver - volumes: - - ./dags:/usr/local/airflow/dags - # Uncomment to include custom plugins - # - ./plugins:/usr/local/airflow/plugins - environment: - - LOAD_EX=n - - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= - - EXECUTOR=Celery - # - POSTGRES_USER=airflow - # - POSTGRES_PASSWORD=airflow - # - POSTGRES_DB=airflow - # - REDIS_PASSWORD=redispass + extends: + file: docker-compose.airflow.celery.yml + service: airflow + container_name: af_scheduler command: scheduler + depends_on: + - postgres + - redis +# environment: +# # Performance Tweaks +# # Reduce how often DAGs are reloaded to dramatically reduce CPU use +# - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} +# - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} worker: - image: puckel/docker-airflow:1.10.9 - restart: always - depends_on: - - scheduler - volumes: - - ./dags:/usr/local/airflow/dags - # Uncomment to include custom plugins - # - ./plugins:/usr/local/airflow/plugins - environment: - - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho= - - EXECUTOR=Celery - # - POSTGRES_USER=airflow - # - POSTGRES_PASSWORD=airflow - # - POSTGRES_DB=airflow - # - REDIS_PASSWORD=redispass + extends: + file: docker-compose.airflow.celery.yml + service: airflow + container_name: af_worker command: worker + depends_on: + - postgres + - redis + +#networks: +# proxy: +# external: true diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml index 6798156b..90c08b4a 100644 --- a/docker-compose-LocalExecutor.yml +++ b/docker-compose-LocalExecutor.yml @@ -12,7 +12,7 @@ services: webserver: extends: - 
file: docker-compose.airflow.yml + file: docker-compose.airflow.local.yml service: airflow container_name: af_webserver command: webserver @@ -46,7 +46,7 @@ services: scheduler: extends: - file: docker-compose.airflow.yml + file: docker-compose.airflow.local.yml service: airflow container_name: af_scheduler command: scheduler @@ -57,7 +57,7 @@ services: # # Reduce how often DAGs are reloaded to dramatically reduce CPU use # - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} # - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} -# + #networks: # proxy: # external: true diff --git a/docker-compose.airflow.celery.yml b/docker-compose.airflow.celery.yml new file mode 100644 index 00000000..040a6275 --- /dev/null +++ b/docker-compose.airflow.celery.yml @@ -0,0 +1,21 @@ +version: '2.1' +services: + airflow: + build: + context: . + dockerfile: Dockerfile + restart: always + environment: + - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow + - AIRFLOW__CORE__FERNET_KEY=KPr56n1SCB9uoco1cjT0Nyr-dKopjaRsNK0K_bBYpZQ= + - AIRFLOW__CORE__DEFAULT_TIMEZONE=system + - AIRFLOW__CORE__LOAD_EXAMPLES=True + - AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=True + - AIRFLOW__CORE__EXECUTOR=CeleryExecutor + - AIRFLOW__CELERY__BROKER_URL=redis://:redispass@redis:6379/0 + - AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow@postgres:5432/airflow + volumes: + - ./dags:/opt/airflow/dags:z +# - ./plugins:/opt/airflow/plugins:z +# - ./volumes/airflow_data_dump:/opt/airflow/data_dump:z +# - ./volumes/airflow_logs:/opt/airflow/logs:z diff --git a/docker-compose.airflow.yml b/docker-compose.airflow.local.yml similarity index 100% rename from docker-compose.airflow.yml rename to docker-compose.airflow.local.yml diff --git a/script/entrypoint_wrapper.sh b/script/entrypoint_wrapper.sh index b0bf1fe5..0755fa4e 100755 --- a/script/entrypoint_wrapper.sh +++ b/script/entrypoint_wrapper.sh @@ -7,11 +7,13 @@ fi case "$1" in webserver|worker|flower) - # Give the scheduler time to run initdb. - sleep 10 + # Give the scheduler time to run upgradedb. + sleep 20 exec /entrypoint "$@" ;; scheduler) + # Give postgres time to come up. + sleep 10 airflow upgradedb exec /entrypoint "$@" ;; From 66072ee1b41c91f2aaf41703b099052f4d138e2e Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 25 Jul 2020 00:16:33 +0530 Subject: [PATCH 3/8] option to give custom config file --- Dockerfile | 2 +- config/airflow.cfg | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 89ee74aa..81672939 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,10 @@ # Custom Dockerfile FROM apache/airflow:1.10.11 -# Install mssql support & dag dependencies USER root COPY script/entrypoint_wrapper.sh /entrypoint_wrapper.sh +COPY config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg USER airflow ENTRYPOINT ["/entrypoint_wrapper.sh"] diff --git a/config/airflow.cfg b/config/airflow.cfg index 71c6d947..88baf92b 100644 --- a/config/airflow.cfg +++ b/config/airflow.cfg @@ -1,11 +1,11 @@ [core] # The folder where your airflow pipelines live, most likely a # subfolder in a code repository. This path must be absolute. -dags_folder = {AIRFLOW_HOME}/dags +dags_folder = /opt/airflow/dags # The folder where airflow should store its log files # This path must be absolute -base_log_folder = {AIRFLOW_HOME}/logs +base_log_folder = /opt/airflow/logs # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 
# Set this to True if you want to enable remote logging. @@ -44,7 +44,7 @@ simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s # Log filename format log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log log_processor_filename_template = {{ filename }}.log -dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log +dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log # Name of handler to read task instance logs. # Default to use task handler. @@ -71,7 +71,7 @@ executor = SequentialExecutor # The SqlAlchemy connection string to the metadata database. # SqlAlchemy supports many different database engine, more information # their website -sql_alchemy_conn = sqlite:///{AIRFLOW_HOME}/airflow.db +sql_alchemy_conn = sqlite:////opt/airflow/airflow.db # The encoding for the databases sql_engine_encoding = utf-8 @@ -141,10 +141,10 @@ load_examples = True load_default_connections = True # Where your Airflow plugins are stored -plugins_folder = {AIRFLOW_HOME}/plugins +plugins_folder = /opt/airflow/plugins # Secret key to save connection passwords in the db -fernet_key = {FERNET_KEY} +fernet_key = $FERNET_KEY # Whether to disable pickling dags donot_pickle = False @@ -623,7 +623,7 @@ print_stats_interval = 30 # ago (in seconds), scheduler is considered unhealthy. # This is used by the health check in the "/health" endpoint scheduler_health_check_threshold = 30 -child_process_log_directory = {AIRFLOW_HOME}/logs/scheduler +child_process_log_directory = /opt/airflow/logs/scheduler # Local task jobs periodically heartbeat to the DB. If the job has # not heartbeat in this many seconds, the scheduler will mark the @@ -887,7 +887,7 @@ git_sync_root = /git git_sync_dest = repo # Mount point of the volume if git-sync is being used. -# i.e. {AIRFLOW_HOME}/dags +# i.e. /opt/airflow/dags git_dags_folder_mount_point = # To get Git-sync SSH authentication set up follow this format From cb2fb21c92e19501d4fcea265d940e0fab484f30 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 29 Aug 2020 01:26:02 +0530 Subject: [PATCH 4/8] add enhancements --- .dockerignore | 5 ++ .github/workflows/ci.yml | 6 +- Dockerfile | 31 ++++++- Dockerfile.old | 86 ------------------- Makefile | 41 +++++---- docker-compose-CeleryExecutor.yml | 6 +- docker-compose-LocalExecutor.yml | 27 ++---- docker-compose.airflow.celery.yml | 2 +- docker-compose.airflow.local.yml | 3 - script/entrypoint.sh.old | 135 ------------------------------ script/entrypoint_wrapper.sh | 12 ++- 11 files changed, 87 insertions(+), 267 deletions(-) delete mode 100644 Dockerfile.old delete mode 100755 script/entrypoint.sh.old diff --git a/.dockerignore b/.dockerignore index 6b8710a7..15a7516c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,6 @@ .git +.github +*.yml +Makefile +README.md +LICENSE \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7a4f851c..f154c718 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,7 +6,7 @@ on: - master pull_request: branches: - - master + - master jobs: ci: @@ -14,5 +14,5 @@ jobs: steps: - uses: actions/checkout@v1 - run: docker build -t "${PWD##*/}" . 
- - run: docker run "${PWD##*/}" python -V - - run: docker run "${PWD##*/}" version + - run: docker run "${PWD##*/}" python -V | grep '3.6' + - run: docker run "${PWD##*/}" version | grep '1.10.11' diff --git a/Dockerfile b/Dockerfile index 81672939..367e3fa0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,39 @@ -# Custom Dockerfile +# VERSION 1.10.11 +# AUTHOR: Swapnil Gusani +# DESCRIPTION: Basic Airflow container +# BUILD: docker build --rm -t swapniel99/docker-airflow . +# SOURCE: https://github.com/swapniel99/docker-airflow + FROM apache/airflow:1.10.11 USER root +##System upgrade +#RUN set -ex \ +# # Upgrade packages +# && apt-get update -yqq \ +# && apt-get upgrade -yqq \ +# && pip install --upgrade pip \ +# # Cleanup +# && apt-get autoremove -yqq --purge \ +# && apt-get clean \ +# && rm -rf \ +# /var/lib/apt/lists/* \ +# /root/.cache/pip \ +# /tmp/* \ +# /var/tmp/* \ +# /usr/share/man \ +# /usr/share/doc \ +# /usr/share/doc-base + +# Copy Config Files COPY script/entrypoint_wrapper.sh /entrypoint_wrapper.sh COPY config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg +# Make airflow user owner +RUN chown -R airflow: ${AIRFLOW_HOME} + +EXPOSE 5555 8793 + USER airflow ENTRYPOINT ["/entrypoint_wrapper.sh"] diff --git a/Dockerfile.old b/Dockerfile.old deleted file mode 100644 index 02782d0c..00000000 --- a/Dockerfile.old +++ /dev/null @@ -1,86 +0,0 @@ -# VERSION 1.10.9 -# AUTHOR: Matthieu "Puckel_" Roisil -# DESCRIPTION: Basic Airflow container -# BUILD: docker build --rm -t puckel/docker-airflow . -# SOURCE: https://github.com/puckel/docker-airflow - -FROM python:3.7-slim-buster -LABEL maintainer="Puckel_" - -# Never prompt the user for choices on installation/configuration of packages -ENV DEBIAN_FRONTEND noninteractive -ENV TERM linux - -# Airflow -ARG AIRFLOW_VERSION=1.10.9 -ARG AIRFLOW_USER_HOME=/usr/local/airflow -ARG AIRFLOW_DEPS="" -ARG PYTHON_DEPS="" -ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} - -# Define en_US. 
-ENV LANGUAGE en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 -ENV LC_CTYPE en_US.UTF-8 -ENV LC_MESSAGES en_US.UTF-8 - -# Disable noisy "Handling signal" log messages: -# ENV GUNICORN_CMD_ARGS --log-level WARNING - -RUN set -ex \ - && buildDeps=' \ - freetds-dev \ - libkrb5-dev \ - libsasl2-dev \ - libssl-dev \ - libffi-dev \ - libpq-dev \ - git \ - ' \ - && apt-get update -yqq \ - && apt-get upgrade -yqq \ - && apt-get install -yqq --no-install-recommends \ - $buildDeps \ - freetds-bin \ - build-essential \ - default-libmysqlclient-dev \ - apt-utils \ - curl \ - rsync \ - netcat \ - locales \ - && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ - && locale-gen \ - && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ - && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ - && pip install -U pip setuptools wheel \ - && pip install pytz \ - && pip install pyOpenSSL \ - && pip install ndg-httpsclient \ - && pip install pyasn1 \ - && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ - && pip install 'redis==3.2' \ - && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ - && apt-get purge --auto-remove -yqq $buildDeps \ - && apt-get autoremove -yqq --purge \ - && apt-get clean \ - && rm -rf \ - /var/lib/apt/lists/* \ - /tmp/* \ - /var/tmp/* \ - /usr/share/man \ - /usr/share/doc \ - /usr/share/doc-base - -COPY script/entrypoint.sh /entrypoint.sh -COPY config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg - -RUN chown -R airflow: ${AIRFLOW_USER_HOME} - -EXPOSE 8080 5555 8793 - -USER airflow -WORKDIR ${AIRFLOW_USER_HOME} -ENTRYPOINT ["/entrypoint.sh"] -CMD ["webserver"] diff --git a/Makefile b/Makefile index 70912627..bd7df7bf 100644 --- a/Makefile +++ b/Makefile @@ -5,52 +5,65 @@ FILE = "docker-compose-LocalExecutor.yml" .PHONY: run build: - docker-compose -f $(FILE) build + docker build -t docker-airflow . 
+ docker tag docker-airflow-ascent:latest swapniel99/docker-airflow:latest up: @echo "Starting $(TITLE)" - docker-compose -f $(FILE) up -d + docker-compose -f $(FILE) up -d upf: @echo "Starting $(TITLE)" - docker-compose -f $(FILE) up + docker-compose -f $(FILE) up down: @echo "Stopping $(TITLE)" - docker-compose -f $(FILE) down + docker-compose -f $(FILE) down + +start: + @echo "Starting $(TITLE)" + docker-compose -f $(FILE) start + +stop: + @echo "Stopping $(TITLE)" + docker-compose -f $(FILE) stop restart: @echo "Restarting $(TITLE)" - docker-compose -f $(FILE) restart + docker-compose -f $(FILE) restart downup: down print-newline up run: - docker-compose -f $(FILE) run --rm --entrypoint='' $(SERVICE) bash + docker-compose -f $(FILE) run --rm --entrypoint='' $(SERVICE) bash runr: - docker-compose -f $(FILE) run --rm --entrypoint='' -u root $(SERVICE) bash + docker-compose -f $(FILE) run --rm --entrypoint='' -u root $(SERVICE) bash bash: - docker-compose -f $(FILE) exec $(SERVICE) bash + docker-compose -f $(FILE) exec $(SERVICE) bash bashr: - docker-compose -f $(FILE) exec -u root $(SERVICE) bash + docker-compose -f $(FILE) exec -u root $(SERVICE) bash logs: - docker-compose -f $(FILE) logs --tail 50 --follow $(SERVICE) + docker-compose -f $(FILE) logs --tail 50 --follow $(SERVICE) conf: - docker-compose -f $(FILE) config + docker-compose -f $(FILE) config initdb: - docker-compose -f $(FILE) run --rm $(SERVICE) initdb + docker-compose -f $(FILE) run --rm $(SERVICE) airflow initdb upgradedb: - docker-compose -f $(FILE) run --rm $(SERVICE) upgradedb + docker-compose -f $(FILE) run --rm $(SERVICE) airflow upgradedb resetdb: - docker-compose -f $(FILE) run --rm $(SERVICE) resetdb + docker-compose -f $(FILE) run --rm $(SERVICE) airflow resetdb + +rbacadmin: + # Change user details and password after login if using RBAC mode. 
+ docker-compose -f $(FILE) exec webserver airflow create_user -r Admin -u admin -e admin@example.com -f Firstname -l Lastname -p admin123 print-newline: @echo "" diff --git a/docker-compose-CeleryExecutor.yml b/docker-compose-CeleryExecutor.yml index a8360c46..a717efd8 100644 --- a/docker-compose-CeleryExecutor.yml +++ b/docker-compose-CeleryExecutor.yml @@ -1,12 +1,12 @@ version: '2.1' services: redis: - image: 'redis:5.0.5' + image: redis:latest container_name: af_redis command: redis-server --requirepass redispass postgres: - image: postgres:9.6 + image: postgres:latest container_name: af_postgres environment: - POSTGRES_USER=airflow @@ -75,7 +75,7 @@ services: - postgres - redis # environment: -# # Performance Tweaks +# # Scheduler Performance Tweaks # # Reduce how often DAGs are reloaded to dramatically reduce CPU use # - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} # - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml index 90c08b4a..cf2bfcf4 100644 --- a/docker-compose-LocalExecutor.yml +++ b/docker-compose-LocalExecutor.yml @@ -1,7 +1,7 @@ version: '2.1' services: postgres: - image: postgres:9.6 + image: postgres:latest container_name: af_postgres environment: - POSTGRES_USER=airflow @@ -10,12 +10,12 @@ services: # ports: # - 5432:5432 - webserver: + scheduler: extends: file: docker-compose.airflow.local.yml service: airflow - container_name: af_webserver - command: webserver + container_name: af_scheduler + command: scheduler depends_on: - postgres ports: @@ -34,6 +34,11 @@ services: # - AIRFLOW__WEBSERVER__WORKERS=5 # # Restart workers every 30min instead of 30seconds # - AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=1800 +# +# # Scheduler Performance Tweaks +# # Reduce how often DAGs are reloaded to dramatically reduce CPU use +# - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} +# - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} # labels: # - "traefik.enable=true" # - "traefik.http.routers.airflow.rule=Host(`af.example.com`)" @@ -44,20 +49,6 @@ services: timeout: 30s retries: 3 - scheduler: - extends: - file: docker-compose.airflow.local.yml - service: airflow - container_name: af_scheduler - command: scheduler - depends_on: - - postgres -# environment: -# # Performance Tweaks -# # Reduce how often DAGs are reloaded to dramatically reduce CPU use -# - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=${AF_MIN_FILE_PROCESS_INTERVAL-60} -# - AIRFLOW__SCHEDULER__MAX_THREADS=${AF_THREADS-1} - #networks: # proxy: # external: true diff --git a/docker-compose.airflow.celery.yml b/docker-compose.airflow.celery.yml index 040a6275..0d96840a 100644 --- a/docker-compose.airflow.celery.yml +++ b/docker-compose.airflow.celery.yml @@ -8,10 +8,10 @@ services: environment: - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow - AIRFLOW__CORE__FERNET_KEY=KPr56n1SCB9uoco1cjT0Nyr-dKopjaRsNK0K_bBYpZQ= + - AIRFLOW__CORE__EXECUTOR=CeleryExecutor - AIRFLOW__CORE__DEFAULT_TIMEZONE=system - AIRFLOW__CORE__LOAD_EXAMPLES=True - AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=True - - AIRFLOW__CORE__EXECUTOR=CeleryExecutor - AIRFLOW__CELERY__BROKER_URL=redis://:redispass@redis:6379/0 - AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow@postgres:5432/airflow volumes: diff --git a/docker-compose.airflow.local.yml b/docker-compose.airflow.local.yml index 81904a56..b259ec56 100644 --- a/docker-compose.airflow.local.yml +++ 
b/docker-compose.airflow.local.yml @@ -1,11 +1,8 @@ version: '2.1' services: airflow: -# image: apache/airflow:1.10.11 build: context: . -# args: -# - DOCKER_UID=${DOCKER_UID-1000} dockerfile: Dockerfile restart: always environment: diff --git a/script/entrypoint.sh.old b/script/entrypoint.sh.old deleted file mode 100755 index 166f4837..00000000 --- a/script/entrypoint.sh.old +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash - -# User-provided configuration must always be respected. -# -# Therefore, this script must only derives Airflow AIRFLOW__ variables from other variables -# when the user did not provide their own configuration. - -TRY_LOOP="20" - -# Global defaults and back-compat -: "${AIRFLOW_HOME:="/usr/local/airflow"}" -: "${AIRFLOW__CORE__FERNET_KEY:=${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}}" -: "${AIRFLOW__CORE__EXECUTOR:=${EXECUTOR:-Sequential}Executor}" - -# Load DAGs examples (default: Yes) -if [[ -z "$AIRFLOW__CORE__LOAD_EXAMPLES" && "${LOAD_EX:=n}" == n ]]; then - AIRFLOW__CORE__LOAD_EXAMPLES=False -fi - -export \ - AIRFLOW_HOME \ - AIRFLOW__CORE__EXECUTOR \ - AIRFLOW__CORE__FERNET_KEY \ - AIRFLOW__CORE__LOAD_EXAMPLES \ - -# Install custom python package if requirements.txt is present -if [ -e "/requirements.txt" ]; then - $(command -v pip) install --user -r /requirements.txt -fi - -wait_for_port() { - local name="$1" host="$2" port="$3" - local j=0 - while ! nc -z "$host" "$port" >/dev/null 2>&1 < /dev/null; do - j=$((j+1)) - if [ $j -ge $TRY_LOOP ]; then - echo >&2 "$(date) - $host:$port still not reachable, giving up" - exit 1 - fi - echo "$(date) - waiting for $name... $j/$TRY_LOOP" - sleep 5 - done -} - -# Other executors than SequentialExecutor drive the need for an SQL database, here PostgreSQL is used -if [ "$AIRFLOW__CORE__EXECUTOR" != "SequentialExecutor" ]; then - # Check if the user has provided explicit Airflow configuration concerning the database - if [ -z "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" ]; then - # Default values corresponding to the default compose files - : "${POSTGRES_HOST:="postgres"}" - : "${POSTGRES_PORT:="5432"}" - : "${POSTGRES_USER:="airflow"}" - : "${POSTGRES_PASSWORD:="airflow"}" - : "${POSTGRES_DB:="airflow"}" - : "${POSTGRES_EXTRAS:-""}" - - AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" - export AIRFLOW__CORE__SQL_ALCHEMY_CONN - - # Check if the user has provided explicit Airflow configuration for the broker's connection to the database - if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then - AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" - export AIRFLOW__CELERY__RESULT_BACKEND - fi - else - if [[ "$AIRFLOW__CORE__EXECUTOR" == "CeleryExecutor" && -z "$AIRFLOW__CELERY__RESULT_BACKEND" ]]; then - >&2 printf '%s\n' "FATAL: if you set AIRFLOW__CORE__SQL_ALCHEMY_CONN manually with CeleryExecutor you must also set AIRFLOW__CELERY__RESULT_BACKEND" - exit 1 - fi - - # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user - POSTGRES_ENDPOINT=$(echo -n "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" | cut -d '/' -f3 | sed -e 's,.*@,,') - POSTGRES_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1) - POSTGRES_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) - fi - - wait_for_port "Postgres" "$POSTGRES_HOST" 
"$POSTGRES_PORT" -fi - -# CeleryExecutor drives the need for a Celery broker, here Redis is used -if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then - # Check if the user has provided explicit Airflow configuration concerning the broker - if [ -z "$AIRFLOW__CELERY__BROKER_URL" ]; then - # Default values corresponding to the default compose files - : "${REDIS_PROTO:="redis://"}" - : "${REDIS_HOST:="redis"}" - : "${REDIS_PORT:="6379"}" - : "${REDIS_PASSWORD:=""}" - : "${REDIS_DBNUM:="1"}" - - # When Redis is secured by basic auth, it does not handle the username part of basic auth, only a token - if [ -n "$REDIS_PASSWORD" ]; then - REDIS_PREFIX=":${REDIS_PASSWORD}@" - else - REDIS_PREFIX= - fi - - AIRFLOW__CELERY__BROKER_URL="${REDIS_PROTO}${REDIS_PREFIX}${REDIS_HOST}:${REDIS_PORT}/${REDIS_DBNUM}" - export AIRFLOW__CELERY__BROKER_URL - else - # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user - REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,') - REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1) - REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) - fi - - wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT" -fi - -case "$1" in - webserver) - airflow initdb - if [ "$AIRFLOW__CORE__EXECUTOR" = "LocalExecutor" ] || [ "$AIRFLOW__CORE__EXECUTOR" = "SequentialExecutor" ]; then - # With the "Local" and "Sequential" executors it should all run in one container. - airflow scheduler & - fi - exec airflow webserver - ;; - worker|scheduler) - # Give the webserver time to run initdb. - sleep 10 - exec airflow "$@" - ;; - flower) - sleep 10 - exec airflow "$@" - ;; - version) - exec airflow "$@" - ;; - *) - # The command is something like bash, not an airflow subcommand. Just run it in the right environment. - exec "$@" - ;; -esac diff --git a/script/entrypoint_wrapper.sh b/script/entrypoint_wrapper.sh index 0755fa4e..36c7936d 100755 --- a/script/entrypoint_wrapper.sh +++ b/script/entrypoint_wrapper.sh @@ -8,13 +8,19 @@ fi case "$1" in webserver|worker|flower) # Give the scheduler time to run upgradedb. - sleep 20 + sleep 10 exec /entrypoint "$@" ;; scheduler) - # Give postgres time to come up. - sleep 10 + echo "Attempting upgradedb command.." + # In upgradedb default connections are not populated. Use "airflow initdb" instead for default connections. airflow upgradedb + if [[ "$AIRFLOW__CORE__EXECUTOR" = "LocalExecutor" ]] || [[ "$AIRFLOW__CORE__EXECUTOR" = "SequentialExecutor" ]]; + then + # Running webserver in scheduler instead of reverse to maintain consistency in Makefile. + # With the "Local" and "Sequential" executors it should all run in one container. 
+ airflow webserver & + fi exec /entrypoint "$@" ;; bash|python) From 8b2a19b88c8ac53b3fb4176b037307b68648f636 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 29 Aug 2020 01:31:21 +0530 Subject: [PATCH 5/8] bugfix in makefile --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index bd7df7bf..5d198de2 100644 --- a/Makefile +++ b/Makefile @@ -53,13 +53,13 @@ conf: docker-compose -f $(FILE) config initdb: - docker-compose -f $(FILE) run --rm $(SERVICE) airflow initdb + docker-compose -f $(FILE) run --rm $(SERVICE) initdb upgradedb: - docker-compose -f $(FILE) run --rm $(SERVICE) airflow upgradedb + docker-compose -f $(FILE) run --rm $(SERVICE) upgradedb resetdb: - docker-compose -f $(FILE) run --rm $(SERVICE) airflow resetdb + docker-compose -f $(FILE) run --rm $(SERVICE) resetdb rbacadmin: # Change user details and password after login if using RBAC mode. From b28dc05fa401d2fbbd5608d3770d4296f6144f12 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 29 Aug 2020 01:40:09 +0530 Subject: [PATCH 6/8] readme --- README.md | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 68de0a13..6fe3c2f9 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,17 @@ # docker-airflow -[![CI status](https://github.com/puckel/docker-airflow/workflows/CI/badge.svg?branch=master)](https://github.com/puckel/docker-airflow/actions?query=workflow%3ACI+branch%3Amaster+event%3Apush) -[![Docker Build status](https://img.shields.io/docker/build/puckel/docker-airflow?style=plastic)](https://hub.docker.com/r/puckel/docker-airflow/tags?ordering=last_updated) +[![CI status](https://github.com/swapniel99/docker-airflow/workflows/CI/badge.svg?branch=master)](https://github.com/swapniel99/docker-airflow/actions?query=workflow%3ACI+branch%3Amaster+event%3Apush) +[![Docker Build status](https://img.shields.io/docker/build/swapniel99/docker-airflow?style=plastic)](https://hub.docker.com/r/swapniel99/docker-airflow/tags?ordering=last_updated) -[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/puckel/docker-airflow/) -[![Docker Pulls](https://img.shields.io/docker/pulls/puckel/docker-airflow.svg)]() -[![Docker Stars](https://img.shields.io/docker/stars/puckel/docker-airflow.svg)]() +[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/swapniel99/docker-airflow/) +[![Docker Pulls](https://img.shields.io/docker/pulls/swapniel99/docker-airflow.svg)]() +[![Docker Stars](https://img.shields.io/docker/stars/swapniel99/docker-airflow.svg)]() -This repository contains **Dockerfile** of [apache-airflow](https://github.com/apache/incubator-airflow) for [Docker](https://www.docker.com/)'s [automated build](https://registry.hub.docker.com/u/puckel/docker-airflow/) published to the public [Docker Hub Registry](https://registry.hub.docker.com/). +(Readme not fully updated.q) +This repository contains **Dockerfile** of [apache-airflow](https://github.com/apache/incubator-airflow) for [Docker](https://www.docker.com/)'s [automated build](https://registry.hub.docker.com/u/swapniel99/docker-airflow/) published to the public [Docker Hub Registry](https://registry.hub.docker.com/). 
## Informations -* Based on Python (3.7-slim-buster) official Image [apache/airflow:1.10.11](https://hub.docker.com/r/apache/airflow) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [Redis](https://hub.docker.com/_/redis/) as queue +* Based on Python (3.6-slim-buster) official Image [apache/airflow:1.10.11](https://hub.docker.com/r/apache/airflow) and uses the official [Postgres](https://hub.docker.com/_/postgres/) as backend and [Redis](https://hub.docker.com/_/redis/) as queue * Install [Docker](https://www.docker.com/) * Install [Docker Compose](https://docs.docker.com/compose/install/) * Following the Airflow release from [Python Package Index](https://pypi.python.org/pypi/apache-airflow) @@ -19,26 +20,26 @@ This repository contains **Dockerfile** of [apache-airflow](https://github.com/a Pull the image from the Docker repository. - docker pull puckel/docker-airflow + docker pull swapniel99/docker-airflow ## Build Optionally install [Extra Airflow Packages](https://airflow.incubator.apache.org/installation.html#extra-package) and/or python dependencies at build time : - docker build --rm --build-arg AIRFLOW_DEPS="datadog,dask" -t puckel/docker-airflow . - docker build --rm --build-arg PYTHON_DEPS="flask_oauthlib>=0.9" -t puckel/docker-airflow . + docker build --rm --build-arg AIRFLOW_DEPS="datadog,dask" -t swapniel99/docker-airflow . + docker build --rm --build-arg PYTHON_DEPS="flask_oauthlib>=0.9" -t swapniel99/docker-airflow . or combined - docker build --rm --build-arg AIRFLOW_DEPS="datadog,dask" --build-arg PYTHON_DEPS="flask_oauthlib>=0.9" -t puckel/docker-airflow . + docker build --rm --build-arg AIRFLOW_DEPS="datadog,dask" --build-arg PYTHON_DEPS="flask_oauthlib>=0.9" -t swapniel99/docker-airflow . -Don't forget to update the airflow images in the docker-compose files to puckel/docker-airflow:latest. +Don't forget to update the airflow images in the docker-compose files to swapniel99/docker-airflow:latest. ## Usage By default, docker-airflow runs Airflow with **SequentialExecutor** : - docker run -d -p 8080:8080 puckel/docker-airflow webserver + docker run -d -p 8080:8080 swapniel99/docker-airflow webserver If you want to run another executor, use the other docker-compose.yml files provided in this repository. @@ -54,7 +55,7 @@ NB : If you want to have DAGs example loaded (default=False), you've to set the `LOAD_EX=n` - docker run -d -p 8080:8080 -e LOAD_EX=y puckel/docker-airflow + docker run -d -p 8080:8080 -e LOAD_EX=y swapniel99/docker-airflow If you want to use Ad hoc query, make sure you've configured connections: Go to Admin -> Connections and Edit "postgres_default" set this values (equivalent to values in airflow.cfg/docker-compose*.yml) : @@ -65,7 +66,7 @@ Go to Admin -> Connections and Edit "postgres_default" set this values (equivale For encrypted connection passwords (in Local or Celery Executor), you must have the same fernet_key. By default docker-airflow generates the fernet_key at startup, you have to set an environment variable in the docker-compose (ie: docker-compose-LocalExecutor.yml) file to set the same key accross containers. 
To generate a fernet_key : - docker run puckel/docker-airflow python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)" + docker run swapniel99/docker-airflow python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)" ## Configuring Airflow @@ -111,7 +112,7 @@ This can be used to scale to a multi node setup using docker swarm. If you want to run other airflow sub-commands, such as `list_dags` or `clear` you can do so like this: - docker run --rm -ti puckel/docker-airflow airflow list_dags + docker run --rm -ti swapniel99/docker-airflow airflow list_dags or with your docker-compose set up like this: @@ -119,8 +120,8 @@ or with your docker-compose set up like this: You can also use this to run a bash shell or any other command in the same environment that airflow would be run in: - docker run --rm -ti puckel/docker-airflow bash - docker run --rm -ti puckel/docker-airflow ipython + docker run --rm -ti swapniel99/docker-airflow bash + docker run --rm -ti swapniel99/docker-airflow ipython # Simplified SQL database configuration using PostgreSQL From c817e641df1920a6ca7a4be66a7ee881ec6ea8c5 Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 29 Aug 2020 01:59:38 +0530 Subject: [PATCH 7/8] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6fe3c2f9..f2edb939 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ [![Docker Pulls](https://img.shields.io/docker/pulls/swapniel99/docker-airflow.svg)]() [![Docker Stars](https://img.shields.io/docker/stars/swapniel99/docker-airflow.svg)]() -(Readme not fully updated.q) +(Readme not fully updated.) + This repository contains **Dockerfile** of [apache-airflow](https://github.com/apache/incubator-airflow) for [Docker](https://www.docker.com/)'s [automated build](https://registry.hub.docker.com/u/swapniel99/docker-airflow/) published to the public [Docker Hub Registry](https://registry.hub.docker.com/). ## Informations From f6afa2f9b0b771e47df9b9723e539d9214268bdc Mon Sep 17 00:00:00 2001 From: Swapnil Gusani Date: Sat, 29 Aug 2020 02:08:28 +0530 Subject: [PATCH 8/8] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f2edb939..08844314 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ (Readme not fully updated.) +Courtesy: [puckel/docker-airflow](https://github.com/puckel/docker-airflow) + This repository contains **Dockerfile** of [apache-airflow](https://github.com/apache/incubator-airflow) for [Docker](https://www.docker.com/)'s [automated build](https://registry.hub.docker.com/u/swapniel99/docker-airflow/) published to the public [Docker Hub Registry](https://registry.hub.docker.com/). ## Informations
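A usage sketch for the Makefile introduced in these patches (a non-authoritative example: the targets and the `FILE`/`SERVICE` variables are the ones defined in the Makefile, the CeleryExecutor compose file is the one shipped in this repository, and overriding the defaults from the command line relies on standard `make` behaviour):

    # Start the LocalExecutor stack with the Makefile defaults
    # (FILE=docker-compose-LocalExecutor.yml, SERVICE=scheduler)
    make up

    # Follow the scheduler's logs, or open a shell inside its container
    make logs
    make bash

    # Run the CeleryExecutor stack instead and follow the worker's logs
    make up   FILE=docker-compose-CeleryExecutor.yml
    make logs FILE=docker-compose-CeleryExecutor.yml SERVICE=worker

    # Stop and remove the containers
    make down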