diff --git a/.gitignore b/.gitignore index ec01476..f545cc8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,9 +28,6 @@ __pycache__ # policy trust-policy.json -# data -data* - # logs logs/* *.log @@ -80,3 +77,6 @@ override.tf.json terraform.rc Footer +dashboard_files/ + +data/*.csv diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000..bc91fdd --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +python 3.11.1 diff --git a/Makefile b/Makefile index 5c56bb3..09a0ee3 100644 --- a/Makefile +++ b/Makefile @@ -2,15 +2,17 @@ # Setup containers to run Airflow docker-spin-up: - docker compose --env-file env up airflow-init && docker compose --env-file env up --build -d + docker compose up airflow-init && docker compose up --build -d perms: - sudo mkdir -p logs plugins temp dags tests migrations && sudo chmod -R u=rwx,g=rwx,o=rwx logs plugins temp dags tests migrations + sudo mkdir -p logs plugins temp dags tests migrations data visualization && sudo chmod -R u=rwx,g=rwx,o=rwx logs plugins temp dags tests migrations data visualization -up: perms docker-spin-up warehouse-migration +up: perms docker-spin-up down: - docker compose down + docker compose down --volumes --rmi all + +restart: down up sh: docker exec -ti webserver bash @@ -50,18 +52,6 @@ infra-down: infra-config: terraform -chdir=./terraform output -#################################################################################################################### -# Create tables in Warehouse - -db-migration: - @read -p "Enter migration name:" migration_name; docker exec webserver yoyo new ./migrations -m "$$migration_name" - -warehouse-migration: - docker exec webserver yoyo develop --no-config-file --database postgres://sdeuser:sdepassword1234@warehouse:5432/finance ./migrations - -warehouse-rollback: - docker exec webserver yoyo rollback --no-config-file --database postgres://sdeuser:sdepassword1234@warehouse:5432/finance ./migrations - #################################################################################################################### # Port forwarding to local machine diff --git a/README.md b/README.md index 199504e..640f089 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,90 @@ + + +* [Data engineering project template](#data-engineering-project-template) + * [Prerequisites](#prerequisites) + * [Run code](#run-code) + * [Codespaces](#codespaces) + * [Your machine](#your-machine) + * [Infrastructure](#infrastructure) + * [Using template](#using-template) + * [Writing pipelines](#writing-pipelines) + * [(Optional) Advanced cloud setup](#optional-advanced-cloud-setup) + * [Prerequisites:](#prerequisites-1) + * [Tear down infra](#tear-down-infra) + # Data engineering project template Detailed explanation can be found **[`in this post`](https://www.startdataengineering.com/post/data-engineering-projects-with-free-template/)** -## Prerequisites - -To use the template, please install the following. +## Prerequisites 1. [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 2. [Github account](https://github.com/) -3. [Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) -4. [AWS account](https://aws.amazon.com/) -5. [AWS CLI installed](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) and [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) -6. [Docker](https://docs.docker.com/engine/install/) with at least 4GB of RAM and [Docker Compose](https://docs.docker.com/compose/install/) v1.27.0 or later +3. 
[Docker](https://docs.docker.com/engine/install/) with at least 4GB of RAM and [Docker Compose](https://docs.docker.com/compose/install/) v1.27.0 or later + +## Run code + +### Codespaces + +Start a Codespace, run `make up`, wait until it's ready, and click on the link in the Ports tab to see the Airflow UI. + +![CodeSpace start](./assets/images/cs1.png) +![Codespace make up](./assets/images/cs2.png) +![Codespace open Airflow UI](./assets/images/cs3.png) + +**Note**: Make sure to turn off your Codespace when you are done; you only have a limited amount of free Codespace usage. + +### Your machine + +Clone the repo and run the `make up` command as shown here: + +```bash +git clone https://github.com/josephmachado/data_engineering_project_template.git +cd data_engineering_project_template +make up +make ci # run checks and tests +sleep 30 # wait for Airflow to start +``` +**Windows users**: please set up WSL and a local Ubuntu virtual machine following **[the instructions here](https://ubuntu.com/tutorials/install-ubuntu-on-wsl2-on-windows-10#1-overview)**. Install the above prerequisites in your Ubuntu terminal; if you have trouble installing Docker, follow **[the steps here](https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-on-ubuntu-22-04#step-1-installing-docker)** (only Step 1 is necessary). Please install the **make** command with `sudo apt install make -y` (if it's not already present). + +Go to [http://localhost:8080](http://localhost:8080) to see the Airflow UI. Username and password are both `airflow`. + +## Infrastructure + +This data engineering project template includes the following: + +1. **`Airflow`**: To schedule and orchestrate DAGs. +2. **`Postgres`**: To store Airflow's details (which you can see via the Airflow UI); it also has a schema to represent upstream databases. +3. **`DuckDB`**: To act as our warehouse. +4. **`Quarto with Plotly`**: To convert code in `markdown` format to HTML files that can be embedded in your app or served as is. +5. **`minio`**: To provide an S3-compatible, open-source storage system. + +For simplicity, services 1-4 above are installed and run in one container, defined [here](./containers/airflow/Dockerfile). -If you are using windows please setup WSL and a local Ubuntu Virtual machine following **[the instructions here](https://ubuntu.com/tutorials/install-ubuntu-on-wsl2-on-windows-10#1-overview)**. Install the above prerequisites on your ubuntu terminal, if you have trouble installing docker follow **[the steps here](https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-on-ubuntu-22-04#step-1-installing-docker)**. +![File structure](./assets/images/fs.png) +![DET](./assets/images/det.png) -### Setup infra +## Using template + +You can use this repo as a template to create your own; click on the `Use this template` button. + +![Template](./assets/images/template.png) + +## Writing pipelines + +We have a sample pipeline at [coincap_elt.py](./dags/coincap_elt.py) that you can use as a starter to create your own DAGs. The tests are available in the [./tests](./tests) folder. + +Once the `coincap_elt` DAG runs, we can see the dashboard HTML at [./visualization/dashboard.html](./visualization/dashboard.html), which will look like ![Dashboard](./assets/images/dash.png). + +## (Optional) Advanced cloud setup + +If you want to run your code on an EC2 instance with Terraform, follow the steps below. + +### Prerequisites: + +1. [Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +2. 
[AWS account](https://aws.amazon.com/) +3. [AWS CLI installed](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) and [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) You can create your GitHub repository based on this template by clicking on the `Use this template` button in the **[data_engineering_project_template](https://github.com/josephmachado/data_engineering_project_template)** repository. Clone your repository and replace content in the following files @@ -26,10 +95,6 @@ You can create your GitHub repository based on this template by clicking on the Run the following commands in your project directory. ```shell -# Local run & test -make up # start the docker containers on your computer & runs migrations under ./migrations -make ci # Runs auto formatting, lint checks, & all the test files under ./tests - # Create AWS services with Terraform make tf-init # Only needed on your first terraform run (or if you add new providers) make infra-up # type in yes after verifying the changes TF will make @@ -45,21 +110,6 @@ make cloud-metabase # this command will forward Metabase port from EC2 to your machine # use https://github.com/josephmachado/data_engineering_project_template/blob/main/env file to connect to the warehouse from metabase ``` -**Data infrastructure** -![DE Infra](/assets/images/infra.png) - -**Project structure** -![Project structure](/assets/images/proj_1.png) -![Project structure - GH actions](/assets/images/proj_2.png) - -Database migrations can be created as shown below. - -```shell -make db-migration # enter a description, e.g. create some schema -# make your changes to the newly created file under ./migrations -make warehouse-migration # to run the new migration on your warehouse -``` - For the [continuous delivery](https://github.com/josephmachado/data_engineering_project_template/blob/main/.github/workflows/cd.yml) to work, set up the infrastructure with Terraform, & define the following repository secrets. You can set up the repository secrets by going to `Settings > Secrets > Actions > New repository secret`. 1. **`SERVER_SSH_KEY`**: We can get this by running `terraform -chdir=./terraform output -raw private_key` in the project directory and pasting the entire content into a new Action secret called SERVER_SSH_KEY. @@ -73,4 +123,5 @@ After you are done, make sure to destroy your cloud infrastructure. 
```shell make down # Stop docker containers on your computer make infra-down # type in yes after verifying the changes TF will make -``` \ No newline at end of file +``` + diff --git a/assets/images/cs1.png b/assets/images/cs1.png new file mode 100644 index 0000000..f0fe636 Binary files /dev/null and b/assets/images/cs1.png differ diff --git a/assets/images/cs2.png b/assets/images/cs2.png new file mode 100644 index 0000000..e177fd2 Binary files /dev/null and b/assets/images/cs2.png differ diff --git a/assets/images/cs3.png b/assets/images/cs3.png new file mode 100644 index 0000000..e0deca5 Binary files /dev/null and b/assets/images/cs3.png differ diff --git a/assets/images/dash.png b/assets/images/dash.png new file mode 100644 index 0000000..203b664 Binary files /dev/null and b/assets/images/dash.png differ diff --git a/assets/images/det.png b/assets/images/det.png new file mode 100644 index 0000000..e17a7ba Binary files /dev/null and b/assets/images/det.png differ diff --git a/assets/images/fs.png b/assets/images/fs.png new file mode 100644 index 0000000..7873f71 Binary files /dev/null and b/assets/images/fs.png differ diff --git a/assets/images/template.png b/assets/images/template.png new file mode 100644 index 0000000..90285b2 Binary files /dev/null and b/assets/images/template.png differ diff --git a/containers/airflow/Dockerfile b/containers/airflow/Dockerfile index 53597b4..7558f38 100755 --- a/containers/airflow/Dockerfile +++ b/containers/airflow/Dockerfile @@ -1,3 +1,6 @@ -FROM apache/airflow:2.2.0 +FROM apache/airflow:2.9.2 COPY requirements.txt / RUN pip install --no-cache-dir -r /requirements.txt + +COPY quarto.sh / +RUN cd / && bash /quarto.sh diff --git a/containers/airflow/quarto.sh b/containers/airflow/quarto.sh new file mode 100644 index 0000000..702e10c --- /dev/null +++ b/containers/airflow/quarto.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +curl -L -o ~/quarto-1.5.43-linux-amd64.tar.gz https://github.com/quarto-dev/quarto-cli/releases/download/v1.5.43/quarto-1.5.43-linux-amd64.tar.gz +mkdir -p ~/opt +tar -C ~/opt -xvzf ~/quarto-1.5.43-linux-amd64.tar.gz + +mkdir -p ~/.local/bin +ln -s ~/opt/quarto-1.5.43/bin/quarto ~/.local/bin/quarto + +( echo ""; echo 'export PATH=$PATH:~/.local/bin' ; echo "" ) >> ~/.profile +source ~/.profile + diff --git a/containers/airflow/requirements.txt b/containers/airflow/requirements.txt index 96958af..30ae22f 100755 --- a/containers/airflow/requirements.txt +++ b/containers/airflow/requirements.txt @@ -1,9 +1,13 @@ -black==22.8.0 -flake8==5.0.4 -mypy==0.971 -isort==5.10.1 -moto[all]==4.0.6 -pytest==7.0.1 -pytest-mock==3.6.1 -apache-airflow-client==2.3.0 -yoyo-migrations==8.0.0 \ No newline at end of file +black==24.4.2 +flake8==7.0.0 +mypy==1.10.0 +isort==5.13.2 +moto[all]==5.0.9 +pytest==8.2.2 +pytest-mock==3.14.0 +apache-airflow-client==2.9.0 +yoyo-migrations==8.2.0 +duckdb==1.0.0 +plotly==5.22.0 +jupyter==1.0.0 +types-requests==2.32.0.20240602 diff --git a/dags/.gitignore b/dags/.gitignore deleted file mode 100755 index 86d0cb2..0000000 --- a/dags/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore \ No newline at end of file diff --git a/dags/coincap_elt.py b/dags/coincap_elt.py new file mode 100755 index 0000000..9dce84b --- /dev/null +++ b/dags/coincap_elt.py @@ -0,0 +1,43 @@ +import csv +import os +from datetime import datetime, timedelta + +import requests + +from airflow import DAG +from airflow.decorators import task +from airflow.operators.bash import BashOperator + +with DAG( 
+ 'coincap_elt', + description='A simple DAG to fetch data \ + from CoinCap Exchanges API and write to a file', + schedule_interval=timedelta(days=1), + start_date=datetime(2023, 1, 1), + catchup=False, +) as dag: + + url = "https://api.coincap.io/v2/exchanges" + file_path = f'{os.getenv("AIRFLOW_HOME")}/data/coincap_exchanges.csv' + + @task + def fetch_coincap_exchanges(url, file_path): + response = requests.get(url) + data = response.json() + exchanges = data['data'] + if exchanges: + keys = exchanges[0].keys() + with open(file_path, 'w') as f: + dict_writer = csv.DictWriter(f, fieldnames=keys) + dict_writer.writeheader() + dict_writer.writerows(exchanges) + + markdown_path = f'{os.getenv("AIRFLOW_HOME")}/visualization/' + q_cmd = ( + f'cd {markdown_path} && quarto render {markdown_path}/dashboard.qmd' + ) + gen_dashboard = BashOperator( + task_id="generate_dashboard", bash_command=q_cmd + ) + + fetch_coincap_exchanges(url, file_path) >> gen_dashboard diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100755 index 0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml index a3a8de0..869762c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,41 +1,3 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. -# -# WARNING: This configuration is for local development. Do not use it in a production deployment. -# -# This configuration supports basic configuration using environment variables or an .env file -# The following variables are supported: -# -# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. -# Default: apache/airflow:master-python3.8 -# AIRFLOW_UID - User ID in Airflow containers -# Default: 50000 -# AIRFLOW_GID - Group ID in Airflow containers -# Default: 50000 -# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. -# Default: airflow -# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. -# Default: airflow -# -# Feel free to modify this file to suit your needs. 
--- version: '3' x-airflow-common: &airflow-common @@ -50,14 +12,11 @@ x-airflow-common: AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' AIRFLOW_CONN_POSTGRES_DEFAULT: postgres://airflow:airflow@postgres:5432/airflow - WAREHOUSE_USER: ${POSTGRES_USER} - WAREHOUSE_PASSWORD: ${POSTGRES_PASSWORD} - WAREHOUSE_DB: ${POSTGRES_DB} - WAREHOUSE_HOST: ${POSTGRES_HOST} - WARREHOUSE_PORT: ${POSTGRES_PORT} volumes: - ./dags:/opt/airflow/dags + - ./data:/opt/airflow/data + - ./visualization:/opt/airflow/visualization - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins - ./tests:/opt/airflow/tests @@ -71,7 +30,7 @@ x-airflow-common: services: postgres: container_name: postgres - image: postgres:13 + image: postgres:16 environment: POSTGRES_USER: airflow POSTGRES_PASSWORD: airflow @@ -127,24 +86,3 @@ services: _AIRFLOW_WWW_USER_CREATE: 'true' _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} - - dashboard: - image: metabase/metabase - container_name: dashboard - ports: - - "3000:3000" - - warehouse: - image: postgres:13 - container_name: warehouse - environment: - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB} - healthcheck: - test: [ "CMD", "pg_isready", "-U", "${POSTGRES_USER}" ] - interval: 5s - retries: 5 - restart: always - ports: - - "5439:5432" diff --git a/env b/env deleted file mode 100644 index 1776e4f..0000000 --- a/env +++ /dev/null @@ -1,5 +0,0 @@ -POSTGRES_USER=sdeuser -POSTGRES_PASSWORD=sdepassword1234 -POSTGRES_DB=finance -POSTGRES_HOST=warehouse -POSTGRES_PORT=5439 diff --git a/migrations/20221023_01_JVZ9p-create-bitcoin-schema.py b/migrations/20221023_01_JVZ9p-create-bitcoin-schema.py deleted file mode 100755 index bed8b80..0000000 --- a/migrations/20221023_01_JVZ9p-create-bitcoin-schema.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -create bitcoin schema -""" - -from typing import Any - -from yoyo import step - -__depends__: Any = {} - -steps = [step("CREATE SCHEMA bitcoin", "DROP SCHEMA bitcoin")] diff --git a/tests/dags/test_dag_validity.py b/tests/dags/test_dag_validity.py index 6e553a1..d4f7ef8 100755 --- a/tests/dags/test_dag_validity.py +++ b/tests/dags/test_dag_validity.py @@ -5,4 +5,4 @@ def test_no_import_errors(): dag_bag = DagBag() assert len(dag_bag.import_errors) == 0, "No Import Failures" - assert dag_bag.size() == 0 + assert dag_bag.size() == 1 diff --git a/visualization/dashboard.html b/visualization/dashboard.html new file mode 100755 index 0000000..0827b73 --- /dev/null +++ b/visualization/dashboard.html @@ -0,0 +1,575 @@ +[575 lines of generated Quarto HTML omitted: a standalone dashboard page titled "CoinCap Exchange Dashboard" with a "Coincap Exchange data analysis" panel]
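The `tests/dags/test_dag_validity.py` change above tightens the DagBag assertion from zero DAGs to exactly one. If you also want to pin down the shape of the new pipeline, a companion test could assert the `coincap_elt` task wiring. This is a hypothetical sketch (not part of this diff) that assumes the DAG is discoverable through `DagBag()` exactly as in the existing test:

```python
from airflow.models import DagBag


def test_coincap_elt_structure():
    # Load DAGs the same way test_no_import_errors does
    dag_bag = DagBag()
    dag = dag_bag.get_dag("coincap_elt")
    assert dag is not None, "coincap_elt DAG should be discoverable"

    # The pipeline has exactly two tasks: fetch the API data, then render the dashboard
    assert {t.task_id for t in dag.tasks} == {
        "fetch_coincap_exchanges",
        "generate_dashboard",
    }

    # The fetch task must run before the Quarto render step
    fetch = dag.get_task("fetch_coincap_exchanges")
    assert "generate_dashboard" in {t.task_id for t in fetch.downstream_list}
```

Dropped into `tests/dags/`, this should be picked up by `make ci` alongside the existing validity test.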
diff --git a/visualization/dashboard.qmd b/visualization/dashboard.qmd new file mode 100644 index 0000000..ab51cec --- /dev/null +++ b/visualization/dashboard.qmd @@ -0,0 +1,24 @@ +--- +title: "CoinCap Exchange Dashboard" +author: "StartDataEngineering" +format: dashboard +--- + +## Row {height=70%} + +```{python} +#| title: Coincap Exchange data analysis + +import pandas as pd +import plotly.express as px +import os +# Load the CSV file +file_path = f'{os.getenv("AIRFLOW_HOME")}/data/coincap_exchanges.csv' +import duckdb + +clean_data = duckdb.sql(f"select name, volumeUsd from '{file_path}' order by 2 desc limit 10").df() +# Plot the top 10 exchanges' volumeUSD +fig = px.bar(clean_data, x='name', y='volumeUsd', title='Top 10 Exchanges by VolumeUSD') +fig.show() + ```
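Because `fetch_coincap_exchanges` is a TaskFlow-decorated function, the plain Python callable behind it is reachable via its `.function` attribute, so the CSV-writing logic can be unit-tested without hitting the CoinCap API or running a scheduler. A minimal sketch using the pinned `pytest-mock` (hypothetical, not part of this diff; it assumes the `dags` folder is importable from the test runner's working directory):

```python
import csv

from dags.coincap_elt import fetch_coincap_exchanges


def test_fetch_writes_csv(mocker, tmp_path):
    # Stub the CoinCap API response with a single exchange record
    fake_payload = {
        "data": [{"exchangeId": "binance", "name": "Binance", "volumeUsd": "123.45"}]
    }
    mock_resp = mocker.Mock()
    mock_resp.json.return_value = fake_payload
    mocker.patch("dags.coincap_elt.requests.get", return_value=mock_resp)

    out_file = tmp_path / "coincap_exchanges.csv"
    # Call the plain function behind the @task decorator
    fetch_coincap_exchanges.function(
        "https://api.coincap.io/v2/exchanges", str(out_file)
    )

    with open(out_file) as f:
        rows = list(csv.DictReader(f))
    assert rows[0]["name"] == "Binance"
    assert rows[0]["volumeUsd"] == "123.45"
```

Patching `requests.get` at the module path where it is used keeps the test hermetic, and the temporary output file comes from pytest's built-in `tmp_path` fixture.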