# NOTE(review): the lines below are GitHub web-UI chrome captured along with
# this workflow file; kept as comments so the file remains valid YAML.
# v1.1.0 — run #36722 — "Workflow file for this run"
name: Docker Build, Scan, Test
# Triggers: manual, nightly, pushes to master/release branches, all PRs,
# and published releases.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # Run at midnight UTC every day
  push:
    branches:
      - master
      - "releases/**"
  pull_request:
    branches:
      - "**"
  release:
    types: [published]
concurrency:
  # Using `github.run_id` (unique val) instead of `github.ref` here
  # because we don't want to cancel this workflow on master only for PRs
  # as that makes reproducing issues easier
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true
env:
  DOCKER_REGISTRY: "acryldata"
  DOCKER_CACHE: "DEPOT"
  DEPOT_PROJECT_ID: "${{ vars.DEPOT_PROJECT_ID }}"
  DEPOT_TOKEN: "${{ secrets.DEPOT_TOKEN }}"
permissions:
  contents: read
  id-token: write
jobs:
  # Computes image tags, publish/login flags, change-detection outputs, and
  # runner selection consumed by every downstream job.
  setup:
    runs-on: depot-ubuntu-24.04-small
    outputs:
      # TODO: Many of the vars below should not be required anymore.
      tag: ${{ steps.tag.outputs.tag }}
      slim_tag: ${{ steps.tag.outputs.slim_tag }}
      full_tag: ${{ steps.tag.outputs.full_tag }}
      short_sha: ${{ steps.tag.outputs.short_sha }} # needed for auto-deploy
      unique_tag: ${{ steps.tag.outputs.unique_tag }}
      unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }}
      unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }}
      docker-login: ${{ steps.docker-login.outputs.docker-login }}
      publish: ${{ steps.publish.outputs.publish }}
      pr-publish: ${{ steps.pr-publish.outputs.publish }}
      python_release_version: ${{ steps.tag.outputs.python_release_version }}
      branch_name: ${{ steps.tag.outputs.branch_name }}
      repository_name: ${{ steps.tag.outputs.repository_name }}
      # *_change outputs are forced true for non-PR events so push/schedule
      # runs always build everything.
      frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' || github.event_name != 'pull_request' }}
      actions_change: ${{ steps.ci-optimize.outputs.actions-change == 'true' || github.event_name != 'pull_request' }}
      ingestion_change: ${{ steps.ci-optimize.outputs.ingestion-change == 'true' || github.event_name != 'pull_request' }}
      ingestion_base_change: ${{ steps.ci-optimize.outputs.ingestion-base-change == 'true' }}
      backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' || github.event_name != 'pull_request' }}
      frontend_only: ${{ steps.ci-optimize.outputs.frontend-only == 'true' }}
      ingestion_only: ${{ steps.ci-optimize.outputs.ingestion-only == 'true' }}
      backend_only: ${{ steps.ci-optimize.outputs.backend-only == 'true' }}
      kafka_setup_change: ${{ steps.ci-optimize.outputs.kafka-setup-change == 'true' }}
      mysql_setup_change: ${{ steps.ci-optimize.outputs.mysql-setup-change == 'true' }}
      postgres_setup_change: ${{ steps.ci-optimize.outputs.postgres-setup-change == 'true' }}
      elasticsearch_setup_change: ${{ steps.ci-optimize.outputs.elasticsearch-setup-change == 'true' }}
      smoke_test_change: ${{ steps.ci-optimize.outputs.smoke-test-change == 'true' }}
      integrations_service_change: "false"
      datahub_executor_change: "false"
      # NOTE(review): no step with id `run-publish-images` is defined in this
      # job, so this output appears to always be empty — confirm and either
      # add the step or drop the output.
      run_publish_images: ${{ steps.run-publish-images.outputs.run_publish_images }}
      build_runner_type: ${{ steps.set-runner.outputs.build_runner_type }}
      test_runner_type: ${{ steps.set-runner.outputs.test_runner_type }}
      test_runner_type_small: ${{ steps.set-runner.outputs.test_runner_type_small }}
      use_depot_cache: ${{ steps.set-runner.outputs.use_depot_cache }}
    steps:
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - name: Compute Tag
        id: tag
        run: |
          source .github/scripts/docker_helpers.sh
          {
            echo "short_sha=${SHORT_SHA}"
            echo "tag=$(get_tag)"
            echo "slim_tag=$(get_tag_slim)"
            echo "full_tag=$(get_tag_full)"
            echo "unique_tag=$(get_unique_tag)"
            echo "unique_slim_tag=$(get_unique_tag_slim)"
            echo "unique_full_tag=$(get_unique_tag_full)"
            echo "python_release_version=$(get_python_docker_release_v)"
            echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
            echo "repository_name=${GITHUB_REPOSITORY#*/}"
          } >> "$GITHUB_OUTPUT"
      - name: Check whether docker login is possible
        id: docker-login
        env:
          # Secrets are unavailable on fork PRs, so login must be optional.
          ENABLE_DOCKER_LOGIN: ${{ secrets.ACRYL_DOCKER_PASSWORD != '' }}
        run: |
          echo "Enable Docker Login: ${{ env.ENABLE_DOCKER_LOGIN }}"
          echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> "$GITHUB_OUTPUT"
      - name: Check whether publishing enabled
        id: publish
        env:
          ENABLE_PUBLISH: >-
            ${{
              (github.event_name != 'pull_request')
              && ( secrets.ACRYL_DOCKER_PASSWORD != '' )
            }}
        run: |
          echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
          echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
      - name: Check whether PR publishing enabled
        id: pr-publish
        env:
          # PRs only publish when labeled `publish` or `publish-docker`.
          ENABLE_PUBLISH: >-
            ${{
              (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'publish') || contains(github.event.pull_request.labels.*.name, 'publish-docker')))
              && ( secrets.ACRYL_DOCKER_PASSWORD != '' )
            }}
        run: |
          echo "Enable PR publish: ${{ env.ENABLE_PUBLISH }}"
          echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
      - uses: ./.github/actions/ci-optimization
        id: ci-optimize
      - name: Determine runner type
        id: set-runner
        # This needs to handle two scenarios:
        # 1. Running on a PR from a fork. There are some auth issues that prevent us from using depot in that case.
        #    So, Its easier to just use the regular github actions cache and build all images for each parallel job running smoke test.
        #    Note, concurrency is lower when using github runners, queue times can be longer, test time is longer due to fewer parallel jobs.
        # 2. Running on a PR from a branch in the datahub-project org and push/schedule events on master.
        #    Depot is used here for remote container builds in base_build and also for all runners. Depot runners support unlimited concurrency
        #    and hence short queue times and higher parallelism of smoke tests
        run: |
          if [[ "${{ env.DOCKER_CACHE }}" == "DEPOT" && "${{ env.DEPOT_PROJECT_ID }}" != "" ]]; then
            echo "build_runner_type=depot-ubuntu-24.04-4" >> "$GITHUB_OUTPUT"
            echo "test_runner_type=depot-ubuntu-24.04-4" >> "$GITHUB_OUTPUT"
            echo "test_runner_type_small=depot-ubuntu-24.04-small" >> "$GITHUB_OUTPUT"
            echo "use_depot_cache=true" >> "$GITHUB_OUTPUT"
          else
            echo "build_runner_type=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "test_runner_type=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "test_runner_type_small=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "use_depot_cache=false" >> "$GITHUB_OUTPUT"
            # publishing is currently only supported via depot
          fi
smoke_test_lint:
name: Lint on smoke tests
runs-on: ${{ needs.setup.outputs.test_runner_type_small }}
needs: setup
if: ${{ needs.setup.outputs.smoke_test_change == 'true' }}
steps:
- name: Check out the repo
uses: acryldata/sane-checkout-action@v3
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"
- uses: actions/cache@v4
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
- uses: actions/cache@v4
with:
path: |
~/.cache/yarn
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
- name: Run lint on smoke test
run: |
python ./.github/scripts/check_python_package.py
./gradlew :smoke-test:pythonLint
./gradlew :smoke-test:cypressLint
base_build:
name: Build all images
runs-on: ${{ needs.setup.outputs.build_runner_type }}
needs: setup
if: ${{ needs.setup.outputs.use_depot_cache == 'true' }} # On fork, smoke test job does the build since depot cache is not available
outputs:
build_id: ${{ steps.capture-build-id.outputs.build_id }}
matrix: ${{ steps.capture-build-id.outputs.matrix }}
steps:
- name: Set up JDK 17
uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: 17
- uses: actions/cache/restore@v3
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-uv-
- uses: actions/cache/restore@v3
with:
path: |
~/.cache/yarn
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
restore-keys: |
${{ runner.os }}-yarn-
- name: Set up Depot CLI
if: ${{ env.DOCKER_CACHE == 'DEPOT' }}
uses: depot/setup-action@v1
- name: Check out the repo
uses: acryldata/sane-checkout-action@v3
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"
- name: Login to DockerHub
uses: docker/login-action@v3
if: ${{ needs.setup.outputs.docker-login == 'true' }}
with:
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
- name: Build all Images (For Smoke tests)
if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }}
# If not publishing, just a subset of images required for smoke tests is sufficient.
run: |
./gradlew :docker:buildImagesQuickStartDebugConsumers -Ptag=${{ needs.setup.outputs.tag }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }}
- name: Build all Images (Publish)
if: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
run: |
./gradlew :docker:buildImagesAll -PmatrixBuild=true -Ptag=${{ needs.setup.outputs.tag }} -PshaTag=${{ needs.setup.outputs.short_sha }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }} -PdockerPush=true
- name: Capture build Id
id: capture-build-id
run: |
pip install jq
DEPOT_BUILD_ID=$(jq -r '.["depot.build"]?.buildID' ${{ github.workspace }}/build/build-metadata.json)
echo "build_id=${DEPOT_BUILD_ID}" >> "$GITHUB_OUTPUT"
echo "matrix=$(jq -c '{"target":.["depot.build"].targets}' ${{ github.workspace }}/build/build-metadata.json)" >> $GITHUB_OUTPUT
- uses: actions/cache/save@v4
if: ${{ github.ref == 'refs/heads/master' }}
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
- uses: actions/cache/save@v4
if: ${{ github.ref == 'refs/heads/master' }}
with:
path: |
~/.cache/yarn
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
scan_images:
permissions:
contents: read # for actions/checkout to fetch code
security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status
name: Scan images for vulnerabilities
runs-on: depot-ubuntu-24.04
needs: [setup, base_build]
if: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.base_build.outputs.matrix) }}
steps:
- name: Checkout # adding checkout step just to make trivy upload happy
uses: acryldata/sane-checkout-action@v3
- id: download_image
name: Download images from depot
if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
run: |
depot pull --project ${{ env.DEPOT_PROJECT_ID }} ${{ needs.base_build.outputs.build_id }} --target ${{ matrix.target}}
docker images
echo "docker_image=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep ${{ needs.setup.outputs.tag }} )" >> $GITHUB_OUTPUT
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@0.30.0
env:
TRIVY_OFFLINE_SCAN: true
TRIVY_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-db:2,ghcr.io/aquasecurity/trivy-db:2
TRIVY_JAVA_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-java-db:1,ghcr.io/aquasecurity/trivy-java-db:1
with:
image-ref: ${{ steps.download_image.outputs.docker_image }}
format: "template"
template: "@/contrib/sarif.tpl"
output: "trivy-results.sarif"
severity: "CRITICAL,HIGH"
ignore-unfixed: true
vuln-type: "os,library"
- name: Upload Trivy scan results to GitHub Security tab
uses: github/codeql-action/upload-sarif@v3
with:
sarif_file: "trivy-results.sarif"
smoke_test_matrix:
runs-on: ${{ needs.setup.outputs.test_runner_type_small }}
needs: setup
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }}
python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }}
steps:
- id: set-batch-count
# Tests are split simply to ensure the configured number of batches for parallelization. This may need some
# increase as a new tests added increase the duration where an additional parallel batch helps.
# python_batch_count is used to split pytests in the smoke-test (batches of actual test functions)
# cypress_batch_count is used to split the collection of cypress test specs into batches.
run: |
if [[ "${{ needs.setup.outputs.test_runner_type }}" == "ubuntu-latest" ]]; then
echo "cypress_batch_count=5" >> "$GITHUB_OUTPUT"
echo "python_batch_count=3" >> "$GITHUB_OUTPUT"
else
echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT"
echo "python_batch_count=6" >> "$GITHUB_OUTPUT"
fi
- id: set-matrix
# For m batches for python and n batches for cypress, we need a test matrix of python x m + cypress x n.
# while the github action matrix generation can handle these two parts individually, there isnt a way to use the
# two generated matrices for the same job. So, produce that matrix with scripting and use the include directive
# to add it to the test matrix.
run: |
python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }}
python_matrix='{"test_strategy":"pytests","batch":"0","batch_count":"'"$python_batch_count"'"}'
for ((i=1;i<python_batch_count;i++)); do
python_matrix="$python_matrix"',{"test_strategy":"pytests","batch_count":"'"$python_batch_count"'","batch":"'"$i"'"}'
done
cypress_batch_count=${{ steps.set-batch-count.outputs.cypress_batch_count }}
cypress_matrix='{"test_strategy":"cypress","batch":"0","batch_count":"'"$cypress_batch_count"'"}'
for ((i=1;i<cypress_batch_count;i++)); do
cypress_matrix="$cypress_matrix"',{"test_strategy":"cypress","batch_count":"'"$cypress_batch_count"'","batch":"'"$i"'"}'
done
includes=''
if [[ "${{ needs.setup.outputs.backend_change }}" == 'true' || "${{ needs.setup.outputs.smoke_test_change }}" == 'true' || "${{ needs.setup.outputs.publish }}" == 'true' ]]; then
includes="$python_matrix,$cypress_matrix"
elif [[ "${{ needs.setup.outputs.frontend_only }}" == 'true' ]]; then
includes="$cypress_matrix"
elif [[ "${{ needs.setup.outputs.ingestion_only }}" == 'true' ]]; then
includes="$python_matrix"
fi
echo "matrix={\"include\":[$includes] }" >> "$GITHUB_OUTPUT"
smoke_test:
name: Run Smoke Tests (${{ matrix.test_strategy }}, Batch ${{ matrix.batch }}/${{ matrix.batch_count }})
runs-on: ${{ needs.setup.outputs.test_runner_type }}
needs: [setup, smoke_test_matrix, base_build]
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }}
if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }}
steps:
- name: Free up disk space
if: ${{ !contains(needs.setup.outputs.test_runner_type, 'depot') }}
run: |
sudo apt-get remove 'dotnet-*' azure-cli || true
sudo rm -rf /usr/local/lib/android/ || true
sudo docker image prune -a -f || true
- uses: actions/cache/restore@v3
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-uv-
- uses: actions/cache/restore@v3
with:
path: |
~/.npm
~/.cache/Cypress
~/.cache/yarn
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
restore-keys: |
${{ runner.os }}-yarn-
- name: Check out the repo
uses: acryldata/sane-checkout-action@v3
- name: Set up Depot CLI
if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
uses: depot/setup-action@v1
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"
- uses: gradle/actions/setup-gradle@v4
if: ${{ needs.setup.outputs.use_depot_cache != 'true' }}
- name: Login to DockerHub
uses: docker/login-action@v3
if: ${{ needs.setup.outputs.docker-login == 'true' }}
with:
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
- name: build images
if: ${{ needs.setup.outputs.use_depot_cache != 'true' }}
run: |
./gradlew :docker:buildImagesQuickstartDebugConsumers -Ptag=${{ needs.setup.outputs.tag }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }}
docker images
env:
DOCKER_CACHE: GITHUB
- name: pull images from depot
if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
run: |
depot pull --project ${{ env.DEPOT_PROJECT_ID }} ${{ needs.base_build.outputs.build_id }}
docker images
- name: run quickstart
env:
DATAHUB_TELEMETRY_ENABLED: false
DATAHUB_VERSION: ${{ needs.setup.outputs.tag }}
DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_ACTIONS_IMAGE }}
ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor] acryl-datahub-actions"
ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
run: |
./smoke-test/run-quickstart.sh
- name: Disk Check
run: df -h . && docker images
- name: Disable ES Disk Threshold
run: |
curl -XPUT "http://localhost:9200/_cluster/settings" \
-H 'Content-Type: application/json' -d'{
"persistent": {
"cluster": {
"routing": {
"allocation.disk.threshold_enabled": false
}
}
}
}'
- name: Install dependencies
run: ./metadata-ingestion/scripts/install_deps.sh
- name: Build datahub cli
run: |
./gradlew :metadata-ingestion:install
- name: Smoke test
env:
RUN_QUICKSTART: false
DATAHUB_VERSION: ${{ needs.setup.outputs.tag }}
CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }}
CLEANUP_DATA: "false"
TEST_STRATEGY: ${{ matrix.test_strategy }}
BATCH_COUNT: ${{ matrix.batch_count }}
BATCH_NUMBER: ${{ matrix.batch }}
run: |
echo "$DATAHUB_VERSION"
./gradlew --stop
./smoke-test/smoke.sh
- name: Disk Check
run: df -h . && docker images
- name: store logs
if: failure()
run: |
docker ps -a
TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
source .github/scripts/docker_logs.sh
- name: Upload logs
uses: actions/upload-artifact@v4
if: failure()
with:
name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: "docker_logs/*.log"
retention-days: 5
- name: Upload screenshots
uses: actions/upload-artifact@v4
if: failure()
with:
name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
path: smoke-test/tests/cypress/cypress/screenshots/
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
path: |
**/build/reports/tests/test/**
**/build/test-results/test/**
**/junit.*.xml
!**/binary/**
- name: Upload test results to Codecov
if: ${{ !cancelled() }}
uses: codecov/test-results-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
- uses: actions/cache/save@v4
if: ${{ github.ref == 'refs/heads/master' }}
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
- uses: actions/cache/save@v4
if: ${{ github.ref == 'refs/heads/master' }}
with:
path: |
~/.cache/yarn
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
deploy_datahub_head:
name: Deploy to Datahub HEAD
runs-on: ubuntu-latest
needs: [setup, smoke_test_lint, smoke_test]
steps:
- uses: aws-actions/configure-aws-credentials@v4
if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
with:
aws-access-key-id: ${{ secrets.AWS_SQS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SQS_ACCESS_KEY }}
aws-region: us-west-2
- uses: isbang/sqs-action@v0.2.0
if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
with:
sqs-url: ${{ secrets.DATAHUB_HEAD_SYNC_QUEUE }}
message: '{ "command": "git-sync", "args" : {"repoName": "${{ needs.setup.outputs.repository_name }}", "repoOrg": "${{ github.repository_owner }}", "repoBranch": "${{ needs.setup.outputs.branch_name }}", "repoShaShort": "${{ needs.setup.outputs.short_sha }}" }}'