---
# ML Training Pipeline for Rental ML System
# Automated model training, validation, and deployment
name: ML Training Pipeline

on:
  # Scheduled retraining: every Sunday at 02:00 UTC.
  schedule:
    - cron: '0 2 * * 0'

  # Manual runs from the Actions UI or API.
  workflow_dispatch:
    inputs:
      training_type:
        type: choice
        description: 'Type of training to run'
        required: true
        default: 'incremental'
        options:
          - full
          - incremental
          - experimental
      # NOTE(review): no job visible in this file reads `model_name`; the
      # training jobs use fixed matrices. Confirm whether it should filter them.
      model_name:
        type: choice
        description: 'Specific model to train'
        required: false
        default: 'all'
        options:
          - all
          - collaborative_filter
          - content_recommender
          - hybrid_recommender
          - search_ranker
      gpu_enabled:
        type: boolean
        description: 'Enable GPU training'
        required: false
        default: false

# Workflow-wide environment, visible to every job and step.
env:
  PYTHON_VERSION: '3.11'
  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
  WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
jobs:
# ================================
# Data Validation and Preparation
# ================================
data-validation:
  # Validates the training data and publishes the quality score, dataset size
  # and pass/fail flag as job outputs. The job itself fails when validation
  # does not pass.
  name: Data Validation
  runs-on: ubuntu-latest
  outputs:
    data-quality-score: ${{ steps.validate.outputs.quality-score }}
    dataset-size: ${{ steps.validate.outputs.dataset-size }}
    validation-passed: ${{ steps.validate.outputs.passed }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements/base.txt
        pip install -r requirements/ml.txt

    - name: Run data quality checks
      id: validate
      # Secrets are passed through `env:` instead of being interpolated into
      # an `echo ... >> $GITHUB_ENV` script, where quotes, `$` or newlines in
      # the secret value would be re-interpreted by the shell.
      env:
        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
      run: |
        # Quoted heredoc: the shell performs no expansion inside the script,
        # so the Python code needs no escaped quotes.
        python - <<'PY'
        import os
        import sys

        sys.path.append('src')

        from infrastructure.ml.training.data_loader import DataQualityValidator
        from infrastructure.data.config import get_database_url

        validator = DataQualityValidator(get_database_url())

        # Run comprehensive data validation
        results = validator.validate_training_data()
        print(f'Data Quality Score: {results["quality_score"]}')
        print(f'Dataset Size: {results["dataset_size"]}')
        print(f'Validation Passed: {results["passed"]}')

        # Publish step outputs before any exit so they are always recorded.
        with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
            f.write(f'quality-score={results["quality_score"]}\n')
            f.write(f'dataset-size={results["dataset_size"]}\n')
            f.write(f'passed={str(results["passed"]).lower()}\n')

        # Exit with error if validation fails
        if not results['passed']:
            print('Data validation failed!')
            sys.exit(1)
        PY

    - name: Generate data report
      env:
        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
      run: |
        python scripts/generate_data_report.py \
          --output-path data-validation-report.html \
          --format html

    - name: Upload data validation report
      # v3 of the artifact actions has been retired by GitHub; v4 is required.
      uses: actions/upload-artifact@v4
      with:
        name: data-validation-report
        path: data-validation-report.html
        retention-days: 30
# ================================
# Feature Engineering
# ================================
feature-engineering:
  # Builds the feature set under /tmp/features and publishes it as the
  # `feature-artifacts` artifact. Runs only when data validation reported
  # passed == 'true'.
  name: Feature Engineering
  runs-on: ubuntu-latest
  needs: [data-validation]
  if: needs.data-validation.outputs.validation-passed == 'true'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements/base.txt
        pip install -r requirements/ml.txt

    - name: Restore feature store cache
      # Must run BEFORE the pipeline: the cache action restores when the step
      # executes and saves in its post-job hook. Restoring after the pipeline
      # could overwrite freshly written files with an older cache.
      # The run id makes the primary key unique per run; an exact key hit
      # would otherwise suppress the save, and scheduled runs on an unchanged
      # commit would never refresh the cache.
      uses: actions/cache@v4
      with:
        path: /tmp/feature_store
        key: feature-store-${{ github.sha }}-${{ github.run_id }}
        restore-keys: |
          feature-store-${{ github.sha }}-
          feature-store-

    - name: Run feature engineering pipeline
      # Secrets and inputs are passed through `env:` rather than interpolated
      # into the script text.
      env:
        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
        FEATURE_STORE_PATH: /tmp/feature_store
        # Scheduled runs carry no inputs, so fall back to 'incremental'.
        TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
      run: |
        python -m src.application.ml_training.feature_engineering \
          --training-type "$TRAINING_TYPE" \
          --output-path /tmp/features \
          --validate-features

    - name: Upload feature artifacts
      # v3 of the artifact actions has been retired by GitHub. A v4 upload
      # can only be read by a v4 download, so consumers must use
      # actions/download-artifact@v4.
      uses: actions/upload-artifact@v4
      with:
        name: feature-artifacts
        path: /tmp/features/
        retention-days: 7
# ================================
# Model Training (CPU)
# ================================
train-cpu:
  # Trains and evaluates each matrix model on a GitHub-hosted runner, then
  # uploads the model, evaluation output and metrics file as
  # `model-<model>-cpu`.
  name: Train Models (CPU)
  runs-on: ubuntu-latest
  needs: [data-validation, feature-engineering]
  # `github.event.inputs.*` values are strings, so the boolean input arrives
  # as 'true' / 'false' and `!'false'` is false. Compare explicitly; scheduled
  # runs have no inputs and therefore take the CPU path.
  if: github.event.inputs.gpu_enabled != 'true'
  strategy:
    matrix:
      model: [collaborative_filter, content_recommender, search_ranker]
  env:
    # MLFLOW_TRACKING_URI is already provided by the workflow-level `env`.
    MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-cpu
    # Scheduled runs carry no inputs, so fall back to 'incremental'.
    TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements/base.txt
        pip install -r requirements/ml.txt

    - name: Download feature artifacts
      # v3 of the artifact actions has been retired by GitHub; v4 download
      # pairs with a v4 upload in the producing job.
      uses: actions/download-artifact@v4
      with:
        name: feature-artifacts
        path: /tmp/features/

    - name: Train model
      run: |
        python -m src.infrastructure.ml.training.ml_trainer \
          --model-type ${{ matrix.model }} \
          --training-type "$TRAINING_TYPE" \
          --features-path /tmp/features \
          --output-path /tmp/models \
          --log-level INFO \
          --cpu-only

    - name: Evaluate model
      run: |
        python -m src.infrastructure.ml.training.model_evaluator \
          --model-path /tmp/models/${{ matrix.model }} \
          --test-data-path /tmp/features/test \
          --output-path /tmp/evaluation \
          --metrics-output evaluation-metrics-${{ matrix.model }}.json

    - name: Upload model artifacts
      uses: actions/upload-artifact@v4
      with:
        # Unique per matrix entry, as v4 requires.
        name: model-${{ matrix.model }}-cpu
        path: |
          /tmp/models/${{ matrix.model }}/
          /tmp/evaluation/
          evaluation-metrics-${{ matrix.model }}.json
        retention-days: 30
# ================================
# Model Training (GPU)
# ================================
train-gpu:
  # Trains and evaluates each matrix model on a self-hosted GPU runner, then
  # uploads the results as `model-<model>-gpu`.
  name: Train Models (GPU)
  runs-on: [self-hosted, gpu]
  needs: [data-validation, feature-engineering]
  # `github.event.inputs.*` values are strings; the bare expression is truthy
  # for 'false' as well, which started this job when GPU was disabled.
  if: github.event.inputs.gpu_enabled == 'true'
  strategy:
    matrix:
      model: [hybrid_recommender, content_recommender]
  env:
    # MLFLOW_TRACKING_URI is already provided by the workflow-level `env`.
    MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-gpu
    TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install CUDA dependencies
      run: |
        # Install CUDA toolkit and cuDNN
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
        sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
        wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
        sudo dpkg -i cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
        sudo cp /var/cuda-repo-ubuntu2004-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
        sudo apt-get update
        sudo apt-get -y install cuda

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements/base.txt
        pip install -r requirements/ml.txt
        # `--index-url` replaces PyPI for the whole command, so TensorFlow
        # must be installed separately from the PyTorch CUDA wheels. The
        # `tensorflow-gpu` package has been retired; `tensorflow` is used.
        # NOTE(review): confirm the TensorFlow release resolved here supports
        # the CUDA 11.8 toolkit installed above.
        pip install tensorflow
        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

    - name: Verify GPU availability
      run: |
        nvidia-smi
        python -c "import tensorflow as tf; print('GPU Available:', tf.config.list_physical_devices('GPU'))"
        python -c "import torch; print('CUDA Available:', torch.cuda.is_available())"

    - name: Download feature artifacts
      # v3 of the artifact actions has been retired by GitHub; v4 download
      # pairs with a v4 upload in the producing job.
      uses: actions/download-artifact@v4
      with:
        name: feature-artifacts
        path: /tmp/features/

    - name: Train model with GPU
      run: |
        python -m src.infrastructure.ml.training.ml_trainer \
          --model-type ${{ matrix.model }} \
          --training-type "$TRAINING_TYPE" \
          --features-path /tmp/features \
          --output-path /tmp/models \
          --log-level INFO \
          --gpu-enabled \
          --batch-size 128 \
          --epochs 100

    - name: Evaluate model
      run: |
        python -m src.infrastructure.ml.training.model_evaluator \
          --model-path /tmp/models/${{ matrix.model }} \
          --test-data-path /tmp/features/test \
          --output-path /tmp/evaluation \
          --metrics-output evaluation-metrics-${{ matrix.model }}.json \
          --gpu-enabled

    - name: Upload model artifacts
      uses: actions/upload-artifact@v4
      with:
        # Unique per matrix entry, as v4 requires.
        name: model-${{ matrix.model }}-gpu
        path: |
          /tmp/models/${{ matrix.model }}/
          /tmp/evaluation/
          evaluation-metrics-${{ matrix.model }}.json
        retention-days: 30
# ================================
# Model Validation and Comparison
# ================================
model-validation:
  # Compares the trained models, runs an A/B simulation, builds the model
  # registry and uploads all three outputs as `model-validation-results`.
  name: Model Validation
  runs-on: ubuntu-latest
  needs: [train-cpu, train-gpu]
  # The two training jobs have opposite conditions, so one is always skipped;
  # always() keeps this job from being skipped with it.
  if: always() && (needs.train-cpu.result == 'success' || needs.train-gpu.result == 'success')
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements/base.txt
        pip install -r requirements/ml.txt
        pip install matplotlib seaborn plotly

    - name: Download all model artifacts
      # No `name` given: every artifact of the run is downloaded into its own
      # sub-directory, /tmp/artifacts/<artifact-name>/.
      # v3 of the artifact actions has been retired by GitHub; v4 download
      # pairs with v4 uploads in the producing jobs.
      uses: actions/download-artifact@v4
      with:
        path: /tmp/artifacts/

    - name: Run model comparison
      run: |
        python scripts/compare_models.py \
          --models-path /tmp/artifacts \
          --output-path /tmp/model-comparison \
          --generate-report

    - name: A/B test simulation
      run: |
        # /tmp/features is never created in this job; the feature set is
        # under the downloaded `feature-artifacts` directory.
        python scripts/ab_test_simulation.py \
          --models-path /tmp/artifacts \
          --test-data-path /tmp/artifacts/feature-artifacts/test \
          --output-path /tmp/ab-test-results

    - name: Generate model registry
      run: |
        python -m src.application.ml_training.model_registry \
          --models-path /tmp/artifacts \
          --registry-path /tmp/model-registry \
          --update-production-candidates

    - name: Upload validation results
      # With several paths the artifact root is their least common ancestor
      # (/tmp), so the artifact holds model-comparison/, ab-test-results/ and
      # model-registry/ at its top level.
      uses: actions/upload-artifact@v4
      with:
        name: model-validation-results
        path: |
          /tmp/model-comparison/
          /tmp/ab-test-results/
          /tmp/model-registry/
        retention-days: 30
# ================================
# Model Deployment to Staging
# ================================
deploy-staging:
  # Publishes the model registry as a ConfigMap in the staging namespace,
  # rolls the training deployment and smoke-tests the staging endpoints.
  name: Deploy to Staging
  runs-on: ubuntu-latest
  needs: [model-validation]
  # Without a status-check function an implicit success() is applied, and a
  # skipped job earlier in the dependency chain (one training job is always
  # skipped) makes it fail. Check the direct dependency explicitly.
  if: ${{ !cancelled() && needs.model-validation.result == 'success' }}
  environment:
    name: ml-staging
    url: https://ml-staging.rental-ml.com
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Download model artifacts
      # No `name` given: each artifact lands in /tmp/artifacts/<artifact-name>/.
      # v3 of the artifact actions has been retired by GitHub; v4 download
      # pairs with v4 uploads in the producing jobs.
      uses: actions/download-artifact@v4
      with:
        path: /tmp/artifacts/

    - name: Configure kubectl
      env:
        KUBECONFIG_B64: ${{ secrets.STAGING_KUBECONFIG }}
      run: |
        # `export` only affects this step's shell. Write the path to
        # GITHUB_ENV so later steps see KUBECONFIG. The file is kept outside
        # the workspace and readable only by the runner user.
        kubeconfig_path="$RUNNER_TEMP/kubeconfig"
        (umask 077 && printf '%s' "$KUBECONFIG_B64" | base64 -d > "$kubeconfig_path")
        echo "KUBECONFIG=$kubeconfig_path" >> "$GITHUB_ENV"

    - name: Deploy models to staging
      run: |
        # Create model ConfigMap. The registry is inside the
        # `model-validation-results` artifact directory.
        kubectl create configmap ml-models-staging \
          --from-file=/tmp/artifacts/model-validation-results/model-registry/ \
          --namespace=rental-ml-staging \
          --dry-run=client -o yaml | kubectl apply -f -
        # Update ML training deployment
        kubectl set image deployment/rental-ml-training \
          ml-training=ghcr.io/${{ github.repository }}-ml-training:latest \
          -n rental-ml-staging
        # Wait for rollout
        kubectl rollout status deployment/rental-ml-training -n rental-ml-staging --timeout=600s

    - name: Run staging validation
      run: |
        # Wait for services to be ready
        sleep 60
        # Test model endpoints
        curl -f https://ml-staging.rental-ml.com/api/v1/models/health
        curl -f https://ml-staging.rental-ml.com/api/v1/recommendations/test
# ================================
# Model Deployment to Production
# ================================
deploy-production:
  # Deploys the model registry to a "green" deployment, validates it,
  # switches the production service selector to it and removes the previous
  # deployment. Runs for scheduled runs and for manual 'full' runs only.
  name: Deploy to Production
  runs-on: ubuntu-latest
  needs: [deploy-staging]
  # The explicit status check replaces the implicit success(), which fails
  # when a job earlier in the dependency chain was skipped (one training job
  # always is).
  if: >-
    ${{
      !cancelled() &&
      needs.deploy-staging.result == 'success' &&
      (github.event_name == 'schedule' ||
       (github.event_name == 'workflow_dispatch' &&
        github.event.inputs.training_type == 'full'))
    }}
  environment:
    name: ml-production
    url: https://api.rental-ml.com
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Download model artifacts
      # No `name` given: each artifact lands in /tmp/artifacts/<artifact-name>/.
      # v3 of the artifact actions has been retired by GitHub; v4 download
      # pairs with v4 uploads in the producing jobs.
      uses: actions/download-artifact@v4
      with:
        path: /tmp/artifacts/

    - name: Configure kubectl
      env:
        KUBECONFIG_B64: ${{ secrets.PRODUCTION_KUBECONFIG }}
      run: |
        # `export` only affects this step's shell. Write the path to
        # GITHUB_ENV so later steps see KUBECONFIG. The file is kept outside
        # the workspace and readable only by the runner user.
        kubeconfig_path="$RUNNER_TEMP/kubeconfig"
        (umask 077 && printf '%s' "$KUBECONFIG_B64" | base64 -d > "$kubeconfig_path")
        echo "KUBECONFIG=$kubeconfig_path" >> "$GITHUB_ENV"

    - name: Blue-Green model deployment
      run: |
        # Deploy new models to green environment. The registry is inside the
        # `model-validation-results` artifact directory.
        kubectl create configmap ml-models-green \
          --from-file=/tmp/artifacts/model-validation-results/model-registry/ \
          --namespace=rental-ml-prod \
          --dry-run=client -o yaml | kubectl apply -f -
        # Create green ML service
        kubectl apply -f k8s/production/ml-service-green.yaml
        # Wait for green deployment
        kubectl rollout status deployment/rental-ml-training-green -n rental-ml-prod --timeout=600s

    - name: Validate green deployment
      run: |
        # Health check green deployment
        GREEN_IP=$(kubectl get service rental-ml-green-service -n rental-ml-prod -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
        # An unassigned address would turn the URLs below into "http://:8000".
        if [ -z "$GREEN_IP" ]; then
          echo "Green service has no load balancer IP yet" >&2
          exit 1
        fi
        curl -f "http://$GREEN_IP:8000/api/v1/models/health"
        # Run model performance test
        python scripts/model_performance_test.py --endpoint "http://$GREEN_IP:8000"

    - name: Switch traffic to green
      run: |
        # Update production service to point to green
        kubectl patch service rental-ml-service -n rental-ml-prod \
          -p '{"spec":{"selector":{"version":"green"}}}'

    - name: Monitor deployment
      run: |
        # Wait 5 minutes before checking
        sleep 300
        # Check error rates and performance
        python scripts/monitor_deployment.py \
          --endpoint https://api.rental-ml.com \
          --duration 300

    - name: Cleanup blue deployment
      run: |
        # Remove old blue deployment
        kubectl delete deployment rental-ml-training -n rental-ml-prod --ignore-not-found=true
        kubectl delete configmap ml-models-blue -n rental-ml-prod --ignore-not-found=true
        # The former "rename green to blue" patch was removed: metadata.name
        # is immutable, so the API server rejects the patch and the step
        # failed after traffic had already been switched.
        # NOTE(review): green stays the live deployment, so the next run
        # updates it in place; decide how the colours should alternate.
# ================================
# Post-Deployment Monitoring
# ================================
post-deployment-monitoring:
  # After a successful production deployment: creates the dashboard and
  # alerts, generates and uploads a deployment report, and notifies Slack.
  name: Post-Deployment Monitoring
  runs-on: ubuntu-latest
  needs: [deploy-production]
  if: always() && needs.deploy-production.result == 'success'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      # Matches the other Python jobs so `python` / `pip` use the pinned
      # interpreter rather than the runner's system one.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Set up monitoring
      run: |
        # Install monitoring tools
        pip install prometheus-client grafana-api

    - name: Create monitoring dashboard
      # Secrets go through `env:` and are quoted at use, so their content is
      # never spliced into the script text or word-split by the shell.
      env:
        GRAFANA_URL: ${{ secrets.GRAFANA_URL }}
        GRAFANA_TOKEN: ${{ secrets.GRAFANA_TOKEN }}
      run: |
        python scripts/create_ml_dashboard.py \
          --grafana-url "$GRAFANA_URL" \
          --grafana-token "$GRAFANA_TOKEN" \
          --deployment-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

    - name: Set up alerts
      env:
        PROMETHEUS_URL: ${{ secrets.PROMETHEUS_URL }}
        ALERTMANAGER_URL: ${{ secrets.ALERTMANAGER_URL }}
      run: |
        python scripts/setup_ml_alerts.py \
          --prometheus-url "$PROMETHEUS_URL" \
          --alert-manager-url "$ALERTMANAGER_URL"

    - name: Download run artifacts
      # The report step reads /tmp/artifacts, which does not exist on this
      # runner unless the run's artifacts are downloaded first. Requires the
      # producing jobs to upload with actions/upload-artifact@v4.
      uses: actions/download-artifact@v4
      with:
        path: /tmp/artifacts/

    - name: Generate deployment report
      run: |
        python scripts/generate_deployment_report.py \
          --artifacts-path /tmp/artifacts \
          --output-path ml-deployment-report.html

    - name: Upload deployment report
      # v3 of the artifact actions has been retired by GitHub; v4 is required.
      uses: actions/upload-artifact@v4
      with:
        name: ml-deployment-report
        path: ml-deployment-report.html
        retention-days: 90

    - name: Notify team
      uses: 8398a7/action-slack@v3
      with:
        status: success
        channel: '#ml-team'
        text: 'ML model training and deployment completed successfully'
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}