# ML Training Pipeline #9
# NOTE: GitHub's file viewer flagged this file as containing hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears. Review it in an editor that reveals hidden Unicode characters.
# ML Training Pipeline for Rental ML System
# Automated model training, validation, and deployment
#
# Triggers:
#   - schedule:          weekly run; no inputs are supplied, so every
#                        `github.event.inputs.*` lookup is empty.
#   - workflow_dispatch: manual run using the inputs declared below.
name: ML Training Pipeline

on:
  schedule:
    # Run model training weekly on Sundays at 2 AM UTC
    - cron: "0 2 * * 0"
  workflow_dispatch:
    inputs:
      training_type:
        description: "Type of training to run"
        required: true
        default: incremental
        type: choice
        options: [full, incremental, experimental]
      # NOTE(review): no job visible in this workflow reads `model_name`;
      # the training matrices list their models explicitly. Confirm whether
      # this input is meant to filter them.
      model_name:
        description: "Specific model to train"
        required: false
        default: all
        type: choice
        options:
          - all
          - collaborative_filter
          - content_recommender
          - hybrid_recommender
          - search_ranker
      # Under `github.event.inputs` this value is exposed as a string, so
      # expressions should compare it with 'true' rather than negate it.
      gpu_enabled:
        description: "Enable GPU training"
        required: false
        default: false
        type: boolean

# Workflow-wide environment, inherited by every job and step.
env:
  PYTHON_VERSION: "3.11"
  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
  WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

jobs:
# ================================
# Data Validation and Preparation
# ================================
  # Validates the training data and publishes the result as job outputs.
  # The validation step exits non-zero when the data does not pass, which
  # fails this job and therefore skips every job that `needs` it.
  data-validation:
    name: Data Validation
    runs-on: ubuntu-latest
    # Connection strings are supplied as job-level environment variables
    # instead of being spliced into an `echo ... >> $GITHUB_ENV` script, so
    # quotes, `$` or backticks inside a secret cannot alter a shell command.
    env:
      DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
      REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
    outputs:
      data-quality-score: ${{ steps.validate.outputs.quality-score }}
      dataset-size: ${{ steps.validate.outputs.dataset-size }}
      validation-passed: ${{ steps.validate.outputs.passed }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/base.txt
          pip install -r requirements/ml.txt

      - name: Run data quality checks
        id: validate
        # The script is fed through a quoted heredoc so the shell performs no
        # expansion or unescaping on the Python source; the output file path
        # is read from the GITHUB_OUTPUT environment variable instead.
        run: |
          python - <<'PY'
          import os
          import sys

          sys.path.append('src')

          from infrastructure.ml.training.data_loader import DataQualityValidator
          from infrastructure.data.config import get_database_url

          validator = DataQualityValidator(get_database_url())

          # Run comprehensive data validation
          results = validator.validate_training_data()

          print(f'Data Quality Score: {results["quality_score"]}')
          print(f'Dataset Size: {results["dataset_size"]}')
          print(f'Validation Passed: {results["passed"]}')

          # Set outputs
          with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
              f.write(f'quality-score={results["quality_score"]}\n')
              f.write(f'dataset-size={results["dataset_size"]}\n')
              f.write(f'passed={str(results["passed"]).lower()}\n')

          # Exit with error if validation fails
          if not results['passed']:
              print('Data validation failed!')
              sys.exit(1)
          PY

      - name: Generate data report
        run: |
          python scripts/generate_data_report.py \
            --output-path data-validation-report.html \
            --format html

      # upload-artifact v3 has been retired by GitHub; v4 is the supported line.
      - name: Upload data validation report
        uses: actions/upload-artifact@v4
        with:
          name: data-validation-report
          path: data-validation-report.html
          retention-days: 30
# ================================
# Feature Engineering
# ================================
  # Builds the feature set and publishes it as the `feature-artifacts`
  # artifact. Runs only when data validation reported `passed=true`.
  feature-engineering:
    name: Feature Engineering
    runs-on: ubuntu-latest
    needs: [data-validation]
    if: needs.data-validation.outputs.validation-passed == 'true'
    # Job-level env replaces the former `echo ... >> $GITHUB_ENV` step so that
    # secret values and the dispatch input are never spliced into shell text.
    env:
      DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
      REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
      FEATURE_STORE_PATH: /tmp/feature_store
      # Scheduled runs carry no inputs, hence the 'incremental' fallback.
      TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/base.txt
          pip install -r requirements/ml.txt

      # actions/cache restores at the point this step runs and saves in the
      # post-job phase, so it has to precede the pipeline: placed after it,
      # the restore would happen too late to be used and could overwrite the
      # freshly written store with the previous run's contents.
      # NOTE(review): the key is the commit SHA, so repeated runs on the same
      # commit hit the existing entry and do not save a new one - confirm
      # that is acceptable for the weekly schedule.
      - name: Cache feature store
        uses: actions/cache@v4
        with:
          path: /tmp/feature_store
          key: feature-store-${{ github.sha }}
          restore-keys: |
            feature-store-

      - name: Run feature engineering pipeline
        run: |
          python -m src.application.ml_training.feature_engineering \
            --training-type "$TRAINING_TYPE" \
            --output-path /tmp/features \
            --validate-features

      - name: Upload feature artifacts
        uses: actions/upload-artifact@v4
        with:
          name: feature-artifacts
          path: /tmp/features/
          retention-days: 7
# ================================
# Model Training (CPU)
# ================================
  # Trains and evaluates one model per matrix entry on a hosted runner and
  # uploads the results as `model-<model>-cpu`.
  train-cpu:
    name: Train Models (CPU)
    runs-on: ubuntu-latest
    needs: [data-validation, feature-engineering]
    # `github.event.inputs.*` values are strings, so the unchecked box
    # arrives as 'false', which is truthy; negating it always yielded false
    # and this job never ran on manual dispatch. Compare with 'true' instead.
    # On scheduled runs the input is empty, so the CPU path is taken.
    if: ${{ github.event.inputs.gpu_enabled != 'true' }}
    strategy:
      matrix:
        model: [collaborative_filter, content_recommender, search_ranker]
    env:
      # MLFLOW_TRACKING_URI is already inherited from the workflow-level env,
      # so only the per-model experiment name needs to be defined here.
      MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-cpu
      # Scheduled runs carry no inputs, hence the 'incremental' fallback.
      TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/base.txt
          pip install -r requirements/ml.txt

      - name: Download feature artifacts
        uses: actions/download-artifact@v4
        with:
          name: feature-artifacts
          path: /tmp/features/

      - name: Train model
        run: |
          python -m src.infrastructure.ml.training.ml_trainer \
            --model-type ${{ matrix.model }} \
            --training-type "$TRAINING_TYPE" \
            --features-path /tmp/features \
            --output-path /tmp/models \
            --log-level INFO \
            --cpu-only

      - name: Evaluate model
        run: |
          python -m src.infrastructure.ml.training.model_evaluator \
            --model-path /tmp/models/${{ matrix.model }} \
            --test-data-path /tmp/features/test \
            --output-path /tmp/evaluation \
            --metrics-output evaluation-metrics-${{ matrix.model }}.json

      - name: Upload model artifacts
        uses: actions/upload-artifact@v4
        with:
          name: model-${{ matrix.model }}-cpu
          path: |
            /tmp/models/${{ matrix.model }}/
            /tmp/evaluation/
            evaluation-metrics-${{ matrix.model }}.json
          retention-days: 30
# ================================
# Model Training (GPU)
# ================================
  # Trains and evaluates one model per matrix entry on a self-hosted GPU
  # runner and uploads the results as `model-<model>-gpu`.
  train-gpu:
    name: Train Models (GPU)
    runs-on: [self-hosted, gpu]
    needs: [data-validation, feature-engineering]
    # `github.event.inputs.*` values are strings, so an unchecked box arrives
    # as the truthy string 'false' and this job ran on every manual dispatch.
    # Compare with 'true' so it runs only when GPU training was requested.
    if: ${{ github.event.inputs.gpu_enabled == 'true' }}
    strategy:
      matrix:
        model: [hybrid_recommender, content_recommender]
    env:
      # MLFLOW_TRACKING_URI is already inherited from the workflow-level env,
      # so only the per-model experiment name needs to be defined here.
      MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-gpu
      # Scheduled runs carry no inputs, hence the 'incremental' fallback.
      TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # NOTE(review): this reinstalls the CUDA toolkit on every run and
      # assumes an Ubuntu 20.04 x86_64 runner - confirm against the runner
      # image, or bake CUDA into the runner instead.
      - name: Install CUDA dependencies
        run: |
          # Install CUDA toolkit and cuDNN
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
          sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
          wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
          sudo dpkg -i cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
          sudo cp /var/cuda-repo-ubuntu2004-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
          sudo apt-get update
          sudo apt-get -y install cuda

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/base.txt
          pip install -r requirements/ml.txt
          # `tensorflow-gpu` is a retired package name, and `--index-url`
          # replaces PyPI for the whole command, so TensorFlow could never be
          # resolved from the PyTorch index. Install the two separately.
          pip install tensorflow
          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

      - name: Verify GPU availability
        run: |
          nvidia-smi
          python -c "import tensorflow as tf; print('GPU Available:', tf.config.list_physical_devices('GPU'))"
          python -c "import torch; print('CUDA Available:', torch.cuda.is_available())"

      - name: Download feature artifacts
        uses: actions/download-artifact@v4
        with:
          name: feature-artifacts
          path: /tmp/features/

      - name: Train model with GPU
        run: |
          python -m src.infrastructure.ml.training.ml_trainer \
            --model-type ${{ matrix.model }} \
            --training-type "$TRAINING_TYPE" \
            --features-path /tmp/features \
            --output-path /tmp/models \
            --log-level INFO \
            --gpu-enabled \
            --batch-size 128 \
            --epochs 100

      - name: Evaluate model
        run: |
          python -m src.infrastructure.ml.training.model_evaluator \
            --model-path /tmp/models/${{ matrix.model }} \
            --test-data-path /tmp/features/test \
            --output-path /tmp/evaluation \
            --metrics-output evaluation-metrics-${{ matrix.model }}.json \
            --gpu-enabled

      - name: Upload model artifacts
        uses: actions/upload-artifact@v4
        with:
          name: model-${{ matrix.model }}-gpu
          path: |
            /tmp/models/${{ matrix.model }}/
            /tmp/evaluation/
            evaluation-metrics-${{ matrix.model }}.json
          retention-days: 30
# ================================
# Model Validation and Comparison
# ================================
  # Compares the trained models, runs an A/B simulation and builds the model
  # registry, then uploads all three as `model-validation-results`.
  model-validation:
    name: Model Validation
    runs-on: ubuntu-latest
    needs: [train-cpu, train-gpu]
    # One of the two training jobs is normally skipped; `always()` keeps this
    # job from being skipped with it, and the result check requires that at
    # least one training path succeeded.
    if: always() && (needs.train-cpu.result == 'success' || needs.train-gpu.result == 'success')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements/base.txt
          pip install -r requirements/ml.txt
          pip install matplotlib seaborn plotly

      # Without a `name`, every artifact of the run is downloaded into its
      # own sub-directory: /tmp/artifacts/<artifact-name>/...
      - name: Download all model artifacts
        uses: actions/download-artifact@v4
        with:
          path: /tmp/artifacts/

      - name: Run model comparison
        run: |
          python scripts/compare_models.py \
            --models-path /tmp/artifacts \
            --output-path /tmp/model-comparison \
            --generate-report

      - name: A/B test simulation
        # /tmp/features is never populated on this runner; the feature set
        # arrives via the `feature-artifacts` download above, so the test
        # split is read from that artifact's directory.
        run: |
          python scripts/ab_test_simulation.py \
            --models-path /tmp/artifacts \
            --test-data-path /tmp/artifacts/feature-artifacts/test \
            --output-path /tmp/ab-test-results

      - name: Generate model registry
        run: |
          python -m src.application.ml_training.model_registry \
            --models-path /tmp/artifacts \
            --registry-path /tmp/model-registry \
            --update-production-candidates

      # The three paths share /tmp as their common root, so the artifact
      # contains model-comparison/, ab-test-results/ and model-registry/.
      - name: Upload validation results
        uses: actions/upload-artifact@v4
        with:
          name: model-validation-results
          path: |
            /tmp/model-comparison/
            /tmp/ab-test-results/
            /tmp/model-registry/
          retention-days: 30
# ================================
# Model Deployment to Staging
# ================================
  # Publishes the model registry to the staging cluster as a ConfigMap,
  # rolls the training deployment and smoke-tests the staging endpoints.
  deploy-staging:
    name: Deploy to Staging
    runs-on: ubuntu-latest
    needs: [model-validation]
    environment:
      name: ml-staging
      url: https://ml-staging.rental-ml.com
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Downloading the validation artifact by name extracts its contents
      # directly into the target path, which yields the
      # /tmp/artifacts/model-registry/ directory used below. An unnamed
      # download would nest it under /tmp/artifacts/model-validation-results/.
      - name: Download model artifacts
        uses: actions/download-artifact@v4
        with:
          name: model-validation-results
          path: /tmp/artifacts/

      - name: Configure kubectl
        env:
          # Passed through the environment so the secret is never spliced
          # into the script text.
          KUBECONFIG_B64: ${{ secrets.STAGING_KUBECONFIG }}
        run: |
          # Keep the credentials file private and outside the workspace.
          umask 077
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$RUNNER_TEMP/kubeconfig"
          # `export` only lasts for this step's shell; GITHUB_ENV makes the
          # variable visible to the kubectl calls in the following steps.
          echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV"

      - name: Deploy models to staging
        run: |
          # Create model ConfigMap
          kubectl create configmap ml-models-staging \
            --from-file=/tmp/artifacts/model-registry/ \
            --namespace=rental-ml-staging \
            --dry-run=client -o yaml | kubectl apply -f -
          # Update ML training deployment
          kubectl set image deployment/rental-ml-training \
            ml-training=ghcr.io/${{ github.repository }}-ml-training:latest \
            -n rental-ml-staging
          # Wait for rollout
          kubectl rollout status deployment/rental-ml-training -n rental-ml-staging --timeout=600s

      - name: Run staging validation
        run: |
          # Wait for services to be ready
          sleep 60
          # Test model endpoints
          curl -f https://ml-staging.rental-ml.com/api/v1/models/health
          curl -f https://ml-staging.rental-ml.com/api/v1/recommendations/test
# ================================
# Model Deployment to Production
# ================================
  # Blue-green rollout to production: deploy green, validate it, switch the
  # service selector, monitor, then clean up the previous deployment.
  # Runs for scheduled runs and for manual runs with training_type 'full'.
  deploy-production:
    name: Deploy to Production
    runs-on: ubuntu-latest
    needs: [deploy-staging]
    if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.training_type == 'full')
    environment:
      name: ml-production
      url: https://api.rental-ml.com
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Downloading the validation artifact by name extracts its contents
      # directly into the target path, which yields the
      # /tmp/artifacts/model-registry/ directory used below. An unnamed
      # download would nest it under /tmp/artifacts/model-validation-results/.
      - name: Download model artifacts
        uses: actions/download-artifact@v4
        with:
          name: model-validation-results
          path: /tmp/artifacts/

      - name: Configure kubectl
        env:
          # Passed through the environment so the secret is never spliced
          # into the script text.
          KUBECONFIG_B64: ${{ secrets.PRODUCTION_KUBECONFIG }}
        run: |
          # Keep the credentials file private and outside the workspace.
          umask 077
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$RUNNER_TEMP/kubeconfig"
          # `export` only lasts for this step's shell; GITHUB_ENV makes the
          # variable visible to the kubectl calls in the following steps.
          echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV"

      - name: Blue-Green model deployment
        run: |
          # Deploy new models to green environment
          kubectl create configmap ml-models-green \
            --from-file=/tmp/artifacts/model-registry/ \
            --namespace=rental-ml-prod \
            --dry-run=client -o yaml | kubectl apply -f -
          # Create green ML service
          kubectl apply -f k8s/production/ml-service-green.yaml
          # Wait for green deployment
          kubectl rollout status deployment/rental-ml-training-green -n rental-ml-prod --timeout=600s

      # NOTE(review): this job has no Python setup or dependency install, so
      # the script below runs on the runner's preinstalled interpreter -
      # confirm it only needs what that interpreter provides.
      - name: Validate green deployment
        run: |
          # Health check green deployment
          GREEN_IP=$(kubectl get service rental-ml-green-service -n rental-ml-prod -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
          # Fail with a clear message when the load balancer has no IP yet,
          # rather than probing a malformed URL.
          if [ -z "$GREEN_IP" ]; then
            echo "Green service has no load balancer IP assigned" >&2
            exit 1
          fi
          curl -f "http://$GREEN_IP:8000/api/v1/models/health"
          # Run model performance test
          python scripts/model_performance_test.py --endpoint "http://$GREEN_IP:8000"

      - name: Switch traffic to green
        run: |
          # Update production service to point to green
          kubectl patch service rental-ml-service -n rental-ml-prod \
            -p '{"spec":{"selector":{"version":"green"}}}'

      - name: Monitor deployment
        run: |
          # Monitor for 5 minutes
          sleep 300
          # Check error rates and performance
          python scripts/monitor_deployment.py \
            --endpoint https://api.rental-ml.com \
            --duration 300

      - name: Cleanup blue deployment
        run: |
          # Remove old blue deployment
          kubectl delete deployment rental-ml-training -n rental-ml-prod --ignore-not-found=true
          kubectl delete configmap ml-models-blue -n rental-ml-prod --ignore-not-found=true
          # Rename green to blue for next deployment
          # NOTE(review): Kubernetes object names are immutable, so this
          # patch is expected to be rejected and fail the job after traffic
          # has already switched. Renaming requires creating a new object;
          # the intended blue/green naming scheme needs a decision.
          kubectl patch deployment rental-ml-training-green -n rental-ml-prod \
            -p '{"metadata":{"name":"rental-ml-training"}}'
# ================================
# Post-Deployment Monitoring
# ================================
  # After a successful production deployment: creates the dashboard, sets up
  # alerts, publishes a deployment report and notifies the team on Slack.
  post-deployment-monitoring:
    name: Post-Deployment Monitoring
    runs-on: ubuntu-latest
    needs: [deploy-production]
    if: always() && needs.deploy-production.result == 'success'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Same interpreter setup as the other Python jobs in this workflow, so
      # the `pip install` below targets a known Python version.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Set up monitoring
        run: |
          # Install monitoring tools
          pip install prometheus-client grafana-api

      - name: Create monitoring dashboard
        env:
          # Secrets travel through the environment and are quoted at the use
          # site, so they are neither spliced into the script nor word-split.
          GRAFANA_URL: ${{ secrets.GRAFANA_URL }}
          GRAFANA_TOKEN: ${{ secrets.GRAFANA_TOKEN }}
        run: |
          python scripts/create_ml_dashboard.py \
            --grafana-url "$GRAFANA_URL" \
            --grafana-token "$GRAFANA_TOKEN" \
            --deployment-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

      - name: Set up alerts
        env:
          PROMETHEUS_URL: ${{ secrets.PROMETHEUS_URL }}
          ALERTMANAGER_URL: ${{ secrets.ALERTMANAGER_URL }}
        run: |
          python scripts/setup_ml_alerts.py \
            --prometheus-url "$PROMETHEUS_URL" \
            --alert-manager-url "$ALERTMANAGER_URL"

      # The report step reads /tmp/artifacts, which does not exist on this
      # fresh runner unless the run's artifacts are downloaded first.
      # NOTE(review): the directory layout the report script expects is not
      # visible here; this yields /tmp/artifacts/<artifact-name>/... - confirm.
      - name: Download run artifacts
        uses: actions/download-artifact@v4
        with:
          path: /tmp/artifacts/

      - name: Generate deployment report
        run: |
          python scripts/generate_deployment_report.py \
            --artifacts-path /tmp/artifacts \
            --output-path ml-deployment-report.html

      - name: Upload deployment report
        uses: actions/upload-artifact@v4
        with:
          name: ml-deployment-report
          path: ml-deployment-report.html
          retention-days: 90

      - name: Notify team
        uses: 8398a7/action-slack@v3
        with:
          status: success
          channel: '#ml-team'
          text: 'ML model training and deployment completed successfully'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}