ML Training Pipeline #9

Workflow file for this run

.github/workflows/ml-training-pipeline.yml at 65e494a

	# ML Training Pipeline for Rental ML System
	# Automated model training, validation, and deployment

	name: ML Training Pipeline

	on:
	  schedule:
	    # Weekly retraining: Sundays, 02:00 UTC.
	    - cron: "0 2 * * 0"

	  # Manual runs; every input below is offered in the "Run workflow" form.
	  workflow_dispatch:
	    inputs:
	      training_type:
	        type: choice
	        description: "Type of training to run"
	        options: [full, incremental, experimental]
	        default: incremental
	        required: true
	      # NOTE(review): no job visible in this workflow reads model_name;
	      # confirm whether it is still needed.
	      model_name:
	        type: choice
	        description: "Specific model to train"
	        options:
	          - all
	          - collaborative_filter
	          - content_recommender
	          - hybrid_recommender
	          - search_ranker
	        default: all
	        required: false
	      # Selects between the CPU and GPU training jobs.
	      gpu_enabled:
	        type: boolean
	        description: "Enable GPU training"
	        default: false
	        required: false

	env:
	  # Interpreter version passed to every actions/setup-python step.
	  PYTHON_VERSION: "3.11"
	  # Tracking settings pulled from repository secrets; being workflow-level,
	  # they are visible to every job below.
	  WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
	  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}

	jobs:
	# ================================
	# Data Validation and Preparation
	# ================================
	data-validation:
	  name: Data Validation
	  runs-on: ubuntu-latest
	  # Values written by the `validate` step, re-exported for downstream jobs.
	  outputs:
	    data-quality-score: ${{ steps.validate.outputs.quality-score }}
	    dataset-size: ${{ steps.validate.outputs.dataset-size }}
	    validation-passed: ${{ steps.validate.outputs.passed }}

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: ${{ env.PYTHON_VERSION }}
	        cache: 'pip'
	        # The requirement files live under requirements/, so key the pip
	        # cache on the files that are actually installed below.
	        cache-dependency-path: requirements/*.txt

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install -r requirements/base.txt
	        pip install -r requirements/ml.txt

	    - name: Run data quality checks
	      id: validate
	      # Secrets are handed over as step-level environment variables instead
	      # of being interpolated into shell text and appended to $GITHUB_ENV:
	      # values containing quotes, `$` or newlines can no longer corrupt the
	      # script, and the install step above never sees them.
	      env:
	        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
	        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
	      # A quoted heredoc keeps the Python source free of shell escaping;
	      # the output file path is read from the environment explicitly.
	      run: |
	        python - <<'PY'
	        import os
	        import sys

	        sys.path.append('src')

	        from infrastructure.ml.training.data_loader import DataQualityValidator
	        from infrastructure.data.config import get_database_url

	        validator = DataQualityValidator(get_database_url())

	        # Run comprehensive data validation
	        results = validator.validate_training_data()

	        print(f'Data Quality Score: {results["quality_score"]}')
	        print(f'Dataset Size: {results["dataset_size"]}')
	        print(f'Validation Passed: {results["passed"]}')

	        # Publish step outputs (consumed through the job-level `outputs` map)
	        with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
	            f.write(f'quality-score={results["quality_score"]}\n')
	            f.write(f'dataset-size={results["dataset_size"]}\n')
	            f.write(f'passed={str(results["passed"]).lower()}\n')

	        # Fail the step (and therefore the job) when validation fails
	        if not results['passed']:
	            print('Data validation failed!')
	            sys.exit(1)
	        PY

	    - name: Generate data report
	      # Same connection settings the original exported for all later steps.
	      env:
	        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
	        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
	      run: |
	        python scripts/generate_data_report.py \
	          --output-path data-validation-report.html \
	          --format html

	    - name: Upload data validation report
	      uses: actions/upload-artifact@v3
	      with:
	        name: data-validation-report
	        path: data-validation-report.html
	        retention-days: 30

	# ================================
	# Feature Engineering
	# ================================
	feature-engineering:
	  name: Feature Engineering
	  runs-on: ubuntu-latest
	  needs: [data-validation]
	  # Only build features from data that passed validation.
	  if: needs.data-validation.outputs.validation-passed == 'true'
	  env:
	    # Same directory the cache step below restores and saves.
	    FEATURE_STORE_PATH: /tmp/feature_store

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: ${{ env.PYTHON_VERSION }}
	        cache: 'pip'
	        # Key the pip cache on the requirement files installed below.
	        cache-dependency-path: requirements/*.txt

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install -r requirements/base.txt
	        pip install -r requirements/ml.txt

	    # actions/cache restores when this step runs and saves in a post-job
	    # hook, so it has to come BEFORE the pipeline: placed after it, a
	    # restore-keys hit would unpack an older store over fresh output.
	    # Cache entries are immutable and an exact key hit skips the save, so
	    # the run id is appended; otherwise re-running the same commit (as the
	    # weekly schedule does) would never refresh the stored copy.
	    - name: Cache feature store
	      uses: actions/cache@v3
	      with:
	        path: /tmp/feature_store
	        key: feature-store-${{ github.sha }}-${{ github.run_id }}
	        restore-keys: |
	          feature-store-

	    - name: Run feature engineering pipeline
	      # Secrets and the dispatch input reach the script as environment
	      # variables rather than being pasted into the shell command line.
	      # training_type is empty on scheduled runs, hence the fallback.
	      env:
	        DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
	        REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
	        TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
	      run: |
	        python -m src.application.ml_training.feature_engineering \
	          --training-type "$TRAINING_TYPE" \
	          --output-path /tmp/features \
	          --validate-features

	    # NOTE(review): the v3 artifact actions are deprecated on github.com.
	    # Upgrade every upload-artifact/download-artifact step in this workflow
	    # to v4 in one change; v3 and v4 artifacts are not interchangeable.
	    - name: Upload feature artifacts
	      uses: actions/upload-artifact@v3
	      with:
	        name: feature-artifacts
	        path: /tmp/features/
	        retention-days: 7

	# ================================
	# Model Training (CPU)
	# ================================
	train-cpu:
	  name: Train Models (CPU)
	  runs-on: ubuntu-latest
	  needs: [data-validation, feature-engineering]
	  # github.event.inputs values are strings, so `!inputs.gpu_enabled` was
	  # false for the string 'false' and a manual run with GPU disabled skipped
	  # CPU training. Compare against 'true' instead; on scheduled runs the
	  # input is absent and the comparison still selects the CPU path.
	  if: ${{ github.event.inputs.gpu_enabled != 'true' }}
	  strategy:
	    matrix:
	      # NOTE(review): the model_name dispatch input is not consulted here;
	      # every listed model is trained on each run.
	      model: [collaborative_filter, content_recommender, search_ranker]
	  env:
	    # MLFLOW_TRACKING_URI already comes from the workflow-level env, so
	    # only the per-model experiment name needs to be set for this job.
	    MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-cpu

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: ${{ env.PYTHON_VERSION }}
	        cache: 'pip'
	        # Key the pip cache on the requirement files installed below.
	        cache-dependency-path: requirements/*.txt

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install -r requirements/base.txt
	        pip install -r requirements/ml.txt

	    - name: Download feature artifacts
	      uses: actions/download-artifact@v3
	      with:
	        name: feature-artifacts
	        path: /tmp/features/

	    - name: Train model
	      env:
	        # Empty on scheduled runs, hence the 'incremental' fallback.
	        TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
	      run: |
	        python -m src.infrastructure.ml.training.ml_trainer \
	          --model-type ${{ matrix.model }} \
	          --training-type "$TRAINING_TYPE" \
	          --features-path /tmp/features \
	          --output-path /tmp/models \
	          --log-level INFO \
	          --cpu-only

	    - name: Evaluate model
	      run: |
	        python -m src.infrastructure.ml.training.model_evaluator \
	          --model-path /tmp/models/${{ matrix.model }} \
	          --test-data-path /tmp/features/test \
	          --output-path /tmp/evaluation \
	          --metrics-output evaluation-metrics-${{ matrix.model }}.json

	    # One artifact per matrix entry; the name embeds the model so the
	    # parallel uploads do not collide.
	    - name: Upload model artifacts
	      uses: actions/upload-artifact@v3
	      with:
	        name: model-${{ matrix.model }}-cpu
	        path: |
	          /tmp/models/${{ matrix.model }}/
	          /tmp/evaluation/
	          evaluation-metrics-${{ matrix.model }}.json
	        retention-days: 30

	# ================================
	# Model Training (GPU)
	# ================================
	train-gpu:
	  name: Train Models (GPU)
	  runs-on: [self-hosted, gpu]
	  needs: [data-validation, feature-engineering]
	  # github.event.inputs values are strings: the bare expression was truthy
	  # even for 'false', so this job ran on every manual dispatch. Compare
	  # against 'true' explicitly (scheduled runs have no input -> skipped).
	  if: ${{ github.event.inputs.gpu_enabled == 'true' }}
	  strategy:
	    matrix:
	      model: [hybrid_recommender, content_recommender]
	  env:
	    # MLFLOW_TRACKING_URI already comes from the workflow-level env, so
	    # only the per-model experiment name needs to be set for this job.
	    MLFLOW_EXPERIMENT_NAME: rental-ml-${{ matrix.model }}-gpu

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: ${{ env.PYTHON_VERSION }}

	    - name: Install CUDA dependencies
	      run: |
	        # Install CUDA toolkit and cuDNN
	        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
	        sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
	        wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
	        sudo dpkg -i cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
	        sudo cp /var/cuda-repo-ubuntu2004-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
	        sudo apt-get update
	        sudo apt-get -y install cuda

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install -r requirements/base.txt
	        pip install -r requirements/ml.txt
	        # --index-url replaces PyPI for the whole command, so it must only
	        # cover the PyTorch wheels hosted on that index.
	        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
	        # `tensorflow-gpu` is a retired package name; GPU support ships in
	        # the regular `tensorflow` package.
	        # NOTE(review): pin a TensorFlow release built for the CUDA 11.8
	        # toolkit installed above if requirements/ml.txt does not already.
	        pip install tensorflow

	    - name: Verify GPU availability
	      run: |
	        nvidia-smi
	        python -c "import tensorflow as tf; print('GPU Available:', tf.config.list_physical_devices('GPU'))"
	        python -c "import torch; print('CUDA Available:', torch.cuda.is_available())"

	    - name: Download feature artifacts
	      uses: actions/download-artifact@v3
	      with:
	        name: feature-artifacts
	        path: /tmp/features/

	    - name: Train model with GPU
	      env:
	        # Always set here (GPU runs are manual), fallback kept for parity
	        # with the CPU job.
	        TRAINING_TYPE: ${{ github.event.inputs.training_type || 'incremental' }}
	      run: |
	        python -m src.infrastructure.ml.training.ml_trainer \
	          --model-type ${{ matrix.model }} \
	          --training-type "$TRAINING_TYPE" \
	          --features-path /tmp/features \
	          --output-path /tmp/models \
	          --log-level INFO \
	          --gpu-enabled \
	          --batch-size 128 \
	          --epochs 100

	    - name: Evaluate model
	      run: |
	        python -m src.infrastructure.ml.training.model_evaluator \
	          --model-path /tmp/models/${{ matrix.model }} \
	          --test-data-path /tmp/features/test \
	          --output-path /tmp/evaluation \
	          --metrics-output evaluation-metrics-${{ matrix.model }}.json \
	          --gpu-enabled

	    # One artifact per matrix entry; the name embeds the model so the
	    # parallel uploads do not collide.
	    - name: Upload model artifacts
	      uses: actions/upload-artifact@v3
	      with:
	        name: model-${{ matrix.model }}-gpu
	        path: |
	          /tmp/models/${{ matrix.model }}/
	          /tmp/evaluation/
	          evaluation-metrics-${{ matrix.model }}.json
	        retention-days: 30

	# ================================
	# Model Validation and Comparison
	# ================================
	model-validation:
	  name: Model Validation
	  runs-on: ubuntu-latest
	  needs: [train-cpu, train-gpu]
	  # Exactly one training job is expected to be skipped on any run, so use
	  # always() and require that at least one of them succeeded.
	  if: always() && (needs.train-cpu.result == 'success' || needs.train-gpu.result == 'success')

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: ${{ env.PYTHON_VERSION }}
	        cache: 'pip'
	        # Key the pip cache on the requirement files installed below.
	        cache-dependency-path: requirements/*.txt

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install -r requirements/base.txt
	        pip install -r requirements/ml.txt
	        pip install matplotlib seaborn plotly

	    # No `name:` given, so every artifact of the run is downloaded, each
	    # into its own /tmp/artifacts/<artifact-name>/ directory.
	    - name: Download all model artifacts
	      uses: actions/download-artifact@v3
	      with:
	        path: /tmp/artifacts/

	    - name: Run model comparison
	      run: |
	        python scripts/compare_models.py \
	          --models-path /tmp/artifacts \
	          --output-path /tmp/model-comparison \
	          --generate-report

	    - name: A/B test simulation
	      # This job never creates /tmp/features; the feature set arrives with
	      # the download above under the `feature-artifacts` directory.
	      run: |
	        python scripts/ab_test_simulation.py \
	          --models-path /tmp/artifacts \
	          --test-data-path /tmp/artifacts/feature-artifacts/test \
	          --output-path /tmp/ab-test-results

	    - name: Generate model registry
	      run: |
	        python -m src.application.ml_training.model_registry \
	          --models-path /tmp/artifacts \
	          --registry-path /tmp/model-registry \
	          --update-production-candidates

	    # The three paths share /tmp as their common root, so the artifact
	    # contains model-comparison/, ab-test-results/ and model-registry/.
	    - name: Upload validation results
	      uses: actions/upload-artifact@v3
	      with:
	        name: model-validation-results
	        path: |
	          /tmp/model-comparison/
	          /tmp/ab-test-results/
	          /tmp/model-registry/
	        retention-days: 30

	# ================================
	# Model Deployment to Staging
	# ================================
	deploy-staging:
	  name: Deploy to Staging
	  runs-on: ubuntu-latest
	  needs: [model-validation]
	  # Without an explicit status check the implicit success() also looks at
	  # indirect dependencies; one of train-cpu/train-gpu is always skipped,
	  # which would skip this job as well. Gate on the direct dependency.
	  if: ${{ !cancelled() && needs.model-validation.result == 'success' }}
	  environment:
	    name: ml-staging
	    url: https://ml-staging.rental-ml.com

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    # Downloading a single named artifact unpacks its contents directly
	    # into `path`, which yields /tmp/artifacts/model-registry/ as used
	    # below. Downloading everything would nest it one level deeper, under
	    # /tmp/artifacts/model-validation-results/.
	    - name: Download model artifacts
	      uses: actions/download-artifact@v3
	      with:
	        name: model-validation-results
	        path: /tmp/artifacts/

	    - name: Configure kubectl
	      env:
	        KUBECONFIG_B64: ${{ secrets.STAGING_KUBECONFIG }}
	      # `export` only lasts for this step's shell; later steps need the
	      # variable published through $GITHUB_ENV. The file is written outside
	      # the checkout with owner-only permissions.
	      run: |
	        kubeconfig_path="$RUNNER_TEMP/kubeconfig"
	        (umask 077; printf '%s' "$KUBECONFIG_B64" | base64 -d > "$kubeconfig_path")
	        echo "KUBECONFIG=$kubeconfig_path" >> "$GITHUB_ENV"

	    - name: Deploy models to staging
	      run: |
	        # Create or update the model ConfigMap
	        kubectl create configmap ml-models-staging \
	          --from-file=/tmp/artifacts/model-registry/ \
	          --namespace=rental-ml-staging \
	          --dry-run=client -o yaml | kubectl apply -f -

	        # Update ML training deployment
	        kubectl set image deployment/rental-ml-training \
	          ml-training=ghcr.io/${{ github.repository }}-ml-training:latest \
	          -n rental-ml-staging

	        # Wait for rollout
	        kubectl rollout status deployment/rental-ml-training -n rental-ml-staging --timeout=600s

	    - name: Run staging validation
	      run: |
	        # Wait for services to be ready
	        sleep 60

	        # Test model endpoints (-f makes curl fail on HTTP errors)
	        curl -f https://ml-staging.rental-ml.com/api/v1/models/health
	        curl -f https://ml-staging.rental-ml.com/api/v1/recommendations/test

	# ================================
	# Model Deployment to Production
	# ================================
	deploy-production:
	  name: Deploy to Production
	  runs-on: ubuntu-latest
	  needs: [deploy-staging]
	  # Production is reached only by scheduled runs or a manual 'full' run.
	  # The explicit status check replaces the implicit success(), which would
	  # skip this job because a training job further upstream is always
	  # skipped.
	  if: ${{ !cancelled() && needs.deploy-staging.result == 'success' && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.training_type == 'full')) }}
	  environment:
	    name: ml-production
	    url: https://api.rental-ml.com

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    # Downloading a single named artifact unpacks its contents directly
	    # into `path`, which yields /tmp/artifacts/model-registry/ as used
	    # below. Downloading everything would nest it one level deeper, under
	    # /tmp/artifacts/model-validation-results/.
	    - name: Download model artifacts
	      uses: actions/download-artifact@v3
	      with:
	        name: model-validation-results
	        path: /tmp/artifacts/

	    - name: Configure kubectl
	      env:
	        KUBECONFIG_B64: ${{ secrets.PRODUCTION_KUBECONFIG }}
	      # `export` only lasts for this step's shell; later steps need the
	      # variable published through $GITHUB_ENV. The file is written outside
	      # the checkout with owner-only permissions.
	      run: |
	        kubeconfig_path="$RUNNER_TEMP/kubeconfig"
	        (umask 077; printf '%s' "$KUBECONFIG_B64" | base64 -d > "$kubeconfig_path")
	        echo "KUBECONFIG=$kubeconfig_path" >> "$GITHUB_ENV"

	    - name: Blue-Green model deployment
	      run: |
	        # Deploy new models to green environment
	        kubectl create configmap ml-models-green \
	          --from-file=/tmp/artifacts/model-registry/ \
	          --namespace=rental-ml-prod \
	          --dry-run=client -o yaml | kubectl apply -f -

	        # Create green ML service
	        kubectl apply -f k8s/production/ml-service-green.yaml

	        # Wait for green deployment
	        kubectl rollout status deployment/rental-ml-training-green -n rental-ml-prod --timeout=600s

	    - name: Validate green deployment
	      run: |
	        # Health check green deployment
	        GREEN_IP=$(kubectl get service rental-ml-green-service -n rental-ml-prod -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
	        # Fail with a clear message when no address has been assigned,
	        # instead of probing a URL with an empty host.
	        if [ -z "$GREEN_IP" ]; then
	          echo "rental-ml-green-service has no load balancer IP" >&2
	          exit 1
	        fi
	        curl -f "http://$GREEN_IP:8000/api/v1/models/health"

	        # Run model performance test
	        python scripts/model_performance_test.py --endpoint "http://$GREEN_IP:8000"

	    - name: Switch traffic to green
	      run: |
	        # Update production service to point to green
	        kubectl patch service rental-ml-service -n rental-ml-prod \
	          -p '{"spec":{"selector":{"version":"green"}}}'

	    - name: Monitor deployment
	      run: |
	        # Monitor for 5 minutes
	        sleep 300

	        # Check error rates and performance
	        python scripts/monitor_deployment.py \
	          --endpoint https://api.rental-ml.com \
	          --duration 300

	    - name: Cleanup blue deployment
	      run: |
	        # Remove old blue deployment
	        kubectl delete deployment rental-ml-training -n rental-ml-prod --ignore-not-found=true
	        kubectl delete configmap ml-models-blue -n rental-ml-prod --ignore-not-found=true

	        # Rename green to blue for next deployment
	        # NOTE(review): metadata.name of an existing object cannot be
	        # changed with a patch, so this command is expected to be rejected
	        # and to fail the job after traffic has already switched. Decide
	        # on a naming scheme (e.g. re-create under the new name) and
	        # replace it.
	        kubectl patch deployment rental-ml-training-green -n rental-ml-prod \
	          -p '{"metadata":{"name":"rental-ml-training"}}'

	# ================================
	# Post-Deployment Monitoring
	# ================================
	post-deployment-monitoring:
	  name: Post-Deployment Monitoring
	  runs-on: ubuntu-latest
	  needs: [deploy-production]
	  # Runs only after a successful production deployment; always() keeps
	  # skipped jobs further upstream from skipping this one.
	  if: always() && needs.deploy-production.result == 'success'

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Set up monitoring
	      run: |
	        # Install monitoring tools
	        pip install prometheus-client grafana-api

	    - name: Create monitoring dashboard
	      # Secrets are passed as environment variables and quoted in the
	      # command, not interpolated into the script text.
	      env:
	        GRAFANA_URL: ${{ secrets.GRAFANA_URL }}
	        GRAFANA_TOKEN: ${{ secrets.GRAFANA_TOKEN }}
	      run: |
	        python scripts/create_ml_dashboard.py \
	          --grafana-url "$GRAFANA_URL" \
	          --grafana-token "$GRAFANA_TOKEN" \
	          --deployment-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

	    - name: Set up alerts
	      env:
	        PROMETHEUS_URL: ${{ secrets.PROMETHEUS_URL }}
	        ALERTMANAGER_URL: ${{ secrets.ALERTMANAGER_URL }}
	      run: |
	        python scripts/setup_ml_alerts.py \
	          --prometheus-url "$PROMETHEUS_URL" \
	          --alert-manager-url "$ALERTMANAGER_URL"

	    # Each job starts on a fresh runner, so the artifacts the report reads
	    # from /tmp/artifacts have to be fetched here first. Without `name:`
	    # every artifact is placed in /tmp/artifacts/<artifact-name>/.
	    # NOTE(review): confirm this layout matches what the report script
	    # expects.
	    - name: Download run artifacts
	      uses: actions/download-artifact@v3
	      with:
	        path: /tmp/artifacts/

	    - name: Generate deployment report
	      run: |
	        python scripts/generate_deployment_report.py \
	          --artifacts-path /tmp/artifacts \
	          --output-path ml-deployment-report.html

	    - name: Upload deployment report
	      uses: actions/upload-artifact@v3
	      with:
	        name: ml-deployment-report
	        path: ml-deployment-report.html
	        retention-days: 90

	    # Reached only when every step above succeeded, hence the fixed status.
	    - name: Notify team
	      uses: 8398a7/action-slack@v3
	      with:
	        status: success
	        channel: '#ml-team'
	        text: 'ML model training and deployment completed successfully'
	      env:
	        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

ML Training Pipeline #9

Workflow file

ML Training Pipeline #9

Uh oh!

Jobs

Run details

Workflow file for this run