# Skip to content
#
# fix opt chunk offload #316
#
# fix opt chunk offload
#
# fix opt chunk offload #316
#
# Workflow file for this run

# CI workflow that runs the PaddleNLP distribute test suite (scripts/distribute)
# inside a Docker container on the self-hosted "Distribute" runner group.
name: Distribute CI (A100)

# Triggers: pull requests targeting develop, a nightly schedule, and reuse
# from other workflows through workflow_call.
on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    # Every day at 00:01 (GitHub evaluates cron expressions in UTC).
    - cron: "1 0 * * *"
  workflow_call:
    inputs:
      # The steps are skipped when this is the string "false".
      run_downstream:
        required: true
        type: string
      # Docker image to run the tests in; an empty value selects the default image.
      image_name:
        required: true
        type: string

# One active run per pull request. Non-PR events (schedule, or workflow_call
# from a non-PR caller) have no PR number, so fall back to the ref instead of
# collapsing every such run into the same "<workflow>-" group.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  # PR metadata; these expand to empty strings for events without a pull_request payload.
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Prefix of the container name created by the "Run Container" step.
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-A100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  # NOTE(review): GITHUB_EVENT_NAME is also a default variable set by the runner;
  # this entry mirrors it and is kept for compatibility.
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  # Only populated when invoked through workflow_call; empty otherwise.
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash
jobs:
  # Delegates to the reusable check-bypass workflow; its `can-skip` output
  # gates the test job below.
  check-bypass:
    name: Check bypass
    uses: ./.github/workflows/check-bypass.yml
    with:
      workflow-name: 'distribute-a100'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

  # Starts a GPU container, checks out PaddleNLP (plus the PR, if any), runs
  # scripts/distribute/run_ci.sh, uploads the case logs and removes the container.
  distribute-a100-ci:
    name: distribute-a100-ci
    needs: check-bypass
    if: ${{ needs.check-bypass.outputs.can-skip != 'true' }}
    runs-on:
      group: Distribute
    steps:
      # Export IMAGE_NAME for later steps: the workflow_call input when given,
      # otherwise the default Paddle dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi

      # Start a detached container with the host GPU libraries/devices and the
      # runner workspace mounted at /workspace.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          # NOTE(review): CACHE_DIR is not referenced by this step's script nor
          # passed to the container; kept as-is.
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          FLAGS_dataloader_use_file_descriptor: "False"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          # Export the name first so the cleanup step can find the container
          # even if a later command in this step fails.
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            # CUDA_SO / DEVICES / SMI hold several "-v src:dst" arguments each and
            # are intentionally left unquoted below so they word-split.
            export CUDA_SO="$(\ls -d /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls -d /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
            export DEVICES="$(\ls -d /dev/nvidia* | xargs -I{} echo "-v {}:{}") $(\ls /dev/nvidia-caps/* | xargs -I{} echo "-v {}:{}")"
            export SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
            docker run -d -t --name "${container_name}" ${CUDA_SO} ${DEVICES} ${SMI} --runtime=nvidia --shm-size=32G \
              --network host -v /dev/shm:/dev/shm \
              -v "$work_dir/../../..:$work_dir/../../.." \
              -v "$work_dir:/workspace" \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e FLAGS_dataloader_use_file_descriptor \
              -e python_version \
              -w /workspace "$IMAGE_NAME"
          fi

      # Fetch the PaddleNLP source tarball into /workspace and, for PR runs,
      # check out the PR head and merge the base branch into it.
      - name: Download Code
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping.."
          else
            # The script is single-quoted so that variables expand inside the
            # container (from the -e flags), not on the runner.
            docker exec -t "$container_name" /bin/bash -c '
              rm -rf * .[^.]*
              echo "Downloading PaddleNLP.tar"
              wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
              echo "Extracting PaddleNLP.tar"
              tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
              source $work_dir/../../../proxy
              cd PaddleNLP
              git config --global user.name "PaddleCI"
              git config --global user.email "paddle_ci@example.com"
              git pull
              git submodule update --init --recursive --force
              if [ -n "${PR_ID}" ]; then
                git fetch origin pull/${PR_ID}/head
                git checkout -b PR_${PR_ID} FETCH_HEAD
                git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
                git fetch upstream ${BRANCH}
                # NOTE(review): this merges the local ${BRANCH}, not the ref just fetched from upstream - confirm intent.
                git merge ${BRANCH} --no-edit
                git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
              else
                echo "Not in a pull_request event. Skipping PR-specific operations."
              fi
              git log --pretty=oneline -10
            '
          fi

      # Run the distribute CI script inside the container with an 80 minute cap.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t "$container_name" /bin/bash -c '
              ldconfig
              ln -sf $(which python${python_version}) /usr/bin/python
              pip config set global.cache-dir "/home/.cache/pip"
              source $work_dir/../../../proxy
              # Fail fast from here on; the setup commands above are not covered.
              set -e
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi

      # Upload every file in /workspace/case_logs to BOS and print its URL.
      # The ${{ ... }} expressions are substituted by GitHub before the script
      # runs; the ${VAR} references are resolved inside the container.
      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/..
          bos_file: ${{ github.workspace }}/../bos_retry/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t "$container_name" /bin/bash -c '
              unset http_proxy && unset https_proxy
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_retry.tar.gz https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
                # -p: the directory may already exist even though BosClient.py is missing.
                mkdir -p ${{ env.home_path }}/bos_retry
                tar xf ${{ env.home_path }}/bos_retry.tar.gz -C ${{ env.home_path }}/bos_retry
              fi
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi
              # Earlier steps may have failed before any log was written.
              if [ ! -d /workspace/case_logs ]; then
                echo "No case logs found in /workspace/case_logs. Nothing to upload."
                exit 0
              fi
              cd /workspace/case_logs
              for FILE in /workspace/case_logs/*; do
                # An empty directory leaves the glob unexpanded; skip that literal pattern.
                [ -e "$FILE" ] || continue
                file=$(basename "$FILE")
                python ${{ env.bos_file }} "$file" paddle-github-action/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs/$file"
              done
            '
          fi

      # Always runs. Empties the container working directory (/workspace, the
      # bind-mounted runner workspace) and then removes the container.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          # container_name is exported by "Run Container"; if that step never
          # ran there is nothing to clean up.
          if [[ -z "${container_name:-}" ]]; then
            echo "No container was created. Skipping cleanup."
            exit 0
          fi
          # Best-effort: the shell runs with -e, so without "|| true" a missing
          # or stopped container would abort the step before docker rm.
          docker exec -t "$container_name" /bin/bash -c 'rm -rf * .[^.]*' || true
          docker rm -f "$container_name" 2>/dev/null || true