fix opt chunk offload #316
Workflow file for this run
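# Distributed-training CI on A100 runners for PaddleNLP.
# Triggered by pull requests against develop, a nightly schedule, and
# workflow_call from other pipelines (which pass run_downstream and image_name).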
name: Distribute CI (A100)
on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    - cron: "1 0 * * *"
  workflow_call:
    inputs:
      run_downstream:
        required: true
        type: string
      image_name:
        required: true
        type: string
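# Cancel any in-progress run for the same pull request when a new one starts.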
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
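# Shared environment; most of these values are forwarded into the CI container.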
env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-A100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}
defaults:
  run:
    shell: bash
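# Two jobs: a reusable bypass check, then the A100 CI job gated on its result.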
jobs:
  check-bypass:
    name: Check bypass
    uses: ./.github/workflows/check-bypass.yml
    with:
      workflow-name: 'distribute-a100'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
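  # Main A100 job; runs only when check-bypass reports the run cannot be skipped.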
  distribute-a100-ci:
    name: distribute-a100-ci
    needs: check-bypass
    if: ${{ needs.check-bypass.outputs.can-skip != 'true' }}
    runs-on:
      group: Distribute
    steps:
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi
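      # Start a long-lived CI container with the host's GPU libraries and devices
      # mounted, plus pip cache and test-data volumes (FleetX, GPT, Llama).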
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          FLAGS_dataloader_use_file_descriptor: "False"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            export CUDA_SO="$(\ls -d /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls -d /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
            export DEVICES="$(\ls -d /dev/nvidia* | xargs -I{} echo "-v {}:{}") $(\ls /dev/nvidia-caps/* | xargs -I{} echo "-v {}:{}")"
            export SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
            docker run -d -t --name ${container_name} ${CUDA_SO} ${DEVICES} ${SMI} --runtime=nvidia --shm-size=32G \
              --network host -v /dev/shm:/dev/shm \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e FLAGS_dataloader_use_file_descriptor \
              -e python_version \
              -w /workspace $IMAGE_NAME
          fi
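      # Fetch a synced PaddleNLP source tarball, then (for PRs) check out the PR
      # head and merge the base branch before testing.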
      - name: Download Code
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              rm -rf * .[^.]*
              echo "Downloading PaddleNLP.tar"
              wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
              echo "Extracting PaddleNLP.tar"
              tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
              source $work_dir/../../../proxy
              cd PaddleNLP
              git config --global user.name "PaddleCI"
              git config --global user.email "paddle_ci@example.com"
              git pull
              git submodule update --init --recursive --force
              if [ -n "${PR_ID}" ]; then
                git fetch origin pull/${PR_ID}/head
                git checkout -b PR_${PR_ID} FETCH_HEAD
                git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
                git fetch upstream ${BRANCH}
                git merge ${BRANCH} --no-edit
                git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
              else
                echo "Not in a pull_request event. Skipping PR-specific operations."
              fi
              git log --pretty=oneline -10
            '
          fi
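      # Run the distributed CI entry script against the configured Paddle wheel,
      # with an 80-minute timeout.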
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              ln -sf $(which python${python_version}) /usr/bin/python
              pip config set global.cache-dir "/home/.cache/pip"
              source $work_dir/../../../proxy
              set -e
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi
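      # Always upload case logs to BOS, under a PR/commit prefix for PR and
      # test_build runs, or a dated "schedule/" prefix for nightly runs.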
      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/..
          bos_file: ${{ github.workspace }}/../bos_retry/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_retry.tar.gz https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos_retry
                tar xf ${{ env.home_path }}/bos_retry.tar.gz -C ${{ env.home_path }}/bos_retry
              fi
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi
              cd /workspace/case_logs
              for FILE in /workspace/case_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs/$file"
              done
            '
          fi
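      # Clean the container workspace and remove the container, even if earlier
      # steps failed or the container was never started.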
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' 2>/dev/null || true
          docker rm -f $container_name 2>/dev/null || true