inductor-perf-nightly-h100 #24
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
name: inductor-perf-nightly-h100
# Triggers for the H100 inductor perf benchmark workflow.
#
# NOTE: the cron strings below are compared verbatim against
# `github.event.schedule` in the `if:` conditions of the scheduled test jobs,
# so any change here must be mirrored there.
on:
  schedule:
    - cron: '15 0,12 * * 1-6'
    - cron: '0 7 * * 0'
  # NB: GitHub has an upper limit of 10 inputs here, so until we can sort that
  # out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
  workflow_dispatch:
    inputs:
      training:
        description: Run training (on by default)?
        required: false
        type: boolean
        default: true
      inference:
        description: Run inference (on by default)?
        required: false
        type: boolean
        default: true
      default:
        description: Run inductor_default?
        required: false
        type: boolean
        default: false
      dynamic:
        description: Run inductor_dynamic_shapes?
        required: false
        type: boolean
        default: false
      cppwrapper:
        description: Run inductor_cpp_wrapper?
        required: false
        type: boolean
        default: false
      cudagraphs:
        description: Run inductor_cudagraphs?
        required: false
        type: boolean
        default: true
      freezing_cudagraphs:
        description: Run inductor_cudagraphs with freezing for inference?
        required: false
        type: boolean
        default: false
      aotinductor:
        description: Run aot_inductor for inference?
        required: false
        type: boolean
        default: false
      maxautotune:
        description: Run inductor_max_autotune?
        required: false
        type: boolean
        default: false
      benchmark_configs:
        # Comma-separated; forwarded to the build job as selected-test-configs.
        description: The list of configs used by the benchmark
        required: false
        type: string
        default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100
  pull_request:
    # Changing these files guarantees that this workflow needs to be run
    paths:
      - .github/workflows/inductor-perf-test-nightly-h100.yml
      - .ci/docker/ci_commit_pins/huggingface-requirements.txt
# Runs sharing a group key replace one another: starting a new run cancels the
# one already in progress. The key is built from the workflow name, the PR
# number (falling back to the commit SHA when there is no PR), and two flags
# telling manual (workflow_dispatch) and scheduled runs apart from the rest.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}"
  cancel-in-progress: true
# Token scopes granted to the jobs in this workflow.
permissions:
  # Read-only access to repository contents.
  contents: read
  # Allows jobs to request an ID token.
  # NOTE(review): presumably needed by the reusable build/test workflows for
  # OIDC-based authentication — confirm against those workflows.
  id-token: write
# Job graph: get-label-type -> build -> one of the three test jobs, selected
# by the triggering event (twice-daily cron, weekly cron, or manual/PR run).
jobs:
  # Calls the runner-determinator workflow; its `label-type` output is used as
  # the runner prefix of the build job. Scheduled runs are restricted to
  # pytorch/pytorch, and every run to repositories owned by `pytorch`.
  get-label-type:
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

  # Builds for CUDA arch 9.0 and declares the shard matrix that the test jobs
  # consume through `needs.build.outputs.test-matrix`.
  build:
    name: build
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      # Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
      # or newer GPUs, so it doesn't benefit much from existing compiler cache
      # from trunk. Also use a memory-intensive runner here because memory is
      # usually the bottleneck
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '9.0'
      # 5 huggingface + 7 timm + 9 torchbench shards, all on H100 runners.
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
          { config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
          { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
          { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
          { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
          { config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
        ]}
      selected-test-configs: ${{ inputs.benchmark_configs }}
      build-additional-packages: "vision audio fbgemm torchao"
    secrets: inherit

  # Scheduled run for the '15 0,12 * * 1-6' cron. The condition string must
  # match the cron entry under `on.schedule` exactly.
  test-periodically:
    name: test-periodically
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '15 0,12 * * 1-6'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
      # The monitor is left enabled for perf tests (disable-monitor is false)
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

  # Scheduled run for the '0 7 * * 0' cron. Compared with test-periodically,
  # the tag additionally sets maxautotune and freeze_autotune_cudagraphs, and
  # the timeout is doubled.
  test-weekly:
    name: test-weekly
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 0'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 1440
      # The monitor is left enabled for perf tests (disable-monitor is false)
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

  # Manual (workflow_dispatch) and pull_request runs.
  test:
    name: test
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    # The pull_request trigger is used in PR to bump transformers pin which always
    # needs one round of benchmark
    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      # Flags that default to on for pull_request runs (training, inference,
      # default, dynamic, cudagraphs) use
      # `github.event_name != 'workflow_dispatch' || inputs.<flag>` rather than
      # `inputs.<flag> || 'true'`: with the latter, an explicit `false` from a
      # manual dispatch is falsy and is silently turned back into 'true'.
      # Non-dispatch events still yield 'true'; dispatch runs now use the input
      # value as entered, including the declared `false` defaults of `default`
      # and `dynamic`. Flags that default to off keep `|| 'false'`, which
      # already handles both values correctly. cudagraphs_low_precision
      # deliberately follows the cudagraphs input.
      dashboard-tag: training-${{ github.event_name != 'workflow_dispatch' || inputs.training }}-inference-${{ github.event_name != 'workflow_dispatch' || inputs.inference }}-default-${{ github.event_name != 'workflow_dispatch' || inputs.default }}-dynamic-${{ github.event_name != 'workflow_dispatch' || inputs.dynamic }}-cudagraphs-${{ github.event_name != 'workflow_dispatch' || inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
      # The monitor is left enabled for perf tests (disable-monitor is false)
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit