|
| 1 | +/* SPDX-License-Identifier: MIT */ |
| 2 | +/* |
| 3 | + * Copyright 2024 Advanced Micro Devices, Inc. |
| 4 | + * |
| 5 | + * Permission is hereby granted, free of charge, to any person obtaining a |
| 6 | + * copy of this software and associated documentation files (the "Software"), |
| 7 | + * to deal in the Software without restriction, including without limitation |
| 8 | + * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 | + * and/or sell copies of the Software, and to permit persons to whom the |
| 10 | + * Software is furnished to do so, subject to the following conditions: |
| 11 | + * |
| 12 | + * The above copyright notice and this permission notice shall be included in |
| 13 | + * all copies or substantial portions of the Software. |
| 14 | + * |
| 15 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| 18 | + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
| 19 | + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
| 20 | + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 21 | + * OTHER DEALINGS IN THE SOFTWARE. |
| 22 | + */ |
| 23 | + |
| 24 | +// This shader cleans LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader. |
| 25 | +// To turn this shader program on for compilation, change this to main and the lower shader main to main_1 |
| 26 | + |
| 27 | +// Navi3 : Clear SGPRs, VGPRs and LDS |
| 28 | +// Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot |
| 29 | +// Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD |
| 30 | +// Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS) |
| 31 | +// It takes 2 workgroups to use all of LDS: one on each CU of the WGP |
| 32 | +// Each wave clears SGPRs 0 - 107 |
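| 33 | +// Each wave clears VGPRs 0 - 95 (the VGPR loop runs 12 passes of 8 registers; the "64 VGPRs each" figure above may be stale - confirm) |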
| 34 | +// The first wave of the workgroup clears its 64KB of LDS |
| 35 | +// The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup |
| 36 | +// before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared. |
| 37 | + |
| 38 | +shader main |
| 39 | + asic(GFX11) |
| 40 | + type(CS) |
| 41 | + wave_size(32) |
| 42 | +// Note: original source code from SQ team |
| 43 | + |
| 44 | +// Takes about 2500 clocks to run. |
| 45 | +// (theoretical fastest = 1024clks vgpr + 640lds = 1660 clks) |
| 46 | +// Flow: barrier, zero VGPRs, zero LDS (first wave only), zero SGPRs, zero vcc/flat_scratch/ttmp, end. |
| 47 | + S_BARRIER |
| 48 | + |
| 49 | + // |
| 50 | + // CLEAR VGPRs: v_movreld_b32 vN writes v[N + M0], so each pass zeroes 8 consecutive VGPRs |
| 51 | + // |
| 52 | + s_mov_b32 m0, 0x00000058 // M0 = 88 (top group v88..v95); loop 96/8=12 times (loop unrolled for performance) |
| 53 | + |
| 54 | +label_0005: |
| 55 | + v_movreld_b32 v0, 0 |
| 56 | + v_movreld_b32 v1, 0 |
| 57 | + v_movreld_b32 v2, 0 |
| 58 | + v_movreld_b32 v3, 0 |
| 59 | + v_movreld_b32 v4, 0 |
| 60 | + v_movreld_b32 v5, 0 |
| 61 | + v_movreld_b32 v6, 0 |
| 62 | + v_movreld_b32 v7, 0 |
| 63 | + s_sub_u32 m0, m0, 8 |
| 64 | + s_cbranch_scc0 label_0005 |
| 65 | + // s_sub_u32 sets SCC on unsigned borrow, so the loop exits after the M0 = 0 pass (v0..v7) |
| 66 | + // |
| 67 | + |
| 68 | + s_mov_b32 s2, 0x80000000 // Mask for bit31, the first_wave flag |
| 69 | + s_and_b32 s2, s2, s0 // sgpr0 holds the tg_size (first_wave) term, as the ucode sets only COMPUTE_PGM_RSRC2.tg_size_en |
| 70 | + s_cbranch_scc0 label_0023 // SCC0 = bit31 clear: not the first wave of the ThreadGroup/WorkGroup, so skip the LDS clear |
| 71 | + // CLEAR LDS (reached only by the first wave of the ThreadGroup/WorkGroup) |
| 72 | + // |
| 73 | + s_mov_b32 exec_lo, 0xffffffff |
| 74 | + s_mov_b32 exec_hi, 0xffffffff |
| 75 | + v_mbcnt_lo_u32_b32 v1, exec_hi, 0 // Set V1 to thread-ID: count of enabled lanes below this one (lanes 0..31) |
| 76 | + v_mbcnt_hi_u32_b32 v1, exec_lo, v1 // Add the lanes 32..63 part of the count; with wave_size(32) the thread-ID presumably stays 0..31 |
| 77 | + v_mul_u32_u24 v1, 0x00000008, v1 // * 8, so each thread gets its own double-dword (8-byte) LDS address |
| 78 | + s_mov_b32 s2, 0x00000003f // Counter starts at 63: 64 loop iterations (loop exits on borrow) |
| 79 | + s_mov_b32 m0, 0xffffffff |
| 80 | + // Clear all of LDS space: each pass stores v[2:3] / v[4:5] (zeroed by the VGPR loop) at offsets 0, 32, 64, 96 (8-byte units), |
| 81 | + // then advances V1 by 0x400 bytes. Each FirstWave of WorkGroup clears 64kbyte block |
| 82 | + |
| 83 | +label_001F: |
| 84 | + ds_write2_b64 v1, v[2:3], v[2:3] offset1:32 |
| 85 | + ds_write2_b64 v1, v[4:5], v[4:5] offset0:64 offset1:96 |
| 86 | + v_add_co_u32 v1, vcc, 0x00000400, v1 |
| 87 | + s_sub_u32 s2, s2, 1 |
| 88 | + s_cbranch_scc0 label_001F |
| 89 | + // |
| 90 | + // CLEAR SGPRs: s_movreld_b32 sN writes s[N + M0], so each pass zeroes 4 consecutive SGPRs |
| 91 | + // |
| 92 | +label_0023: |
| 93 | + s_mov_b32 m0, 0x00000068 // M0 = 104 (top group s104..s107); loop 108/4=27 times (loop unrolled for performance) |
| 94 | +label_sgpr_loop: |
| 95 | + s_movreld_b32 s0, 0 |
| 96 | + s_movreld_b32 s1, 0 |
| 97 | + s_movreld_b32 s2, 0 |
| 98 | + s_movreld_b32 s3, 0 |
| 99 | + s_sub_u32 m0, m0, 4 |
| 100 | + s_cbranch_scc0 label_sgpr_loop |
| 101 | + |
| 102 | + // Clear the remaining named registers: vcc, flat_scratch and ttmp0..ttmp15 |
| 103 | + s_mov_b64 vcc, 0 // Clear vcc |
| 104 | + s_mov_b32 flat_scratch_lo, 0 // Clear flat scratch lo SGPR |
| 105 | + s_mov_b32 flat_scratch_hi, 0 // Clear flat scratch hi SGPR |
| 106 | + s_mov_b64 ttmp0, 0 // Clear ttmp0 and ttmp1 |
| 107 | + s_mov_b64 ttmp2, 0 // Clear ttmp2 and ttmp3 |
| 108 | + s_mov_b64 ttmp4, 0 // Clear ttmp4 and ttmp5 |
| 109 | + s_mov_b64 ttmp6, 0 // Clear ttmp6 and ttmp7 |
| 110 | + s_mov_b64 ttmp8, 0 // Clear ttmp8 and ttmp9 |
| 111 | + s_mov_b64 ttmp10, 0 // Clear ttmp10 and ttmp11 |
| 112 | + s_mov_b64 ttmp12, 0 // Clear ttmp12 and ttmp13 |
| 113 | + s_mov_b64 ttmp14, 0 // Clear ttmp14 and ttmp15 |
| 114 | + |
| 115 | + s_endpgm |
| 116 | + |
| 117 | +end |
| 118 | + |
0 commit comments