@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
950
950
* @inst: xcc's instance number on a multi-XCC setup
951
951
*/
952
952
static void get_wave_count (struct amdgpu_device * adev , int queue_idx ,
953
- int * wave_cnt , int * vmid , uint32_t inst )
953
+ struct kfd_cu_occupancy * queue_cnt , uint32_t inst )
954
954
{
955
955
int pipe_idx ;
956
956
int queue_slot ;
957
957
unsigned int reg_val ;
958
-
958
+ unsigned int wave_cnt ;
959
959
/*
960
960
* Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
961
961
* parameters to read out waves in flight. Get the doorbell offset if there are
962
962
* non-zero waves in flight.
963
963
*/
964
- * vmid = 0xFF ;
965
- * wave_cnt = 0 ;
966
964
pipe_idx = queue_idx / adev -> gfx .mec .num_queue_per_pipe ;
967
965
queue_slot = queue_idx % adev -> gfx .mec .num_queue_per_pipe ;
968
- soc15_grbm_select (adev , 1 , pipe_idx , queue_slot , 0 , inst );
969
- reg_val = RREG32_SOC15_IP (GC , SOC15_REG_OFFSET (GC , inst , mmSPI_CSQ_WF_ACTIVE_COUNT_0 ) +
970
- queue_slot );
971
- * wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK ;
972
- if (* wave_cnt != 0 )
973
- * vmid = (RREG32_SOC15 (GC , inst , mmCP_HQD_VMID ) &
974
- CP_HQD_VMID__VMID_MASK ) >> CP_HQD_VMID__VMID__SHIFT ;
966
+ soc15_grbm_select (adev , 1 , pipe_idx , queue_slot , 0 , GET_INST (GC , inst ));
967
+ reg_val = RREG32_SOC15_IP (GC , SOC15_REG_OFFSET (GC , GET_INST (GC , inst ),
968
+ mmSPI_CSQ_WF_ACTIVE_COUNT_0 ) + queue_slot );
969
+ wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK ;
970
+ if (wave_cnt != 0 ) {
971
+ queue_cnt -> wave_cnt += wave_cnt ;
972
+ queue_cnt -> doorbell_off =
973
+ (RREG32_SOC15 (GC , GET_INST (GC , inst ), mmCP_HQD_PQ_DOORBELL_CONTROL ) &
974
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK ) >>
975
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT ;
976
+ }
975
977
}
976
978
977
979
/**
@@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
981
983
* or more queues running and submitting waves to compute units.
982
984
*
983
985
* @adev: Handle of device from which to get number of waves in flight
984
- * @pasid: Identifies the process for which this query call is invoked
985
- * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
986
- * belong to process with given pasid
986
+ * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
987
+ * for comparison later.
987
988
* @max_waves_per_cu: Output parameter updated with maximum number of waves
988
989
* possible per Compute Unit
989
990
* @inst: xcc's instance number on a multi-XCC setup
@@ -1011,34 +1012,28 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
1011
1012
* number of waves that are in flight for the queue at specified index. The
1012
1013
* index ranges from 0 to 7.
1013
1014
*
1014
- * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
1015
- * of the wave(s) .
1015
+ * If non-zero waves are in flight, store the corresponding doorbell offset
1016
+ * of the queue, along with the wave count .
1016
1017
*
1017
- * Determine if VMID from above step maps to pasid provided as parameter. If
1018
- * it matches agrregate the wave count. That the VMID will not match pasid is
1019
- * a normal condition i.e. a device is expected to support multiple queues
1020
- * from multiple proceses.
1018
+ * Determine if the queue belongs to the process by comparing the doorbell
1019
+ * offset against the process's queues. If it matches, aggregate the wave
1020
+ * count for the process.
1021
1021
*
1022
1022
* Reading registers referenced above involves programming GRBM appropriately
1023
1023
*/
1024
- void kgd_gfx_v9_get_cu_occupancy (struct amdgpu_device * adev , int pasid ,
1025
- int * pasid_wave_cnt , int * max_waves_per_cu , uint32_t inst )
1024
+ void kgd_gfx_v9_get_cu_occupancy (struct amdgpu_device * adev ,
1025
+ struct kfd_cu_occupancy * cu_occupancy ,
1026
+ int * max_waves_per_cu , uint32_t inst )
1026
1027
{
1027
1028
int qidx ;
1028
- int vmid ;
1029
1029
int se_idx ;
1030
- int sh_idx ;
1031
1030
int se_cnt ;
1032
- int sh_cnt ;
1033
- int wave_cnt ;
1034
1031
int queue_map ;
1035
- int pasid_tmp ;
1036
1032
int max_queue_cnt ;
1037
- int vmid_wave_cnt = 0 ;
1038
1033
DECLARE_BITMAP (cp_queue_bitmap , AMDGPU_MAX_QUEUES );
1039
1034
1040
1035
lock_spi_csq_mutexes (adev );
1041
- soc15_grbm_select (adev , 1 , 0 , 0 , 0 , inst );
1036
+ soc15_grbm_select (adev , 1 , 0 , 0 , 0 , GET_INST ( GC , inst ) );
1042
1037
1043
1038
/*
1044
1039
* Iterate through the shader engines and arrays of the device
@@ -1048,51 +1043,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
1048
1043
AMDGPU_MAX_QUEUES );
1049
1044
max_queue_cnt = adev -> gfx .mec .num_pipe_per_mec *
1050
1045
adev -> gfx .mec .num_queue_per_pipe ;
1051
- sh_cnt = adev -> gfx .config .max_sh_per_se ;
1052
1046
se_cnt = adev -> gfx .config .max_shader_engines ;
1053
1047
for (se_idx = 0 ; se_idx < se_cnt ; se_idx ++ ) {
1054
- for (sh_idx = 0 ; sh_idx < sh_cnt ; sh_idx ++ ) {
1048
+ amdgpu_gfx_select_se_sh (adev , se_idx , 0 , 0xffffffff , inst );
1049
+ queue_map = RREG32_SOC15 (GC , GET_INST (GC , inst ), mmSPI_CSQ_WF_ACTIVE_STATUS );
1050
+
1051
+ /*
1052
+ * Assumption: queue map encodes following schema: four
1053
+ * pipes per each micro-engine, with each pipe mapping
1054
+ * eight queues. This schema is true for GFX9 devices
1055
+ * and must be verified for newer device families
1056
+ */
1057
+ for (qidx = 0 ; qidx < max_queue_cnt ; qidx ++ ) {
1058
+ /* Skip queues that are not associated with
1059
+ * compute functions
1060
+ */
1061
+ if (!test_bit (qidx , cp_queue_bitmap ))
1062
+ continue ;
1055
1063
1056
- amdgpu_gfx_select_se_sh ( adev , se_idx , sh_idx , 0xffffffff , inst );
1057
- queue_map = RREG32_SOC15 ( GC , inst , mmSPI_CSQ_WF_ACTIVE_STATUS ) ;
1064
+ if (!( queue_map & ( 1 << qidx )))
1065
+ continue ;
1058
1066
1059
- /*
1060
- * Assumption: queue map encodes following schema: four
1061
- * pipes per each micro-engine, with each pipe mapping
1062
- * eight queues. This schema is true for GFX9 devices
1063
- * and must be verified for newer device families
1064
- */
1065
- for (qidx = 0 ; qidx < max_queue_cnt ; qidx ++ ) {
1066
-
1067
- /* Skip qeueus that are not associated with
1068
- * compute functions
1069
- */
1070
- if (!test_bit (qidx , cp_queue_bitmap ))
1071
- continue ;
1072
-
1073
- if (!(queue_map & (1 << qidx )))
1074
- continue ;
1075
-
1076
- /* Get number of waves in flight and aggregate them */
1077
- get_wave_count (adev , qidx , & wave_cnt , & vmid ,
1078
- inst );
1079
- if (wave_cnt != 0 ) {
1080
- pasid_tmp =
1081
- RREG32 (SOC15_REG_OFFSET (OSSSYS , inst ,
1082
- mmIH_VMID_0_LUT ) + vmid );
1083
- if (pasid_tmp == pasid )
1084
- vmid_wave_cnt += wave_cnt ;
1085
- }
1086
- }
1067
+ /* Get number of waves in flight and aggregate them */
1068
+ get_wave_count (adev , qidx , & cu_occupancy [qidx ],
1069
+ inst );
1087
1070
}
1088
1071
}
1089
1072
1090
1073
amdgpu_gfx_select_se_sh (adev , 0xffffffff , 0xffffffff , 0xffffffff , inst );
1091
- soc15_grbm_select (adev , 0 , 0 , 0 , 0 , inst );
1074
+ soc15_grbm_select (adev , 0 , 0 , 0 , 0 , GET_INST ( GC , inst ) );
1092
1075
unlock_spi_csq_mutexes (adev );
1093
1076
1094
1077
/* Update the output parameters and return */
1095
- * pasid_wave_cnt = vmid_wave_cnt ;
1096
1078
* max_waves_per_cu = adev -> gfx .cu_info .simd_per_cu *
1097
1079
adev -> gfx .cu_info .max_waves_per_simd ;
1098
1080
}
0 commit comments