intel-lab-lkp
diff --git a/‎Documentation/gpu/amdgpu/index.rst
Lines changed: 1 addition & 0 deletions b/‎Documentation/gpu/amdgpu/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎Documentation/gpu/amdgpu/process-isolation.rst
Lines changed: 59 additions & 0 deletions b/‎Documentation/gpu/amdgpu/process-isolation.rst
Lines changed: 59 additions & 0 deletions
diff --git a/‎drivers/gpu/drm/amd/amdgpu/amdgpu.h
Lines changed: 8 additions & 0 deletions b/‎drivers/gpu/drm/amd/amdgpu/amdgpu.h
Lines changed: 8 additions & 0 deletions
diff --git a/‎drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
Lines changed: 1 addition & 1 deletion b/‎drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
Lines changed: 1 addition & 1 deletion
diff --git a/‎drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Lines changed: 11 additions & 2 deletions b/‎drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Lines changed: 11 additions & 2 deletions
diff --git a/‎drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Lines changed: 53 additions & 0 deletions b/‎drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Lines changed: 53 additions & 0 deletions
@@ -16,4 +16,5 @@ Next (GCN), Radeon DNA (RDNA), and Compute DNA (CDNA) architectures.
    thermal
    driver-misc
    debugging
+   process-isolation
    amdgpu-glossary
@@ -0,0 +1,59 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+ AMDGPU Process Isolation
+=========================
+
+The AMDGPU driver includes a feature that enables automatic process isolation on the graphics engine. This feature serializes access to the graphics engine and adds a cleaner shader which clears the Local Data Store (LDS) and General Purpose Registers (GPRs) between jobs. All processes using the GPU, including both graphics and compute workloads, are serialized when this feature is enabled. On GPUs that support partitionable graphics engines, this feature can be enabled on a per-partition basis.
+
+In addition, there is an interface to manually run the cleaner shader when the use of the GPU is complete. This may be preferable in some use cases, such as a single-user system where the login manager triggers the cleaner shader when the user logs out.
+
+Process Isolation
+=================
+
+The `run_cleaner_shader` and `enforce_isolation` sysfs interfaces allow users to manually execute the cleaner shader and control the process isolation feature, respectively.
+
+Partition Handling
+------------------
+
+The `enforce_isolation` file in sysfs can be used to enable process isolation and automatic shader cleanup between processes. On GPUs that support graphics engine partitioning, this can be enabled per partition. The partition and its current setting (0 disabled, 1 enabled) can be read from sysfs. On GPUs that do not support graphics engine partitioning, only a single partition will be present. Writing 1 to the partition position enables enforce isolation, writing 0 disables it.
+
+Example of enabling enforce isolation on a GPU with multiple partitions:
+
+.. code-block:: console
+
+    $ echo 1 0 1 0 > /sys/class/drm/card0/device/enforce_isolation
+    $ cat /sys/class/drm/card0/device/enforce_isolation
+    1 0 1 0
+
+The output indicates that enforce isolation is enabled on zeroth and second parition and disabled on first and fourth parition.
+
+For devices with a single partition or those that do not support partitions, there will be only one element:
+
+.. code-block:: console
+
+    $ echo 1 > /sys/class/drm/card0/device/enforce_isolation
+    $ cat /sys/class/drm/card0/device/enforce_isolation
+    1
+
+Cleaner Shader Execution
+========================
+
+The driver can trigger a cleaner shader to clean up the LDS and GPR state on the graphics engine. When process isolation is enabled, this happens automatically between processes. In addition, there is a sysfs file to manually trigger cleaner shader execution.
+
+To manually trigger the execution of the cleaner shader, write `0` to the `run_cleaner_shader` sysfs file:
+
+.. code-block:: console
+
+    $ echo 0 > /sys/class/drm/card0/device/run_cleaner_shader
+
+For multi-partition devices, you can specify the partition index when triggering the cleaner shader:
+
+.. code-block:: console
+
+    $ echo 0 > /sys/class/drm/card0/device/run_cleaner_shader # For partition 0
+    $ echo 1 > /sys/class/drm/card0/device/run_cleaner_shader # For partition 1
+    $ echo 2 > /sys/class/drm/card0/device/run_cleaner_shader # For partition 2
+    # ... and so on for each partition
+
+This command initiates the cleaner shader, which will run and complete before any new tasks are scheduled on the GPU.
@@ -299,6 +299,12 @@ extern int amdgpu_wbrf;
 #define AMDGPU_RESET_VCE			(1 << 13)
 #define AMDGPU_RESET_VCE1			(1 << 14)
 
+/* reset mask */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
 /* max cursor sizes (in pixels) */
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
@@ -1464,6 +1470,8 @@ struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
 					    struct dma_fence *gang);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
 /* atpx handler */
 #if defined(CONFIG_VGA_SWITCHEROO)
 
@@ -158,7 +158,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 		return -EINVAL;
 	}
 
-	if (start + count >= max_count)
+	if (start + count > max_count)
 		return -EINVAL;
 
 	count = min_t(int, count, max_count);
 
@@ -834,6 +834,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
 
+	if (!kiq_ring->sched.ready || adev->job_hang)
+		return 0;
+
 	ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
 	if (!ring_funcs)
 		return -ENOMEM;
@@ -858,8 +861,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 
 	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
 
-	if (kiq_ring->sched.ready && !adev->job_hang)
-		r = amdgpu_ring_test_helper(kiq_ring);
+	/* Submit unmap queue packet */
+	amdgpu_ring_commit(kiq_ring);
+	/*
+	 * Ring test will do a basic scratch register change check. Just run
+	 * this to ensure that unmap queues that is submitted before got
+	 * processed successfully before returning.
+	 */
+	r = amdgpu_ring_test_helper(kiq_ring);
 
 	spin_unlock(&kiq->ring_lock);
 
 
@@ -4236,7 +4236,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 * for throttling interrupt) = 60 seconds.
 	 */
 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
+	ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
+
 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
+	ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
 
 	/* Registers mapping */
 	/* TODO: block userspace mapping of io register */
@@ -5186,6 +5189,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
 		amdgpu_ras_resume(adev);
+
+	amdgpu_virt_ras_telemetry_post_reset(adev);
+
 	return 0;
 }
 
@@ -6200,6 +6206,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
 	bool p2p_access =
 		!adev->gmc.xgmi.connected_to_cpu &&
 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
+	if (!p2p_access)
+		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
+			pci_name(peer_adev->pdev));
 
 	bool is_large_bar = adev->gmc.visible_vram_size &&
 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
@@ -6715,3 +6724,47 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
 	}
 	return ret;
 }
+
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
+{
+	ssize_t size = 0;
+
+	if (!ring || !ring->adev)
+		return size;
+
+	if (amdgpu_device_should_recover_gpu(ring->adev))
+		size |= AMDGPU_RESET_TYPE_FULL;
+
+	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
+	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
+		size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+	return size;
+}
+
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
+{
+	ssize_t size = 0;
+
+	if (supported_reset == 0) {
+		size += sysfs_emit_at(buf, size, "unsupported");
+		size += sysfs_emit_at(buf, size, "\n");
+		return size;
+
+	}
+
+	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
+		size += sysfs_emit_at(buf, size, "soft ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
+		size += sysfs_emit_at(buf, size, "queue ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
+		size += sysfs_emit_at(buf, size, "pipe ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
+		size += sysfs_emit_at(buf, size, "full ");
+
+	size += sysfs_emit_at(buf, size, "\n");
+	return size;
+}
Original file line number	Diff line number	Diff line change
`@@ -158,7 +158,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_`
`158`	`158`	`return -EINVAL;`
`159`	`159`	`}`
`160`	`160`
`161`		`- if (start + count >= max_count)`
	`161`	`+ if (start + count > max_count)`
`162`	`162`	`return -EINVAL;`
`163`	`163`
`164`	`164`	`count = min_t(int, count, max_count);`