Skip to content

Commit aa775ed

Browse files
committed
Merge tag 'drm-habanalabs-next-2024-02-26' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains habanalabs driver and accel changes for v6.9.

The notable changes are:

- New features and improvements:
  - Configure interrupt affinity according to NUMA nodes for the MSI-X interrupts that are assigned to the userspace application which acquires the device.
  - Move the HBM MMU page tables to reside inside the HBM to minimize latency when doing page-walks.
  - Improve the device reset mechanism when consecutive heartbeat failures occur (firmware fails to ack on heartbeat message).
  - Check also extended errors in the PCIe addr_dec interrupt information.
  - Rate limit the error messages that can be printed to dmesg log by userspace actions.

- Firmware related fixes:
  - Handle requests from firmware to reserve device memory

- Bug fixes and code cleanups:
  - constify the struct device_type usage in accel (accel_sysfs_device_minor).
  - Fix the PCI health check by reading uncached register.
  - Fix reporting of drain events.
  - Fix debugfs files permissions.
  - Fix calculation of DRAM BAR base address.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/ZdxJprop0EniVQtf@ogabbay-vm-u22.habana-labs.com
2 parents 19b232b + 576d7cc commit aa775ed

File tree

21 files changed

+1008
-510
lines changed

21 files changed

+1008
-510
lines changed

drivers/accel/drm_accel.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ static struct idr accel_minors_idr;
2323

2424
static struct dentry *accel_debugfs_root;
2525

26-
static struct device_type accel_sysfs_device_minor = {
26+
static const struct device_type accel_sysfs_device_minor = {
2727
.name = "accel_minor"
2828
};
2929

drivers/accel/habanalabs/common/command_submission.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
13601360
return -EINVAL;
13611361
}
13621362

1363-
if (!hl_device_operational(hdev, &status)) {
1363+
if (!hl_device_operational(hdev, &status))
13641364
return -EBUSY;
1365-
}
13661365

13671366
if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
13681367
!hdev->supports_staged_submission) {

drivers/accel/habanalabs/common/debugfs.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
484484
struct hl_debugfs_entry *entry = s->private;
485485
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
486486
struct hl_device *hdev = dev_entry->hdev;
487-
char kbuf[MMU_KBUF_SIZE];
487+
char kbuf[MMU_KBUF_SIZE] = {0};
488488
char *c;
489489
ssize_t rc;
490490

@@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
546546
struct hl_debugfs_entry *entry = s->private;
547547
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
548548
struct hl_device *hdev = dev_entry->hdev;
549-
char kbuf[MMU_KBUF_SIZE];
549+
char kbuf[MMU_KBUF_SIZE] = {0};
550550
ssize_t rc;
551551

552552
if (count > sizeof(kbuf) - 1)
@@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
16431643
&hl_data64b_fops);
16441644

16451645
debugfs_create_file("set_power_state",
1646-
0200,
1646+
0644,
16471647
root,
16481648
dev_entry,
16491649
&hl_power_fops);
16501650

16511651
debugfs_create_file("device",
1652-
0200,
1652+
0644,
16531653
root,
16541654
dev_entry,
16551655
&hl_device_fops);
16561656

16571657
debugfs_create_file("clk_gate",
1658-
0200,
1658+
0644,
16591659
root,
16601660
dev_entry,
16611661
&hl_clk_gate_fops);
@@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
16671667
&hl_stop_on_err_fops);
16681668

16691669
debugfs_create_file("dump_security_violations",
1670-
0644,
1670+
0400,
16711671
root,
16721672
dev_entry,
16731673
&hl_security_violations_fops);
16741674

16751675
debugfs_create_file("dump_razwi_events",
1676-
0644,
1676+
0400,
16771677
root,
16781678
dev_entry,
16791679
&hl_razwi_check_fops);
@@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
17061706
&hdev->reset_info.skip_reset_on_timeout);
17071707

17081708
debugfs_create_file("state_dump",
1709-
0600,
1709+
0644,
17101710
root,
17111711
dev_entry,
17121712
&hl_state_dump_fops);
@@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
17241724

17251725
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
17261726
debugfs_create_file(hl_debugfs_list[i].name,
1727-
0444,
1727+
0644,
17281728
root,
17291729
entry,
17301730
&hl_debugfs_fops);

drivers/accel/habanalabs/common/device.c

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi
5555
if (is_power_of_2(prop->dram_pci_bar_size))
5656
bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
5757
else
58-
bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
58+
bar_base_addr = region->region_base +
59+
div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
5960
prop->dram_pci_bar_size;
6061

6162
old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
@@ -1034,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev)
10341035

10351036
static bool is_pci_link_healthy(struct hl_device *hdev)
10361037
{
1037-
u16 vendor_id;
1038+
u16 device_id;
10381039

10391040
if (!hdev->pdev)
10401041
return false;
10411042

1042-
pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
1043+
pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);
10431044

1044-
return (vendor_id == PCI_VENDOR_ID_HABANALABS);
1045+
return (device_id == hdev->pdev->device);
10451046
}
10461047

10471048
static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
@@ -1768,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
17681769
hdev->device_cpu_disabled = false;
17691770
hdev->reset_info.hard_reset_pending = false;
17701771

1772+
/*
1773+
* Put the device in an unusable state if there are 2 back to back resets due to
1774+
* fatal errors.
1775+
*/
17711776
if (hdev->reset_info.reset_trigger_repeated &&
1772-
(hdev->reset_info.prev_reset_trigger ==
1773-
HL_DRV_RESET_FW_FATAL_ERR)) {
1774-
/* if there 2 back to back resets from FW,
1775-
* ensure driver puts the driver in a unusable state
1776-
*/
1777+
(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
1778+
hdev->reset_info.prev_reset_trigger ==
1779+
HL_DRV_RESET_HEARTBEAT)) {
17771780
dev_crit(hdev->dev,
1778-
"%s Consecutive FW fatal errors received, stopping hard reset\n",
1781+
"%s Consecutive fatal errors, stopping hard reset\n",
17791782
dev_name(&(hdev)->pdev->dev));
17801783
rc = -EIO;
17811784
goto out_err;
@@ -2801,3 +2804,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
28012804
atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
28022805
captured_err_info->undef_opcode.write_enable = true;
28032806
}
2807+
2808+
void hl_init_cpu_for_irq(struct hl_device *hdev)
2809+
{
2810+
#ifdef CONFIG_NUMA
2811+
struct cpumask *available_mask = &hdev->irq_affinity_mask;
2812+
int numa_node = hdev->pdev->dev.numa_node, i;
2813+
static struct cpumask cpu_mask;
2814+
2815+
if (numa_node < 0)
2816+
return;
2817+
2818+
if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) {
2819+
dev_err(hdev->dev, "No available affinities in current numa node\n");
2820+
return;
2821+
}
2822+
2823+
/* Remove HT siblings */
2824+
for_each_cpu(i, &cpu_mask)
2825+
cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(i)), available_mask);
2826+
#endif
2827+
}
2828+
2829+
void hl_set_irq_affinity(struct hl_device *hdev, int irq)
2830+
{
2831+
if (cpumask_empty(&hdev->irq_affinity_mask)) {
2832+
dev_dbg(hdev->dev, "affinity mask is empty\n");
2833+
return;
2834+
}
2835+
2836+
if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask))
2837+
dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
2838+
}

drivers/accel/habanalabs/common/firmware_if.c

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
501501
0, &result);
502502

503503
if (rc)
504-
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
504+
dev_err(hdev->dev, "failed to unmask event %d", event_type);
505505

506506
return rc;
507507
}
@@ -540,7 +540,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
540540
total_pkt_size, 0, &result);
541541

542542
if (rc)
543-
dev_err(hdev->dev, "failed to unmask IRQ array\n");
543+
dev_err(hdev->dev, "failed to unmask event array\n");
544544

545545
kfree(pkt);
546546

@@ -2718,18 +2718,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
27182718
hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
27192719
}
27202720

2721+
rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
2722+
if (rc)
2723+
goto protocol_err;
2724+
2725+
if (hdev->asic_prop.support_dynamic_resereved_fw_size)
2726+
hdev->asic_prop.reserved_fw_mem_size =
2727+
le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;
2728+
27212729
if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
27222730
struct lkd_fw_binning_info *binning_info;
27232731

2724-
rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader,
2725-
sizeof(struct lkd_msg_comms));
2726-
if (rc)
2727-
goto protocol_err;
2728-
27292732
/* read preboot version */
27302733
rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
27312734
fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
2732-
27332735
if (rc)
27342736
return rc;
27352737

@@ -2756,11 +2758,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
27562758
hdev->decoder_binning, hdev->rotator_binning);
27572759
}
27582760

2759-
if (hdev->asic_prop.support_dynamic_resereved_fw_size) {
2760-
hdev->asic_prop.reserved_fw_mem_size =
2761-
le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
2762-
}
2763-
27642761
return 0;
27652762
}
27662763

@@ -2795,7 +2792,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
27952792
hdev->asic_funcs->init_cpu_scrambler_dram(hdev);
27962793

27972794
if (!(hdev->fw_components & FW_TYPE_LINUX)) {
2798-
dev_info(hdev->dev, "Skip loading Linux F/W\n");
2795+
dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
27992796
return 0;
28002797
}
28012798

0 commit comments

Comments
 (0)