Skip to content

Commit 25f4874

Browse files
committed
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "Aside from the usual things this has an arch update for __iowrite64_copy() used by the RDMA drivers. This API was intended to generate large 64 byte MemWr TLPs on PCI. These days most processors had done this by just repeating writel() in a loop. S390 and some new ARM64 designs require a special helper to get this to generate. - Small improvements and fixes for erdma, efa, hfi1, bnxt_re - Fix a UAF crash after module unload on leaking restrack entry - Continue adding full RDMA support in mana with support for EQs, GID's and CQs - Improvements to the mkey cache in mlx5 - DSCP traffic class support in hns and several bug fixes - Cap the maximum number of MADs in the receive queue to avoid OOM - Another batch of rxe bug fixes from large scale testing - __iowrite64_copy() optimizations for write combining MMIO memory - Remove NULL checks before dev_put/hold() - EFA support for receive with immediate - Fix a recent memleaking regression in a cma error path" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (70 commits) RDMA/cma: Fix kmemleak in rdma_core observed during blktests nvme/rdma use siw RDMA/IPoIB: Fix format truncation compilation errors bnxt_re: avoid shift undefined behavior in bnxt_qplib_alloc_init_hwq RDMA/efa: Support QP with unsolicited write w/ imm. receive IB/hfi1: Remove generic .ndo_get_stats64 IB/hfi1: Do not use custom stat allocator RDMA/hfi1: Use RMW accessors for changing LNKCTL2 RDMA/mana_ib: implement uapi for creation of rnic cq RDMA/mana_ib: boundary check before installing cq callbacks RDMA/mana_ib: introduce a helper to remove cq callbacks RDMA/mana_ib: create and destroy RNIC cqs RDMA/mana_ib: create EQs for RNIC CQs RDMA/core: Remove NULL check before dev_{put, hold} RDMA/ipoib: Remove NULL check before dev_{put, hold} RDMA/mlx5: Remove NULL check before dev_{put, hold} RDMA/mlx5: Track DCT, DCI and REG_UMR QPs as diver_detail resources. 
RDMA/core: Add an option to display driver-specific QPs in the rdmatool RDMA/efa: Add shutdown notifier RDMA/mana_ib: Fix missing ret value IB/mlx5: Use __iowrite64_copy() for write combining stores ...
2 parents 56172ac + 9c07318 commit 25f4874

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

77 files changed

+1584
-768
lines changed

arch/arm64/include/asm/io.h

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
139139
#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l))
140140
#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l))
141141

142+
/*
 * The ARM64 iowrite implementation is intended to support drivers that want to
 * use write combining. For instance PCI drivers using write combining with a 64
 * byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus.
 *
 * Newer ARM core have sensitive write combining buffers, it is important that
 * the stores be contiguous blocks of store instructions. Normal memcpy
 * approaches have a very low chance to generate write combining.
 *
 * Since this is the only API on ARM64 that should be used with write combining
 * it also integrates the DGH hint which is supposed to lower the latency to
 * emit the large TLP from the CPU.
 */

/*
 * Emit an unrolled, back-to-back run of 32-bit str instructions so the
 * write-combining buffer sees one contiguous block of stores.
 * count must be a compile-time constant and one of 8, 4, 2 or 1;
 * anything else trips BUILD_BUG() below.
 */
static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to,
						 const u32 *from, size_t count)
{
	switch (count) {
	case 8:
		/* %0..%7 are the data words ("rZ" allows the zero register), %8 is the base */
		asm volatile("str %w0, [%8, #4 * 0]\n"
			     "str %w1, [%8, #4 * 1]\n"
			     "str %w2, [%8, #4 * 2]\n"
			     "str %w3, [%8, #4 * 3]\n"
			     "str %w4, [%8, #4 * 4]\n"
			     "str %w5, [%8, #4 * 5]\n"
			     "str %w6, [%8, #4 * 6]\n"
			     "str %w7, [%8, #4 * 7]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
	case 4:
		asm volatile("str %w0, [%4, #4 * 0]\n"
			     "str %w1, [%4, #4 * 1]\n"
			     "str %w2, [%4, #4 * 2]\n"
			     "str %w3, [%4, #4 * 3]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "r"(to));
		break;
	case 2:
		asm volatile("str %w0, [%2, #4 * 0]\n"
			     "str %w1, [%2, #4 * 1]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
	case 1:
		/* A single store needs no unrolling */
		__raw_writel(*from, to);
		break;
	default:
		/* Compile-time error for unsupported constant counts */
		BUILD_BUG();
	}
}
196+
197+
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count);

/*
 * Constant-count fast path for __iowrite32_copy(): when count is one of
 * the sizes the unrolled store block supports, emit it followed by the
 * DGH hint; any other count is handed to the out-of-line implementation.
 */
static inline void __const_iowrite32_copy(void __iomem *to, const void *from,
					  size_t count)
{
	if (count != 8 && count != 4 && count != 2 && count != 1) {
		__iowrite32_copy_full(to, from, count);
		return;
	}
	__const_memcpy_toio_aligned32(to, from, count);
	dgh();
}
209+
210+
/*
 * Route compile-time-constant counts to the unrolled inline fast path;
 * runtime counts go to the out-of-line copy.
 */
#define __iowrite32_copy(to, from, count)                  \
	(__builtin_constant_p(count) ?                     \
		 __const_iowrite32_copy(to, from, count) : \
		 __iowrite32_copy_full(to, from, count))
214+
215+
/*
 * 64-bit counterpart of __const_memcpy_toio_aligned32(): an unrolled,
 * contiguous run of 64-bit str instructions for write-combining MMIO.
 * count must be a compile-time constant and one of 8, 4, 2 or 1.
 */
static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to,
						 const u64 *from, size_t count)
{
	switch (count) {
	case 8:
		/* %0..%7 are the data quadwords ("rZ" allows the zero register), %8 is the base */
		asm volatile("str %x0, [%8, #8 * 0]\n"
			     "str %x1, [%8, #8 * 1]\n"
			     "str %x2, [%8, #8 * 2]\n"
			     "str %x3, [%8, #8 * 3]\n"
			     "str %x4, [%8, #8 * 4]\n"
			     "str %x5, [%8, #8 * 5]\n"
			     "str %x6, [%8, #8 * 6]\n"
			     "str %x7, [%8, #8 * 7]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
	case 4:
		asm volatile("str %x0, [%4, #8 * 0]\n"
			     "str %x1, [%4, #8 * 1]\n"
			     "str %x2, [%4, #8 * 2]\n"
			     "str %x3, [%4, #8 * 3]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "r"(to));
		break;
	case 2:
		asm volatile("str %x0, [%2, #8 * 0]\n"
			     "str %x1, [%2, #8 * 1]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
	case 1:
		/* A single store needs no unrolling */
		__raw_writeq(*from, to);
		break;
	default:
		/* Compile-time error for unsupported constant counts */
		BUILD_BUG();
	}
}
255+
256+
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);

/*
 * Constant-count fast path for __iowrite64_copy(): when count is one of
 * the sizes the unrolled store block supports, emit it followed by the
 * DGH hint; any other count is handed to the out-of-line implementation.
 */
static inline void __const_iowrite64_copy(void __iomem *to, const void *from,
					  size_t count)
{
	if (count != 8 && count != 4 && count != 2 && count != 1) {
		__iowrite64_copy_full(to, from, count);
		return;
	}
	__const_memcpy_toio_aligned64(to, from, count);
	dgh();
}
268+
269+
/*
 * Route compile-time-constant counts to the unrolled inline fast path;
 * runtime counts go to the out-of-line copy.
 */
#define __iowrite64_copy(to, from, count)                  \
	(__builtin_constant_p(count) ?                     \
		 __const_iowrite64_copy(to, from, count) : \
		 __iowrite64_copy_full(to, from, count))
273+
142274
/*
143275
* I/O memory mapping functions.
144276
*/

arch/arm64/kernel/io.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
3737
}
3838
EXPORT_SYMBOL(__memcpy_fromio);
3939

40+
/*
 * This generates a memcpy that works on a from/to address which is aligned to
 * bits. Count is in terms of the number of bits sized quantities to copy. It
 * optimizes to use the STR groupings when possible so that it is WC friendly.
 */
#define memcpy_toio_aligned(to, from, count, bits)                        \
	({                                                                \
		volatile u##bits __iomem *_to = to;                       \
		const u##bits *_from = from;                              \
		size_t _count = count;                                    \
		const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
									  \
		/* Bulk of the copy: unrolled groups of 8 stores */       \
		for (; _from < _end_from; _from += 8, _to += 8)           \
			__const_memcpy_toio_aligned##bits(_to, _from, 8); \
		/* Tail: at most one each of a 4-, 2- and 1-wide group */ \
		if ((_count % 8) >= 4) {                                  \
			__const_memcpy_toio_aligned##bits(_to, _from, 4); \
			_from += 4;                                       \
			_to += 4;                                         \
		}                                                         \
		if ((_count % 4) >= 2) {                                  \
			__const_memcpy_toio_aligned##bits(_to, _from, 2); \
			_from += 2;                                       \
			_to += 2;                                         \
		}                                                         \
		if (_count % 2)                                           \
			__const_memcpy_toio_aligned##bits(_to, _from, 1); \
	})
67+
68+
/*
 * Out-of-line copy of count 64-bit quantities to MMIO, built from the
 * WC-friendly STR groupings, finished with a DGH hint.
 */
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
{
	memcpy_toio_aligned(to, from, count, 64);
	dgh();
}
EXPORT_SYMBOL(__iowrite64_copy_full);
74+
75+
/*
 * Out-of-line copy of count 32-bit quantities to MMIO, built from the
 * WC-friendly STR groupings, finished with a DGH hint.
 */
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
{
	memcpy_toio_aligned(to, from, count, 32);
	dgh();
}
EXPORT_SYMBOL(__iowrite32_copy_full);
81+
4082
/*
4183
* Copy data from "real" memory space to IO memory space.
4284
*/

arch/s390/include/asm/io.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,21 @@ static inline void ioport_unmap(void __iomem *p)
7373
#define __raw_writel zpci_write_u32
7474
#define __raw_writeq zpci_write_u64
7575

76+
/* combine single writes by using store-block insn */
static inline void __iowrite32_copy(void __iomem *to, const void *from,
				    size_t count)
{
	/* count is in 32-bit words; zpci_memcpy_toio() takes bytes */
	zpci_memcpy_toio(to, from, count * 4);
}
#define __iowrite32_copy __iowrite32_copy

static inline void __iowrite64_copy(void __iomem *to, const void *from,
				    size_t count)
{
	/* count is in 64-bit words; zpci_memcpy_toio() takes bytes */
	zpci_memcpy_toio(to, from, count * 8);
}
#define __iowrite64_copy __iowrite64_copy
90+
7691
#endif /* CONFIG_PCI */
7792

7893
#include <asm-generic/io.h>

arch/s390/pci/pci.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
250250
return 0;
251251
}
252252

253-
/* combine single writes by using store-block insn */
254-
void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
255-
{
256-
zpci_memcpy_toio(to, from, count * 8);
257-
}
258-
259253
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
260254
unsigned long prot)
261255
{

arch/x86/include/asm/io.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t);
209209
#define memcpy_toio memcpy_toio
210210
#define memset_io memset_io
211211

212+
#ifdef CONFIG_X86_64
213+
/*
214+
* Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
215+
* x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
216+
* loop.
217+
*/
218+
static inline void __iowrite32_copy(void __iomem *to, const void *from,
219+
size_t count)
220+
{
221+
asm volatile("rep ; movsl"
222+
: "=&c"(count), "=&D"(to), "=&S"(from)
223+
: "0"(count), "1"(to), "2"(from)
224+
: "memory");
225+
}
226+
#define __iowrite32_copy __iowrite32_copy
227+
#endif
228+
212229
/*
213230
* ISA space is 'always mapped' on a typical x86 system, no need to
214231
* explicitly ioremap() it. The fact that the ISA IO space is mapped

arch/x86/lib/Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
5353
lib-y += atomic64_386_32.o
5454
endif
5555
else
56-
obj-y += iomap_copy_64.o
5756
ifneq ($(CONFIG_GENERIC_CSUM),y)
5857
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
5958
endif

arch/x86/lib/iomap_copy_64.S

Lines changed: 0 additions & 15 deletions
This file was deleted.

drivers/infiniband/core/cma.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,8 +715,10 @@ cma_validate_port(struct ib_device *device, u32 port,
715715
rcu_read_lock();
716716
ndev = rcu_dereference(sgid_attr->ndev);
717717
if (!net_eq(dev_net(ndev), dev_addr->net) ||
718-
ndev->ifindex != bound_if_index)
718+
ndev->ifindex != bound_if_index) {
719+
rdma_put_gid_attr(sgid_attr);
719720
sgid_attr = ERR_PTR(-ENODEV);
721+
}
720722
rcu_read_unlock();
721723
goto out;
722724
}

drivers/infiniband/core/device.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2174,8 +2174,7 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
21742174
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
21752175

21762176
add_ndev_hash(pdata);
2177-
if (old_ndev)
2178-
__dev_put(old_ndev);
2177+
__dev_put(old_ndev);
21792178

21802179
return 0;
21812180
}
@@ -2235,8 +2234,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
22352234
spin_lock(&pdata->netdev_lock);
22362235
res = rcu_dereference_protected(
22372236
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2238-
if (res)
2239-
dev_hold(res);
2237+
dev_hold(res);
22402238
spin_unlock(&pdata->netdev_lock);
22412239
}
22422240

@@ -2311,9 +2309,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
23112309

23122310
if (filter(ib_dev, port, idev, filter_cookie))
23132311
cb(ib_dev, port, idev, cookie);
2314-
2315-
if (idev)
2316-
dev_put(idev);
2312+
dev_put(idev);
23172313
}
23182314
}
23192315

drivers/infiniband/core/lag.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
9393
slave = netdev_get_xmit_slave(master, skb,
9494
!!(device->lag_flags &
9595
RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
96-
if (slave)
97-
dev_hold(slave);
96+
dev_hold(slave);
9897
rcu_read_unlock();
9998
kfree_skb(skb);
10099
return slave;

0 commit comments

Comments
 (0)