Skip to content

Commit 25f4874

Browse files
committed
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "Aside from the usual things this has an arch update for __iowrite64_copy() used by the RDMA drivers. This API was intended to generate large 64 byte MemWr TLPs on PCI. These days most processors had done this by just repeating writel() in a loop. S390 and some new ARM64 designs require a special helper to get this to generate. - Small improvements and fixes for erdma, efa, hfi1, bnxt_re - Fix a UAF crash after module unload on leaking restrack entry - Continue adding full RDMA support in mana with support for EQs, GID's and CQs - Improvements to the mkey cache in mlx5 - DSCP traffic class support in hns and several bug fixes - Cap the maximum number of MADs in the receive queue to avoid OOM - Another batch of rxe bug fixes from large scale testing - __iowrite64_copy() optimizations for write combining MMIO memory - Remove NULL checks before dev_put/hold() - EFA support for receive with immediate - Fix a recent memleaking regression in a cma error path" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (70 commits) RDMA/cma: Fix kmemleak in rdma_core observed during blktests nvme/rdma use siw RDMA/IPoIB: Fix format truncation compilation errors bnxt_re: avoid shift undefined behavior in bnxt_qplib_alloc_init_hwq RDMA/efa: Support QP with unsolicited write w/ imm. receive IB/hfi1: Remove generic .ndo_get_stats64 IB/hfi1: Do not use custom stat allocator RDMA/hfi1: Use RMW accessors for changing LNKCTL2 RDMA/mana_ib: implement uapi for creation of rnic cq RDMA/mana_ib: boundary check before installing cq callbacks RDMA/mana_ib: introduce a helper to remove cq callbacks RDMA/mana_ib: create and destroy RNIC cqs RDMA/mana_ib: create EQs for RNIC CQs RDMA/core: Remove NULL check before dev_{put, hold} RDMA/ipoib: Remove NULL check before dev_{put, hold} RDMA/mlx5: Remove NULL check before dev_{put, hold} RDMA/mlx5: Track DCT, DCI and REG_UMR QPs as diver_detail resources. 
RDMA/core: Add an option to display driver-specific QPs in the rdmatool RDMA/efa: Add shutdown notifier RDMA/mana_ib: Fix missing ret value IB/mlx5: Use __iowrite64_copy() for write combining stores ...
2 parents 56172ac + 9c07318 commit 25f4874

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

77 files changed

+1584
-768
lines changed

arch/arm64/include/asm/io.h

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
139139
#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l))
140140
#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l))
141141

142+
/*
 * The ARM64 iowrite implementation is intended to support drivers that want to
 * use write combining. For instance PCI drivers using write combining with a 64
 * byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus.
 *
 * Newer ARM core have sensitive write combining buffers, it is important that
 * the stores be contiguous blocks of store instructions. Normal memcpy
 * approaches have a very low chance to generate write combining.
 *
 * Since this is the only API on ARM64 that should be used with write combining
 * it also integrates the DGH hint which is supposed to lower the latency to
 * emit the large TLP from the CPU.
 */

/*
 * Emit an unrolled, back-to-back run of 32-bit str instructions so the
 * write-combining buffer sees one contiguous block of stores.
 * count must be a compile-time constant and one of 8, 4, 2 or 1;
 * anything else trips BUILD_BUG() below.
 */
static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to,
						 const u32 *from, size_t count)
{
	switch (count) {
	case 8:
		/* %0..%7 are the data words ("rZ" allows the zero register), %8 is the base */
		asm volatile("str %w0, [%8, #4 * 0]\n"
			     "str %w1, [%8, #4 * 1]\n"
			     "str %w2, [%8, #4 * 2]\n"
			     "str %w3, [%8, #4 * 3]\n"
			     "str %w4, [%8, #4 * 4]\n"
			     "str %w5, [%8, #4 * 5]\n"
			     "str %w6, [%8, #4 * 6]\n"
			     "str %w7, [%8, #4 * 7]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
	case 4:
		asm volatile("str %w0, [%4, #4 * 0]\n"
			     "str %w1, [%4, #4 * 1]\n"
			     "str %w2, [%4, #4 * 2]\n"
			     "str %w3, [%4, #4 * 3]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "r"(to));
		break;
	case 2:
		asm volatile("str %w0, [%2, #4 * 0]\n"
			     "str %w1, [%2, #4 * 1]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
	case 1:
		/* A single store needs no unrolling */
		__raw_writel(*from, to);
		break;
	default:
		/* Compile-time error for unsupported constant counts */
		BUILD_BUG();
	}
}
196+
197+
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count);

/*
 * Constant-count fast path for __iowrite32_copy(): when count is one of
 * the sizes the unrolled store block supports, emit it followed by the
 * DGH hint; any other count is handed to the out-of-line implementation.
 */
static inline void __const_iowrite32_copy(void __iomem *to, const void *from,
					  size_t count)
{
	if (count != 8 && count != 4 && count != 2 && count != 1) {
		__iowrite32_copy_full(to, from, count);
		return;
	}
	__const_memcpy_toio_aligned32(to, from, count);
	dgh();
}
209+
210+
/*
 * Route compile-time-constant counts to the unrolled inline fast path;
 * runtime counts go to the out-of-line copy.
 */
#define __iowrite32_copy(to, from, count)                  \
	(__builtin_constant_p(count) ?                     \
		 __const_iowrite32_copy(to, from, count) : \
		 __iowrite32_copy_full(to, from, count))
214+
215+
/*
 * 64-bit counterpart of __const_memcpy_toio_aligned32(): an unrolled,
 * contiguous run of 64-bit str instructions for write-combining MMIO.
 * count must be a compile-time constant and one of 8, 4, 2 or 1.
 */
static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to,
						 const u64 *from, size_t count)
{
	switch (count) {
	case 8:
		/* %0..%7 are the data quadwords ("rZ" allows the zero register), %8 is the base */
		asm volatile("str %x0, [%8, #8 * 0]\n"
			     "str %x1, [%8, #8 * 1]\n"
			     "str %x2, [%8, #8 * 2]\n"
			     "str %x3, [%8, #8 * 3]\n"
			     "str %x4, [%8, #8 * 4]\n"
			     "str %x5, [%8, #8 * 5]\n"
			     "str %x6, [%8, #8 * 6]\n"
			     "str %x7, [%8, #8 * 7]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
	case 4:
		asm volatile("str %x0, [%4, #8 * 0]\n"
			     "str %x1, [%4, #8 * 1]\n"
			     "str %x2, [%4, #8 * 2]\n"
			     "str %x3, [%4, #8 * 3]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
			       "rZ"(from[3]), "r"(to));
		break;
	case 2:
		asm volatile("str %x0, [%2, #8 * 0]\n"
			     "str %x1, [%2, #8 * 1]\n"
			     :
			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
	case 1:
		/* A single store needs no unrolling */
		__raw_writeq(*from, to);
		break;
	default:
		/* Compile-time error for unsupported constant counts */
		BUILD_BUG();
	}
}
255+
256+
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);

/*
 * Constant-count fast path for __iowrite64_copy(): when count is one of
 * the sizes the unrolled store block supports, emit it followed by the
 * DGH hint; any other count is handed to the out-of-line implementation.
 */
static inline void __const_iowrite64_copy(void __iomem *to, const void *from,
					  size_t count)
{
	if (count != 8 && count != 4 && count != 2 && count != 1) {
		__iowrite64_copy_full(to, from, count);
		return;
	}
	__const_memcpy_toio_aligned64(to, from, count);
	dgh();
}
268+
269+
/*
 * Route compile-time-constant counts to the unrolled inline fast path;
 * runtime counts go to the out-of-line copy.
 */
#define __iowrite64_copy(to, from, count)                  \
	(__builtin_constant_p(count) ?                     \
		 __const_iowrite64_copy(to, from, count) : \
		 __iowrite64_copy_full(to, from, count))
273+
142274
/*
143275
* I/O memory mapping functions.
144276
*/

arch/arm64/kernel/io.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
3737
}
3838
EXPORT_SYMBOL(__memcpy_fromio);
3939

40+
/*
 * This generates a memcpy that works on a from/to address which is aligned to
 * bits. Count is in terms of the number of bits sized quantities to copy. It
 * optimizes to use the STR groupings when possible so that it is WC friendly.
 */
#define memcpy_toio_aligned(to, from, count, bits)                        \
	({                                                                \
		volatile u##bits __iomem *_to = to;                       \
		const u##bits *_from = from;                              \
		size_t _count = count;                                    \
		const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
									  \
		/* Bulk of the copy: unrolled groups of 8 stores */       \
		for (; _from < _end_from; _from += 8, _to += 8)           \
			__const_memcpy_toio_aligned##bits(_to, _from, 8); \
		/* Tail: at most one each of a 4-, 2- and 1-wide group */ \
		if ((_count % 8) >= 4) {                                  \
			__const_memcpy_toio_aligned##bits(_to, _from, 4); \
			_from += 4;                                       \
			_to += 4;                                         \
		}                                                         \
		if ((_count % 4) >= 2) {                                  \
			__const_memcpy_toio_aligned##bits(_to, _from, 2); \
			_from += 2;                                       \
			_to += 2;                                         \
		}                                                         \
		if (_count % 2)                                           \
			__const_memcpy_toio_aligned##bits(_to, _from, 1); \
	})
67+
68+
/*
 * Out-of-line copy of count 64-bit quantities to MMIO, built from the
 * WC-friendly STR groupings, finished with a DGH hint.
 */
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
{
	memcpy_toio_aligned(to, from, count, 64);
	dgh();
}
EXPORT_SYMBOL(__iowrite64_copy_full);
74+
75+
/*
 * Out-of-line copy of count 32-bit quantities to MMIO, built from the
 * WC-friendly STR groupings, finished with a DGH hint.
 */
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
{
	memcpy_toio_aligned(to, from, count, 32);
	dgh();
}
EXPORT_SYMBOL(__iowrite32_copy_full);
81+
4082
/*
4183
* Copy data from "real" memory space to IO memory space.
4284
*/

arch/s390/include/asm/io.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,21 @@ static inline void ioport_unmap(void __iomem *p)
7373
#define __raw_writel zpci_write_u32
7474
#define __raw_writeq zpci_write_u64
7575

76+
/* combine single writes by using store-block insn */
static inline void __iowrite32_copy(void __iomem *to, const void *from,
				    size_t count)
{
	/* count is in 32-bit words; zpci_memcpy_toio() takes bytes */
	zpci_memcpy_toio(to, from, count * 4);
}
#define __iowrite32_copy __iowrite32_copy

static inline void __iowrite64_copy(void __iomem *to, const void *from,
				    size_t count)
{
	/* count is in 64-bit words; zpci_memcpy_toio() takes bytes */
	zpci_memcpy_toio(to, from, count * 8);
}
#define __iowrite64_copy __iowrite64_copy
90+
7691
#endif /* CONFIG_PCI */
7792

7893
#include <asm-generic/io.h>

arch/s390/pci/pci.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
250250
return 0;
251251
}
252252

253-
/* combine single writes by using store-block insn */
254-
void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
255-
{
256-
zpci_memcpy_toio(to, from, count * 8);
257-
}
258-
259253
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
260254
unsigned long prot)
261255
{

arch/x86/include/asm/io.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t);
209209
#define memcpy_toio memcpy_toio
210210
#define memset_io memset_io
211211

212+
#ifdef CONFIG_X86_64
213+
/*
214+
* Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
215+
* x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
216+
* loop.
217+
*/
218+
static inline void __iowrite32_copy(void __iomem *to, const void *from,
219+
size_t count)
220+
{
221+
asm volatile("rep ; movsl"
222+
: "=&c"(count), "=&D"(to), "=&S"(from)
223+
: "0"(count), "1"(to), "2"(from)
224+
: "memory");
225+
}
226+
#define __iowrite32_copy __iowrite32_copy
227+
#endif
228+
212229
/*
213230
* ISA space is 'always mapped' on a typical x86 system, no need to
214231
* explicitly ioremap() it. The fact that the ISA IO space is mapped

arch/x86/lib/Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
5353
lib-y += atomic64_386_32.o
5454
endif
5555
else
56-
obj-y += iomap_copy_64.o
5756
ifneq ($(CONFIG_GENERIC_CSUM),y)
5857
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
5958
endif

arch/x86/lib/iomap_copy_64.S

Lines changed: 0 additions & 15 deletions
This file was deleted.

drivers/infiniband/core/cma.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,8 +715,10 @@ cma_validate_port(struct ib_device *device, u32 port,
715715
rcu_read_lock();
716716
ndev = rcu_dereference(sgid_attr->ndev);
717717
if (!net_eq(dev_net(ndev), dev_addr->net) ||
718-
ndev->ifindex != bound_if_index)
718+
ndev->ifindex != bound_if_index) {
719+
rdma_put_gid_attr(sgid_attr);
719720
sgid_attr = ERR_PTR(-ENODEV);
721+
}
720722
rcu_read_unlock();
721723
goto out;
722724
}

drivers/infiniband/core/device.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2174,8 +2174,7 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
21742174
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
21752175

21762176
add_ndev_hash(pdata);
2177-
if (old_ndev)
2178-
__dev_put(old_ndev);
2177+
__dev_put(old_ndev);
21792178

21802179
return 0;
21812180
}
@@ -2235,8 +2234,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
22352234
spin_lock(&pdata->netdev_lock);
22362235
res = rcu_dereference_protected(
22372236
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2238-
if (res)
2239-
dev_hold(res);
2237+
dev_hold(res);
22402238
spin_unlock(&pdata->netdev_lock);
22412239
}
22422240

@@ -2311,9 +2309,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
23112309

23122310
if (filter(ib_dev, port, idev, filter_cookie))
23132311
cb(ib_dev, port, idev, cookie);
2314-
2315-
if (idev)
2316-
dev_put(idev);
2312+
dev_put(idev);
23172313
}
23182314
}
23192315

drivers/infiniband/core/lag.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
9393
slave = netdev_get_xmit_slave(master, skb,
9494
!!(device->lag_flags &
9595
RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
96-
if (slave)
97-
dev_hold(slave);
96+
dev_hold(slave);
9897
rcu_read_unlock();
9998
kfree_skb(skb);
10099
return slave;

0 commit comments

Comments
 (0)