
Commit 8974efa

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma fixes from Jason Gunthorpe:

 - Several hfi1 patches fixing some long standing driver bugs

 - Overflow when working with sg lists with elements greater than 4G

 - An rxe regression with object numbering after the mrs reach their limit

 - A theoretical problem with the scatterlist merging code

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  lib/scatterlist: Fix to calculate the last_pg properly
  IB/hfi1: Remove user expected buffer invalidate race
  IB/hfi1: Immediately remove invalid memory from hardware
  IB/hfi1: Fix expected receive setup error exit issues
  IB/hfi1: Reserve user expected TIDs
  IB/hfi1: Reject a zero-length user expected buffer
  RDMA/core: Fix ib block iterator counter overflow
  RDMA/rxe: Prevent faulty rkey generation
  RDMA/rxe: Fix inaccurate constants in rxe_type_info
2 parents edc0035 + 0f097f0 commit 8974efa

6 files changed: +179 -88 lines changed

drivers/infiniband/core/verbs.c

Lines changed: 5 additions & 2 deletions
@@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
 bool __rdma_block_iter_next(struct ib_block_iter *biter)
 {
 	unsigned int block_offset;
+	unsigned int sg_delta;
 
 	if (!biter->__sg_nents || !biter->__sg)
 		return false;
 
 	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
 	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
-	biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+	sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
 
-	if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
+	if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
+		biter->__sg_advance += sg_delta;
+	} else {
 		biter->__sg_advance = 0;
 		biter->__sg = sg_next(biter->__sg);
 		biter->__sg_nents--;
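Note: the rewritten check compares the bytes remaining in the current scatterlist element against the step size instead of advancing first and comparing afterwards, so the addition can no longer wrap the advance counter on elements approaching 4G. The standalone snippet below only illustrates that pattern; the names (len, advance, delta) are stand-ins for sg_dma_len(), __sg_advance and sg_delta, not kernel code.

/* Illustration only, not kernel code: contrast the wrap-prone check with
 * the overflow-safe "remaining versus step" check used in the hunk above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool naive_still_inside(uint32_t len, uint32_t advance, uint32_t delta)
{
	uint32_t advanced = advance + delta;	/* may wrap near 4G */

	return advanced < len;
}

static bool fixed_still_inside(uint32_t len, uint32_t advance, uint32_t delta)
{
	/* advance < len holds on entry, so the subtraction cannot wrap */
	return len - advance > delta;
}

int main(void)
{
	uint32_t len = UINT32_MAX;		/* element just under 4G */
	uint32_t advance = UINT32_MAX - 8;	/* nearly at the end */
	uint32_t delta = 4096;			/* one block step */

	printf("naive: %d (wrapped sum wrongly reports we are still inside)\n",
	       naive_still_inside(len, advance, delta));
	printf("fixed: %d (correctly reports the element is exhausted)\n",
	       fixed_still_inside(len, advance, delta));
	return 0;
}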

drivers/infiniband/hw/hfi1/user_exp_rcv.c

Lines changed: 140 additions & 60 deletions
@@ -23,18 +23,25 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 			      const struct mmu_notifier_range *range,
 			      unsigned long cur_seq);
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq);
 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
 			    struct tid_group *grp,
 			    unsigned int start, u16 count,
 			    u32 *tidlist, unsigned int *tididx,
 			    unsigned int *pmapped);
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
-			      struct tid_group **grp);
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
+static void __clear_tid_node(struct hfi1_filedata *fd,
+			     struct tid_rb_node *node);
 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
 
 static const struct mmu_interval_notifier_ops tid_mn_ops = {
 	.invalidate = tid_rb_invalidate,
 };
+static const struct mmu_interval_notifier_ops tid_cover_ops = {
+	.invalidate = tid_cover_invalidate,
+};
 
 /*
  * Initialize context and file private data needed for Expected
@@ -253,53 +260,65 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 		   tididx = 0, mapped, mapped_pages = 0;
 	u32 *tidlist = NULL;
 	struct tid_user_buf *tidbuf;
+	unsigned long mmu_seq = 0;
 
 	if (!PAGE_ALIGNED(tinfo->vaddr))
 		return -EINVAL;
+	if (tinfo->length == 0)
+		return -EINVAL;
 
 	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
 	if (!tidbuf)
 		return -ENOMEM;
 
+	mutex_init(&tidbuf->cover_mutex);
 	tidbuf->vaddr = tinfo->vaddr;
 	tidbuf->length = tinfo->length;
 	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
 				GFP_KERNEL);
 	if (!tidbuf->psets) {
-		kfree(tidbuf);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail_release_mem;
+	}
+
+	if (fd->use_mn) {
+		ret = mmu_interval_notifier_insert(
+			&tidbuf->notifier, current->mm,
+			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
+			&tid_cover_ops);
+		if (ret)
+			goto fail_release_mem;
+		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
 	}
 
 	pinned = pin_rcv_pages(fd, tidbuf);
 	if (pinned <= 0) {
-		kfree(tidbuf->psets);
-		kfree(tidbuf);
-		return pinned;
+		ret = (pinned < 0) ? pinned : -ENOSPC;
+		goto fail_unpin;
 	}
 
 	/* Find sets of physically contiguous pages */
 	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
 
-	/*
-	 * We don't need to access this under a lock since tid_used is per
-	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
-	 * and hfi1_user_exp_rcv_setup() at the same time.
-	 */
+	/* Reserve the number of expected tids to be used. */
 	spin_lock(&fd->tid_lock);
 	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
 		pageset_count = fd->tid_limit - fd->tid_used;
 	else
 		pageset_count = tidbuf->n_psets;
+	fd->tid_used += pageset_count;
 	spin_unlock(&fd->tid_lock);
 
-	if (!pageset_count)
-		goto bail;
+	if (!pageset_count) {
+		ret = -ENOSPC;
+		goto fail_unreserve;
+	}
 
 	ngroups = pageset_count / dd->rcv_entries.group_size;
 	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 	if (!tidlist) {
 		ret = -ENOMEM;
-		goto nomem;
+		goto fail_unreserve;
 	}
 
 	tididx = 0;
@@ -395,43 +414,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 	}
 unlock:
 	mutex_unlock(&uctxt->exp_mutex);
-nomem:
 	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 		  mapped_pages, ret);
-	if (tididx) {
-		spin_lock(&fd->tid_lock);
-		fd->tid_used += tididx;
-		spin_unlock(&fd->tid_lock);
-		tinfo->tidcnt = tididx;
-		tinfo->length = mapped_pages * PAGE_SIZE;
-
-		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
-				 tidlist, sizeof(tidlist[0]) * tididx)) {
-			/*
-			 * On failure to copy to the user level, we need to undo
-			 * everything done so far so we don't leak resources.
-			 */
-			tinfo->tidlist = (unsigned long)&tidlist;
-			hfi1_user_exp_rcv_clear(fd, tinfo);
-			tinfo->tidlist = 0;
-			ret = -EFAULT;
-			goto bail;
+
+	/* fail if nothing was programmed, set error if none provided */
+	if (tididx == 0) {
+		if (ret >= 0)
+			ret = -ENOSPC;
+		goto fail_unreserve;
+	}
+
+	/* adjust reserved tid_used to actual count */
+	spin_lock(&fd->tid_lock);
+	fd->tid_used -= pageset_count - tididx;
+	spin_unlock(&fd->tid_lock);
+
+	/* unpin all pages not covered by a TID */
+	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
+			false);
+
+	if (fd->use_mn) {
+		/* check for an invalidate during setup */
+		bool fail = false;
+
+		mutex_lock(&tidbuf->cover_mutex);
+		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
+		mutex_unlock(&tidbuf->cover_mutex);
+
+		if (fail) {
+			ret = -EBUSY;
+			goto fail_unprogram;
 		}
 	}
 
-	/*
-	 * If not everything was mapped (due to insufficient RcvArray entries,
-	 * for example), unpin all unmapped pages so we can pin them nex time.
-	 */
-	if (mapped_pages != pinned)
-		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
-				(pinned - mapped_pages), false);
-bail:
+	tinfo->tidcnt = tididx;
+	tinfo->length = mapped_pages * PAGE_SIZE;
+
+	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
+			 tidlist, sizeof(tidlist[0]) * tididx)) {
+		ret = -EFAULT;
+		goto fail_unprogram;
+	}
+
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&tidbuf->notifier);
+	kfree(tidbuf->pages);
 	kfree(tidbuf->psets);
+	kfree(tidbuf);
 	kfree(tidlist);
+	return 0;
+
+fail_unprogram:
+	/* unprogram, unmap, and unpin all allocated TIDs */
+	tinfo->tidlist = (unsigned long)tidlist;
+	hfi1_user_exp_rcv_clear(fd, tinfo);
+	tinfo->tidlist = 0;
+	pinned = 0;		/* nothing left to unpin */
+	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
+	spin_lock(&fd->tid_lock);
+	fd->tid_used -= pageset_count;
+	spin_unlock(&fd->tid_lock);
fail_unpin:
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&tidbuf->notifier);
+	if (pinned > 0)
+		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
 	kfree(tidbuf->pages);
+	kfree(tidbuf->psets);
 	kfree(tidbuf);
-	return ret > 0 ? 0 : ret;
+	kfree(tidlist);
+	return ret;
 }
 
 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@@ -452,7 +506,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 
 	mutex_lock(&uctxt->exp_mutex);
 	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
-		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
+		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
 		if (ret) {
 			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 				  ret);
@@ -706,6 +760,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 	}
 
 	node->fdata = fd;
+	mutex_init(&node->invalidate_mutex);
 	node->phys = page_to_phys(pages[0]);
 	node->npages = npages;
 	node->rcventry = rcventry;
@@ -721,11 +776,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 					    &tid_mn_ops);
 		if (ret)
 			goto out_unmap;
-		/*
-		 * FIXME: This is in the wrong order, the notifier should be
-		 * established before the pages are pinned by pin_rcv_pages.
-		 */
-		mmu_interval_read_begin(&node->notifier);
 	}
 	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
 
@@ -745,8 +795,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 	return -EFAULT;
 }
 
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
-			      struct tid_group **grp)
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
@@ -769,33 +818,41 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
 	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 		return -EBADF;
 
-	if (grp)
-		*grp = node->grp;
-
 	if (fd->use_mn)
 		mmu_interval_notifier_remove(&node->notifier);
 	cacheless_tid_rb_remove(fd, node);
 
 	return 0;
 }
 
-static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 
+	mutex_lock(&node->invalidate_mutex);
+	if (node->freed)
+		goto done;
+	node->freed = true;
+
 	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
 				 node->npages,
 				 node->notifier.interval_tree.start, node->phys,
 				 node->dma_addr);
 
-	/*
-	 * Make sure device has seen the write before we unpin the
-	 * pages.
-	 */
+	/* Make sure device has seen the write before pages are unpinned */
 	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 
 	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
+done:
+	mutex_unlock(&node->invalidate_mutex);
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+{
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+	__clear_tid_node(fd, node);
 
 	node->grp->used--;
 	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@@ -854,10 +911,16 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 	if (node->freed)
 		return true;
 
+	/* take action only if unmapping */
+	if (range->event != MMU_NOTIFY_UNMAP)
+		return true;
+
 	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
 				 node->notifier.interval_tree.start,
 				 node->rcventry, node->npages, node->dma_addr);
-	node->freed = true;
+
+	/* clear the hardware rcvarray entry */
+	__clear_tid_node(fdata, node);
 
 	spin_lock(&fdata->invalid_lock);
 	if (fdata->invalid_tid_idx < uctxt->expected_count) {
@@ -887,6 +950,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 	return true;
 }
 
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq)
+{
+	struct tid_user_buf *tidbuf =
+		container_of(mni, struct tid_user_buf, notifier);
+
+	/* take action only if unmapping */
+	if (range->event == MMU_NOTIFY_UNMAP) {
+		mutex_lock(&tidbuf->cover_mutex);
+		mmu_interval_set_seq(mni, cur_seq);
+		mutex_unlock(&tidbuf->cover_mutex);
+	}
+
+	return true;
+}
+
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 				    struct tid_rb_node *tnode)
 {
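Note: the cover notifier added above follows a record/invalidate/retry sequence: setup snapshots a sequence number before pinning, tid_cover_invalidate() bumps it under cover_mutex when an unmap occurs, and setup re-checks the sequence after programming and unwinds with -EBUSY if it changed. The following is a minimal userspace sketch of that general pattern using plain pthreads and invented names; it is not the mmu_interval_notifier API itself.

/* Sketch only: mimics the record/invalidate/retry flow with a plain mutex
 * and counter. The real code uses mmu_interval_read_begin(),
 * mmu_interval_set_seq() and mmu_interval_read_retry(). */
#include <pthread.h>
#include <stdbool.h>

struct cover {				/* stand-in for struct tid_user_buf */
	pthread_mutex_t lock;		/* plays the role of cover_mutex */
	unsigned long seq;		/* plays the role of the notifier sequence */
};

static unsigned long cover_read_begin(struct cover *c)
{
	return c->seq;			/* snapshot taken before pinning pages */
}

static void cover_invalidate(struct cover *c)
{
	pthread_mutex_lock(&c->lock);
	c->seq++;			/* an unmap raced with the setup path */
	pthread_mutex_unlock(&c->lock);
}

static bool cover_read_retry(struct cover *c, unsigned long snap)
{
	bool changed;

	pthread_mutex_lock(&c->lock);
	changed = (c->seq != snap);	/* true => tear everything down (-EBUSY) */
	pthread_mutex_unlock(&c->lock);
	return changed;
}

int main(void)
{
	struct cover c = { PTHREAD_MUTEX_INITIALIZER, 0 };
	unsigned long snap = cover_read_begin(&c);

	cover_invalidate(&c);		/* pretend an unmap happened mid-setup */
	return cover_read_retry(&c, snap) ? 1 : 0;	/* 1: setup must be undone */
}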

drivers/infiniband/hw/hfi1/user_exp_rcv.h

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,8 @@ struct tid_pageset {
 };
 
 struct tid_user_buf {
+	struct mmu_interval_notifier notifier;
+	struct mutex cover_mutex;
 	unsigned long vaddr;
 	unsigned long length;
 	unsigned int npages;
@@ -27,6 +29,7 @@ struct tid_user_buf {
 struct tid_rb_node {
 	struct mmu_interval_notifier notifier;
 	struct hfi1_filedata *fdata;
+	struct mutex invalidate_mutex; /* covers hw removal */
 	unsigned long phys;
 	struct tid_group *grp;
 	u32 rcventry;

drivers/infiniband/sw/rxe/rxe_param.h

Lines changed: 5 additions & 5 deletions
@@ -98,11 +98,11 @@ enum rxe_device_param {
 	RXE_MAX_SRQ			= DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX,
 
 	RXE_MIN_MR_INDEX		= 0x00000001,
-	RXE_MAX_MR_INDEX		= DEFAULT_MAX_VALUE,
-	RXE_MAX_MR			= DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX,
-	RXE_MIN_MW_INDEX		= 0x00010001,
-	RXE_MAX_MW_INDEX		= 0x00020000,
-	RXE_MAX_MW			= 0x00001000,
+	RXE_MAX_MR_INDEX		= DEFAULT_MAX_VALUE >> 1,
+	RXE_MAX_MR			= RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX,
+	RXE_MIN_MW_INDEX		= RXE_MAX_MR_INDEX + 1,
+	RXE_MAX_MW_INDEX		= DEFAULT_MAX_VALUE,
+	RXE_MAX_MW			= RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX,
 
 	RXE_MAX_PKT_PER_ACK		= 64,
