Skip to content

Commit 991d98f

Browse files
djbwdavejiang
authored and committed
cxl: Make cxl_dpa_alloc() DPA partition number agnostic
cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM allocations being distinct from RAM allocations in specific ways when in practice the allocation rules are only relative to DPA partition index. The rules for cxl_dpa_alloc() are: - allocations can only come from 1 partition - if allocating at partition-index-N, all free space in partitions less than partition-index-N must be skipped over Use the new 'struct cxl_dpa_partition' array to support allocation with an arbitrary number of DPA partitions on the device. A follow-on patch can go further to cleanup 'enum cxl_decoder_mode' concept and supersede it with looking up the memory properties from partition metadata. Until then cxl_part_mode() temporarily bridges code that looks up partitions by @cxled->mode. Reviewed-by: Ira Weiny <ira.weiny@intel.com> Reviewed-by: Alejandro Lucero <alucerop@amd.com> Reviewed-by: Dave Jiang <dave.jiang@intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Tested-by: Alejandro Lucero <alucerop@amd.com> Link: https://patch.msgid.link/173864306400.668823.12143134425285426523.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang <dave.jiang@intel.com>
1 parent 8e4c411 commit 991d98f

File tree

2 files changed

+156
-57
lines changed

2 files changed

+156
-57
lines changed

drivers/cxl/core/hdm.c

Lines changed: 142 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,37 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
223223
}
224224
EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
225225

226+
/* See request_skip() kernel-doc */
227+
static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds,
228+
const resource_size_t skip_base,
229+
const resource_size_t skip_len,
230+
const char *requester)
231+
{
232+
const resource_size_t skip_end = skip_base + skip_len - 1;
233+
234+
for (int i = 0; i < cxlds->nr_partitions; i++) {
235+
const struct resource *part_res = &cxlds->part[i].res;
236+
resource_size_t adjust_start, adjust_end, size;
237+
238+
adjust_start = max(skip_base, part_res->start);
239+
adjust_end = min(skip_end, part_res->end);
240+
241+
if (adjust_end < adjust_start)
242+
continue;
243+
244+
size = adjust_end - adjust_start + 1;
245+
246+
if (!requester)
247+
__release_region(&cxlds->dpa_res, adjust_start, size);
248+
else if (!__request_region(&cxlds->dpa_res, adjust_start, size,
249+
requester, 0))
250+
return adjust_start - skip_base;
251+
}
252+
253+
return skip_len;
254+
}
255+
#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL)
256+
226257
/*
227258
* Must be called in a context that synchronizes against this decoder's
228259
* port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -241,7 +272,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
241272
skip_start = res->start - cxled->skip;
242273
__release_region(&cxlds->dpa_res, res->start, resource_size(res));
243274
if (cxled->skip)
244-
__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
275+
release_skip(cxlds, skip_start, cxled->skip);
245276
cxled->skip = 0;
246277
cxled->dpa_res = NULL;
247278
put_device(&cxled->cxld.dev);
@@ -268,6 +299,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
268299
__cxl_dpa_release(cxled);
269300
}
270301

302+
/**
303+
* request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree
304+
* @cxlds: CXL.mem device context that parents @cxled
305+
* @cxled: Endpoint decoder establishing new allocation that skips lower DPA
306+
* @skip_base: DPA < start of new DPA allocation (DPAnew)
307+
* @skip_len: @skip_base + @skip_len == DPAnew
308+
*
309+
* DPA 'skip' arises from out-of-sequence DPA allocation events relative
310+
* to free capacity across multiple partitions. It is a wasteful event
311+
* as usable DPA gets thrown away, but if a deployment has, for example,
312+
* a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM
313+
* DPA, the free RAM DPA must be sacrificed to start allocating PMEM.
314+
* See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder
315+
* Protection" for more details.
316+
*
317+
* A 'skip' always covers the last allocated DPA in a previous partition
318+
* to the start of the current partition to allocate. Allocations never
319+
* start in the middle of a partition, and allocations are always
320+
* de-allocated in reverse order (see cxl_dpa_free(), or natural devm
321+
* unwind order from forced in-order allocation).
322+
*
323+
* If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip'
324+
* would always be contained to a single partition. Given
325+
* @cxlds->nr_partitions may be > 2 it results in cases where the 'skip'
326+
* might span "tail capacity of partition[0], all of partition[1], ...,
327+
* all of partition[N-1]" to support allocating from partition[N]. That
328+
* in turn interacts with the partition 'struct resource' boundaries
329+
* within @cxlds->dpa_res whereby 'skip' requests need to be divided by
330+
* partition. I.e. this is a quirk of using a 'struct resource' tree to
331+
* detect range conflicts while also tracking partition boundaries in
332+
* @cxlds->dpa_res.
333+
*/
334+
static int request_skip(struct cxl_dev_state *cxlds,
335+
struct cxl_endpoint_decoder *cxled,
336+
const resource_size_t skip_base,
337+
const resource_size_t skip_len)
338+
{
339+
resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len,
340+
dev_name(&cxled->cxld.dev));
341+
342+
if (skipped == skip_len)
343+
return 0;
344+
345+
dev_dbg(cxlds->dev,
346+
"%s: failed to reserve skipped space (%pa %pa %pa)\n",
347+
dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped);
348+
349+
release_skip(cxlds, skip_base, skipped);
350+
351+
return -EBUSY;
352+
}
353+
271354
static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
272355
resource_size_t base, resource_size_t len,
273356
resource_size_t skipped)
@@ -276,7 +359,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
276359
struct cxl_port *port = cxled_to_port(cxled);
277360
struct cxl_dev_state *cxlds = cxlmd->cxlds;
278361
struct device *dev = &port->dev;
362+
enum cxl_decoder_mode mode;
279363
struct resource *res;
364+
int rc;
280365

281366
lockdep_assert_held_write(&cxl_dpa_rwsem);
282367

@@ -305,37 +390,33 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
305390
}
306391

307392
if (skipped) {
308-
res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
309-
dev_name(&cxled->cxld.dev), 0);
310-
if (!res) {
311-
dev_dbg(dev,
312-
"decoder%d.%d: failed to reserve skipped space\n",
313-
port->id, cxled->cxld.id);
314-
return -EBUSY;
315-
}
393+
rc = request_skip(cxlds, cxled, base - skipped, skipped);
394+
if (rc)
395+
return rc;
316396
}
317397
res = __request_region(&cxlds->dpa_res, base, len,
318398
dev_name(&cxled->cxld.dev), 0);
319399
if (!res) {
320400
dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
321401
port->id, cxled->cxld.id);
322402
if (skipped)
323-
__release_region(&cxlds->dpa_res, base - skipped,
324-
skipped);
403+
release_skip(cxlds, base - skipped, skipped);
325404
return -EBUSY;
326405
}
327406
cxled->dpa_res = res;
328407
cxled->skip = skipped;
329408

330-
if (to_pmem_res(cxlds) && resource_contains(to_pmem_res(cxlds), res))
331-
cxled->mode = CXL_DECODER_PMEM;
332-
else if (to_ram_res(cxlds) && resource_contains(to_ram_res(cxlds), res))
333-
cxled->mode = CXL_DECODER_RAM;
334-
else {
409+
mode = CXL_DECODER_NONE;
410+
for (int i = 0; i < cxlds->nr_partitions; i++)
411+
if (resource_contains(&cxlds->part[i].res, res)) {
412+
mode = cxl_part_mode(cxlds->part[i].mode);
413+
break;
414+
}
415+
416+
if (mode == CXL_DECODER_NONE)
335417
dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
336418
port->id, cxled->cxld.id, res);
337-
cxled->mode = CXL_DECODER_NONE;
338-
}
419+
cxled->mode = mode;
339420

340421
port->hdm_end++;
341422
get_device(&cxled->cxld.dev);
@@ -542,15 +623,13 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
542623
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
543624
{
544625
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
545-
resource_size_t free_ram_start, free_pmem_start;
546626
struct cxl_port *port = cxled_to_port(cxled);
547627
struct cxl_dev_state *cxlds = cxlmd->cxlds;
548628
struct device *dev = &cxled->cxld.dev;
549-
resource_size_t start, avail, skip;
629+
struct resource *res, *prev = NULL;
630+
resource_size_t start, avail, skip, skip_start;
550631
struct resource *p, *last;
551-
const struct resource *ram_res = to_ram_res(cxlds);
552-
const struct resource *pmem_res = to_pmem_res(cxlds);
553-
int rc;
632+
int part, rc;
554633

555634
down_write(&cxl_dpa_rwsem);
556635
if (cxled->cxld.region) {
@@ -566,47 +645,53 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
566645
goto out;
567646
}
568647

569-
for (p = ram_res->child, last = NULL; p; p = p->sibling)
570-
last = p;
571-
if (last)
572-
free_ram_start = last->end + 1;
573-
else
574-
free_ram_start = ram_res->start;
648+
part = -1;
649+
for (int i = 0; i < cxlds->nr_partitions; i++) {
650+
if (cxled->mode == cxl_part_mode(cxlds->part[i].mode)) {
651+
part = i;
652+
break;
653+
}
654+
}
655+
656+
if (part < 0) {
657+
rc = -EBUSY;
658+
goto out;
659+
}
575660

576-
for (p = pmem_res->child, last = NULL; p; p = p->sibling)
661+
res = &cxlds->part[part].res;
662+
for (p = res->child, last = NULL; p; p = p->sibling)
577663
last = p;
578664
if (last)
579-
free_pmem_start = last->end + 1;
665+
start = last->end + 1;
580666
else
581-
free_pmem_start = pmem_res->start;
667+
start = res->start;
582668

583-
if (cxled->mode == CXL_DECODER_RAM) {
584-
start = free_ram_start;
585-
avail = ram_res->end - start + 1;
586-
skip = 0;
587-
} else if (cxled->mode == CXL_DECODER_PMEM) {
588-
resource_size_t skip_start, skip_end;
589-
590-
start = free_pmem_start;
591-
avail = pmem_res->end - start + 1;
592-
skip_start = free_ram_start;
593-
594-
/*
595-
* If some pmem is already allocated, then that allocation
596-
* already handled the skip.
597-
*/
598-
if (pmem_res->child &&
599-
skip_start == pmem_res->child->start)
600-
skip_end = skip_start - 1;
601-
else
602-
skip_end = start - 1;
603-
skip = skip_end - skip_start + 1;
604-
} else {
605-
dev_dbg(dev, "mode not set\n");
606-
rc = -EINVAL;
607-
goto out;
669+
/*
670+
* To allocate at partition N, a skip needs to be calculated for all
671+
* unallocated space at lower partitions indices.
672+
*
673+
* If a partition has any allocations, the search can end because a
674+
* previous cxl_dpa_alloc() invocation is assumed to have accounted for
675+
* all previous partitions.
676+
*/
677+
skip_start = CXL_RESOURCE_NONE;
678+
for (int i = part; i; i--) {
679+
prev = &cxlds->part[i - 1].res;
680+
for (p = prev->child, last = NULL; p; p = p->sibling)
681+
last = p;
682+
if (last) {
683+
skip_start = last->end + 1;
684+
break;
685+
}
686+
skip_start = prev->start;
608687
}
609688

689+
avail = res->end - start + 1;
690+
if (skip_start == CXL_RESOURCE_NONE)
691+
skip = 0;
692+
else
693+
skip = res->start - skip_start;
694+
610695
if (size > avail) {
611696
dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
612697
cxl_decoder_mode_name(cxled->mode), &avail);

drivers/cxl/cxlmem.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,20 @@ static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
529529
return resource_size(res);
530530
}
531531

532+
/*
533+
* Translate the operational mode of memory capacity with the
534+
* operational mode of a decoder
535+
* TODO: kill 'enum cxl_decoder_mode' to obviate this helper
536+
*/
537+
static inline enum cxl_decoder_mode cxl_part_mode(enum cxl_partition_mode mode)
538+
{
539+
if (mode == CXL_PARTMODE_RAM)
540+
return CXL_DECODER_RAM;
541+
if (mode == CXL_PARTMODE_PMEM)
542+
return CXL_DECODER_PMEM;
543+
return CXL_DECODER_NONE;
544+
}
545+
532546
static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
533547
{
534548
return dev_get_drvdata(cxl_mbox->host);

0 commit comments

Comments
 (0)