Skip to content

Commit 979c634

Browse files
committed
nvme-pci: add support for sgl metadata
Supporting this mode allows creating and merging multi-segment metadata requests that wouldn't be possible otherwise. It also allows directly using user space requests that straddle physically discontiguous pages.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
1 parent 5dd18f0 commit 979c634

File tree

3 files changed

+137
-15
lines changed

3 files changed

+137
-15
lines changed

drivers/nvme/host/nvme.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,13 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
11261126
return ctrl->sgls & ((1 << 0) | (1 << 1));
11271127
}
11281128

1129+
/*
 * Report whether metadata may be described with SGLs on this controller.
 *
 * Returns true unconditionally when the controller ops carry
 * NVME_F_FABRICS; otherwise returns whether the NVME_CTRL_SGLS_MSDS bit is
 * set in ctrl->sgls.
 */
static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
1130+
{
1131+
if (ctrl->ops->flags & NVME_F_FABRICS)
1132+
return true;
1133+
return ctrl->sgls & NVME_CTRL_SGLS_MSDS;
1134+
}
1135+
11291136
#ifdef CONFIG_NVME_HOST_AUTH
11301137
int __init nvme_init_auth(void);
11311138
void __exit nvme_exit_auth(void);

drivers/nvme/host/pci.c

Lines changed: 129 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
*/
4444
#define NVME_MAX_KB_SZ 8192
4545
#define NVME_MAX_SEGS 128
46+
#define NVME_MAX_META_SEGS 15
4647
#define NVME_MAX_NR_ALLOCATIONS 5
4748

4849
static int use_threaded_interrupts;
@@ -144,6 +145,7 @@ struct nvme_dev {
144145
struct sg_table *hmb_sgt;
145146

146147
mempool_t *iod_mempool;
148+
mempool_t *iod_meta_mempool;
147149

148150
/* shadow doorbell buffer support: */
149151
__le32 *dbbuf_dbs;
@@ -239,6 +241,8 @@ struct nvme_iod {
239241
dma_addr_t first_dma;
240242
dma_addr_t meta_dma;
241243
struct sg_table sgt;
244+
struct sg_table meta_sgt;
245+
union nvme_descriptor meta_list;
242246
union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
243247
};
244248

@@ -506,6 +510,14 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
506510
spin_unlock(&nvmeq->sq_lock);
507511
}
508512

513+
/*
 * Decide whether the metadata of @req is mapped through an SGL.
 *
 * Returns false when the controller does not support metadata SGLs.
 * Otherwise an SGL is used only when the request has more than one
 * integrity segment; a single segment returns false.
 */
static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
514+
struct request *req)
515+
{
516+
if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
517+
return false;
518+
return req->nr_integrity_segments > 1;
519+
}
520+
509521
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
510522
int nseg)
511523
{
@@ -518,6 +530,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
518530
return false;
519531
if (!nvmeq->qid)
520532
return false;
533+
if (nvme_pci_metadata_use_sgls(dev, req))
534+
return true;
521535
if (!sgl_threshold || avg_seg_size < sgl_threshold)
522536
return false;
523537
return true;
@@ -780,7 +794,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
780794
struct bio_vec bv = req_bvec(req);
781795

782796
if (!is_pci_p2pdma_page(bv.bv_page)) {
783-
if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
797+
if (!nvme_pci_metadata_use_sgls(dev, req) &&
798+
(bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
784799
bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
785800
return nvme_setup_prp_simple(dev, req,
786801
&cmnd->rw, &bv);
@@ -824,11 +839,69 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
824839
return ret;
825840
}
826841

827-
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
828-
struct nvme_command *cmnd)
842+
/*
 * Map the integrity (metadata) segments of @req for DMA and build the SGL
 * descriptor list that the command's metadata pointer refers to.
 *
 * Resources acquired, in order: a scatterlist from dev->iod_meta_mempool,
 * the DMA mapping of that scatterlist, and a descriptor list from
 * dev->prp_small_pool. On success they are recorded in iod->meta_sgt,
 * iod->meta_list.sg_list and iod->meta_dma.
 *
 * Returns BLK_STS_OK on success. Every failure returns BLK_STS_RESOURCE
 * after releasing whatever had been acquired up to that point.
 */
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
843+
struct request *req)
844+
{
845+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
846+
struct nvme_rw_command *cmnd = &iod->cmd.rw;
847+
struct nvme_sgl_desc *sg_list;
848+
struct scatterlist *sgl, *sg;
849+
unsigned int entries;
850+
dma_addr_t sgl_dma;
851+
int rc, i;
852+
853+
iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
854+
if (!iod->meta_sgt.sgl)
855+
return BLK_STS_RESOURCE;
856+
857+
sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
858+
iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
859+
iod->meta_sgt.sgl);
860+
/* A mapping that produced no entries is treated as a failure. */
if (!iod->meta_sgt.orig_nents)
861+
goto out_free_sg;
862+
863+
rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
864+
DMA_ATTR_NO_WARN);
865+
/* The specific error in rc is not propagated to the caller. */
if (rc)
866+
goto out_free_sg;
867+
868+
/*
 * NOTE(review): assumes one prp_small_pool allocation can hold
 * entries + 1 descriptors; confirm against the pool's element size and
 * NVME_MAX_META_SEGS.
 */
sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
869+
if (!sg_list)
870+
goto out_unmap_sg;
871+
872+
entries = iod->meta_sgt.nents;
873+
iod->meta_list.sg_list = sg_list;
874+
iod->meta_dma = sgl_dma;
875+
876+
/* Point the command's metadata field at the descriptor list. */
cmnd->flags = NVME_CMD_SGL_METASEG;
877+
cmnd->metadata = cpu_to_le64(sgl_dma);
878+
879+
sgl = iod->meta_sgt.sgl;
880+
/* One mapped entry: the first descriptor is the data descriptor. */
if (entries == 1) {
881+
nvme_pci_sgl_set_data(sg_list, sgl);
882+
return BLK_STS_OK;
883+
}
884+
885+
/*
 * Several entries: sg_list[0] becomes a segment descriptor whose address
 * is that of sg_list[1], and the data descriptors fill sg_list[1..entries].
 */
sgl_dma += sizeof(*sg_list);
886+
nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
887+
for_each_sg(sgl, sg, entries, i)
888+
nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
889+
890+
return BLK_STS_OK;
891+
892+
/* Error unwinding: labels fall through in reverse order of acquisition. */
out_unmap_sg:
893+
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
894+
out_free_sg:
895+
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
896+
return BLK_STS_RESOURCE;
897+
}
898+
899+
static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
900+
struct request *req)
829901
{
830902
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
831903
struct bio_vec bv = rq_integrity_vec(req);
904+
struct nvme_command *cmnd = &iod->cmd;
832905

833906
iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
834907
if (dma_mapping_error(dev->dev, iod->meta_dma))
@@ -837,6 +910,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
837910
return BLK_STS_OK;
838911
}
839912

913+
/*
 * Map the metadata of @req, choosing the mapping method.
 *
 * Uses nvme_pci_setup_meta_sgls() when nvme_pci_metadata_use_sgls() says
 * the request qualifies, and nvme_pci_setup_meta_mptr() otherwise. The
 * chosen helper's status is returned unchanged.
 */
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
914+
{
915+
if (nvme_pci_metadata_use_sgls(dev, req))
916+
return nvme_pci_setup_meta_sgls(dev, req);
917+
return nvme_pci_setup_meta_mptr(dev, req);
918+
}
919+
840920
static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
841921
{
842922
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -845,6 +925,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
845925
iod->aborted = false;
846926
iod->nr_allocations = -1;
847927
iod->sgt.nents = 0;
928+
iod->meta_sgt.nents = 0;
848929

849930
ret = nvme_setup_cmd(req->q->queuedata, req);
850931
if (ret)
@@ -857,7 +938,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
857938
}
858939

859940
if (blk_integrity_rq(req)) {
860-
ret = nvme_map_metadata(dev, req, &iod->cmd);
941+
ret = nvme_map_metadata(dev, req);
861942
if (ret)
862943
goto out_unmap_data;
863944
}
@@ -955,17 +1036,31 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
9551036
*rqlist = requeue_list;
9561037
}
9571038

1039+
/*
 * Undo the metadata mapping of @req.
 *
 * iod->meta_sgt.nents selects the teardown path: nvme_prep_rq() zeroes it
 * for every request, so a zero value here means the scatterlist was never
 * DMA-mapped and the single-pointer mapping in iod->meta_dma is unmapped
 * instead. Otherwise the SGL path is torn down.
 */
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
1040+
struct request *req)
1041+
{
1042+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
1043+
1044+
/* Single-pointer path: unmap using the integrity vector's length. */
if (!iod->meta_sgt.nents) {
1045+
dma_unmap_page(dev->dev, iod->meta_dma,
1046+
rq_integrity_vec(req).bv_len,
1047+
rq_dma_dir(req));
1048+
return;
1049+
}
1050+
1051+
/*
 * SGL path: free the descriptor list, unmap the scatterlist, then return
 * the scatterlist memory to its mempool.
 */
dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
1052+
iod->meta_dma);
1053+
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
1054+
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
1055+
}
1056+
9581057
static __always_inline void nvme_pci_unmap_rq(struct request *req)
9591058
{
9601059
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
9611060
struct nvme_dev *dev = nvmeq->dev;
9621061

963-
if (blk_integrity_rq(req)) {
964-
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
965-
966-
dma_unmap_page(dev->dev, iod->meta_dma,
967-
rq_integrity_vec(req).bv_len, rq_dma_dir(req));
968-
}
1062+
if (blk_integrity_rq(req))
1063+
nvme_unmap_metadata(dev, req);
9691064

9701065
if (blk_rq_nr_phys_segments(req))
9711066
nvme_unmap_data(dev, req);
@@ -2761,6 +2856,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
27612856

27622857
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
27632858
{
2859+
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
27642860
size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
27652861

27662862
dev->iod_mempool = mempool_create_node(1,
@@ -2769,7 +2865,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
27692865
dev_to_node(dev->dev));
27702866
if (!dev->iod_mempool)
27712867
return -ENOMEM;
2868+
2869+
dev->iod_meta_mempool = mempool_create_node(1,
2870+
mempool_kmalloc, mempool_kfree,
2871+
(void *)meta_size, GFP_KERNEL,
2872+
dev_to_node(dev->dev));
2873+
if (!dev->iod_meta_mempool)
2874+
goto free;
2875+
27722876
return 0;
2877+
free:
2878+
mempool_destroy(dev->iod_mempool);
2879+
return -ENOMEM;
27732880
}
27742881

27752882
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2834,6 +2941,11 @@ static void nvme_reset_work(struct work_struct *work)
28342941
if (result)
28352942
goto out;
28362943

2944+
if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
2945+
dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
2946+
else
2947+
dev->ctrl.max_integrity_segments = 1;
2948+
28372949
nvme_dbbuf_dma_alloc(dev);
28382950

28392951
result = nvme_setup_host_mem(dev);
@@ -3101,11 +3213,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
31013213
dev->ctrl.max_hw_sectors = min_t(u32,
31023214
NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
31033215
dev->ctrl.max_segments = NVME_MAX_SEGS;
3104-
3105-
/*
3106-
* There is no support for SGLs for metadata (yet), so we are limited to
3107-
* a single integrity segment for the separate metadata pointer.
3108-
*/
31093216
dev->ctrl.max_integrity_segments = 1;
31103217
return dev;
31113218

@@ -3168,6 +3275,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
31683275
if (result)
31693276
goto out_disable;
31703277

3278+
if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
3279+
dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
3280+
else
3281+
dev->ctrl.max_integrity_segments = 1;
3282+
31713283
nvme_dbbuf_dma_alloc(dev);
31723284

31733285
result = nvme_setup_host_mem(dev);
@@ -3210,6 +3322,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
32103322
nvme_free_queues(dev, 0);
32113323
out_release_iod_mempool:
32123324
mempool_destroy(dev->iod_mempool);
3325+
mempool_destroy(dev->iod_meta_mempool);
32133326
out_release_prp_pools:
32143327
nvme_release_prp_pools(dev);
32153328
out_dev_unmap:
@@ -3275,6 +3388,7 @@ static void nvme_remove(struct pci_dev *pdev)
32753388
nvme_dbbuf_dma_free(dev);
32763389
nvme_free_queues(dev, 0);
32773390
mempool_destroy(dev->iod_mempool);
3391+
mempool_destroy(dev->iod_meta_mempool);
32783392
nvme_release_prp_pools(dev);
32793393
nvme_dev_unmap(dev);
32803394
nvme_uninit_ctrl(&dev->ctrl);

include/linux/nvme.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ enum {
389389
NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5,
390390
NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7,
391391
NVME_CTRL_CTRATT_UUID_LIST = 1 << 9,
392+
NVME_CTRL_SGLS_MSDS = 1 << 19,
392393
};
393394

394395
struct nvme_lbaf {

0 commit comments

Comments
 (0)