From 2b018086143d638de8d67ae5be6e8c1afb413193 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:46 -0700 Subject: [PATCH 01/19] blk-mq: unconditional nr_integrity_segments Always defining the field will make using it easier and less error prone in future patches. There shouldn't be any downside to this: the field fits in what would otherwise be a 2-byte hole, so we're not saving space by conditionally leaving it out. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 -- include/linux/blk-mq.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f1f7d0b3ff3..ef3a2ed49956 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -376,9 +376,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; -#if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; -#endif rq->end_io = NULL; rq->end_io_data = NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8d304b1d16b1..4fecf46ef681 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -149,10 +149,7 @@ struct request { * physical address coalescing is performed. */ unsigned short nr_phys_segments; - -#ifdef CONFIG_BLK_DEV_INTEGRITY unsigned short nr_integrity_segments; -#endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; From 9c297eced59817f461be33e4c241820c5be4bcc1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:47 -0700 Subject: [PATCH 02/19] blk-mq: set the nr_integrity_segments from bio This value is used for merging considerations, so it needs to be accurate. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-3-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index ef3a2ed49956..82219f0e9a25 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2544,6 +2544,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->__sector = bio->bi_iter.bi_sector; rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); + if (bio_integrity(bio)) + rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, + bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); From d148d7503456556859c7e4d354115215d8fb5016 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:48 -0700 Subject: [PATCH 03/19] blk-integrity: properly account for segments Both types of merging when integrity data is used are miscounting the segments: Merging two requests wasn't accounting for the new segment count, so add the "next" segment count to the first on a successful merge to ensure this value is accurate. Merging a bio into an existing request was double counting the bio's segments, even if the merge failed later on. Move the segment accounting to the end when the merge is successful. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-4-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 2 -- block/blk-merge.c | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 010decc892ea..afd101555d3c 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -153,8 +153,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; - req->nr_integrity_segments += nr_integrity_segs; - return true; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 56769c4bcd79..ad763ec313b6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -639,6 +639,9 @@ static inline int ll_new_hw_segment(struct request *req, struct bio *bio, * counters. */ req->nr_phys_segments += nr_phys_segs; + if (bio_integrity(bio)) + req->nr_integrity_segments += blk_rq_count_integrity_sg(req->q, + bio); return 1; no_merge: @@ -731,6 +734,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; + req->nr_integrity_segments += next->nr_integrity_segments; return 1; } From 0d7cb52fe417dde4bc9e8d01fadd8c0ec69612cd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:49 -0700 Subject: [PATCH 04/19] blk-integrity: consider entire bio list for merging If a bio is merged to a request, the entire bio list is merged, so don't temporarily detach it from its list when counting segments. In most cases, bi_next will already be NULL, so detaching is usually a no-op. But if the bio does have a list, the current code is miscounting the segments for the resulting merge. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-5-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index afd101555d3c..84065691aaed 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -134,7 +134,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) { int nr_integrity_segs; - struct bio *next = bio->bi_next; if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) return true; @@ -145,10 +144,7 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) return false; - bio->bi_next = NULL; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); - bio->bi_next = next; - if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) return false; From d2c5b1faccd5ef6352456f817e941945d3b3fe62 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:50 -0700 Subject: [PATCH 05/19] block: provide a request helper for user integrity segments Provide a helper to keep the request flags and nr_integrity_segments in sync with the bio's integrity payload. This is an integrity equivalent to the normal data helper function, 'blk_rq_map_user()'. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-6-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - block/blk-integrity.c | 14 ++++++++++++++ drivers/nvme/host/ioctl.c | 6 ++---- include/linux/blk-integrity.h | 9 +++++++++ 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8d1fb38f745f..357a022eed41 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -371,7 +371,6 @@ free_bvec: kfree(bvec); return ret; } -EXPORT_SYMBOL_GPL(bio_integrity_map_user); /** * bio_integrity_prep - Prepare bio for integrity I/O diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 84065691aaed..ddc742d58330 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -107,6 +107,20 @@ new_segment: } EXPORT_SYMBOL(blk_rq_map_integrity_sg); +int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, + ssize_t bytes, u32 seed) +{ + int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed); + + if (ret) + return ret; + + rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, rq->bio); + rq->cmd_flags |= REQ_INTEGRITY; + return 0; +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user); + bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 1d769c842fbf..b9b79ccfabf8 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -3,7 +3,6 @@ * Copyright (c) 2011-2014, Intel Corporation. * Copyright (c) 2017-2021 Christoph Hellwig. */ -#include #include #include /* for force_successful_syscall_return */ #include @@ -153,11 +152,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, bio_set_dev(bio, bdev); if (has_metadata) { - ret = bio_integrity_map_user(bio, meta_buffer, meta_len, - meta_seed); + ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len, + meta_seed); if (ret) goto out_unmap; - req->cmd_flags |= REQ_INTEGRITY; } return ret; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index de98049b7ded..793dbb1e0672 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -28,6 +28,8 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); +int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, + ssize_t bytes, u32 seed); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -102,6 +104,13 @@ static inline int blk_rq_map_integrity_sg(struct request_queue *q, { return 0; } +static inline int blk_rq_integrity_map_user(struct request *rq, + void __user *ubuf, + ssize_t bytes, + u32 seed) +{ + return -EINVAL; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; From 27c3785e94f003c664d9d867fbd62d1494546876 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:51 -0700 Subject: [PATCH 06/19] scsi: use request to get integrity segments The request tracks the integrity segments already, so no need to recount the segments again. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-7-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 3958a6d14bf4..c602b0af745c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1175,8 +1175,7 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) goto out_free_sgtables; } - ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); - + ivecs = rq->nr_integrity_segments; if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl, SCSI_INLINE_PROT_SG_CNT)) { From f4330766bc0d14b5eb9459e616060d697e7b128e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:52 -0700 Subject: [PATCH 07/19] nvme-rdma: use request to get integrity segments The request tracks the integrity segments already, so no need to recount the segments again. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-8-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 15b5e06039a5..256466bdaee7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1496,7 +1496,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, req->metadata_sgl->sg_table.sgl = (struct scatterlist *)(req->metadata_sgl + 1); ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, - blk_rq_count_integrity_sg(rq->q, rq->bio), + rq->nr_integrity_segments, req->metadata_sgl->sg_table.sgl, NVME_INLINE_METADATA_SG_CNT); if (unlikely(ret)) { From db5197b554fcb8fde0182af65e8e94bec414e342 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:53 -0700 Subject: [PATCH 08/19] block: unexport blk_rq_count_integrity_sg There are no external users of this. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-9-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ddc742d58330..1d82b18e06f8 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -53,7 +53,6 @@ new_segment: return segments; } -EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist From 76c313f658d2752e8527610677164aa7094ef7a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 12:17:46 -0700 Subject: [PATCH 09/19] blk-integrity: improved sg segment mapping Make the integrity mapping more like data mapping, blk_rq_map_sg. Use the request to validate the segment count, and update the callers so they don't have to. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913191746.2628196-1-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 15 +++++++++++---- drivers/nvme/host/rdma.c | 4 ++-- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-integrity.h | 6 ++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 1d82b18e06f8..0a2b1c5d0ebf 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -62,19 +62,20 @@ new_segment: * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all - * elements. I.e. sized using blk_rq_count_integrity_sg(). + * elements. I.e. sized using blk_rq_count_integrity_sg() or + * rq->nr_integrity_segments. */ -int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) { struct bio_vec iv, ivprv = { NULL }; + struct request_queue *q = rq->q; struct scatterlist *sg = NULL; + struct bio *bio = rq->bio; unsigned int segments = 0; struct bvec_iter iter; int prev = 0; bio_for_each_integrity_vec(iv, bio, iter) { - if (prev) { if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; @@ -102,6 +103,12 @@ new_segment: if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of segment + * is bigger than number of req's physical integrity segments + */ + BUG_ON(segments > rq->nr_integrity_segments); + BUG_ON(segments > queue_max_integrity_segments(q)); return segments; } EXPORT_SYMBOL(blk_rq_map_integrity_sg); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 256466bdaee7..c8fd0e8f0237 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1504,8 +1504,8 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, goto out_unmap_sg; } - req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q, - rq->bio, req->metadata_sgl->sg_table.sgl); + req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq, + req->metadata_sgl->sg_table.sgl); *pi_count = ib_dma_map_sg(ibdev, req->metadata_sgl->sg_table.sgl, req->metadata_sgl->nents, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c602b0af745c..c2f6d0e1c03e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1163,7 +1163,6 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; - int ivecs; if (WARN_ON_ONCE(!prot_sdb)) { /* @@ -1175,19 +1174,15 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) goto out_free_sgtables; } - ivecs = rq->nr_integrity_segments; - if (sg_alloc_table_chained(&prot_sdb->table, ivecs, + if (sg_alloc_table_chained(&prot_sdb->table, + rq->nr_integrity_segments, prot_sdb->table.sgl, SCSI_INLINE_PROT_SG_CNT)) { ret = BLK_STS_RESOURCE; goto out_free_sgtables; } - count = blk_rq_map_integrity_sg(rq->q, rq->bio, - prot_sdb->table.sgl); - BUG_ON(count > ivecs); - BUG_ON(count > queue_max_integrity_segments(rq->q)); - + count = blk_rq_map_integrity_sg(rq, prot_sdb->table.sgl); cmd->prot_sdb = prot_sdb; cmd->prot_sdb->table.nents = count; } diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 793dbb1e0672..676f8f860c47 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -25,8 +25,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, } #ifdef CONFIG_BLK_DEV_INTEGRITY -int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, - struct scatterlist *); +int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes, u32 seed); @@ -98,8 +97,7 @@ static inline int blk_rq_count_integrity_sg(struct request_queue *q, { return 0; } -static inline int blk_rq_map_integrity_sg(struct request_queue *q, - struct bio *b, +static inline int blk_rq_map_integrity_sg(struct request *q, struct scatterlist *s) { return 0; From aa3d8a36780ab568d528348dd8115560f63ea16b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Aug 2024 16:30:58 +1000 Subject: [PATCH 10/19] block: change wait on bd_claiming to use a var_waitqueue bd_prepare_to_claim() waits for a var to change, not for a bit to be cleared. Change from bit_waitqueue() to __var_waitqueue() and correspondingly use wake_up_var(). This will allow a future patch which change the "bit" function to expect an "unsigned long *" instead of "void *". Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20240826063659.15327-2-neilb@suse.de Signed-off-by: Jens Axboe --- block/bdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index c5507b6f63b8..21e688fb6449 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -548,7 +548,7 @@ retry: /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { - wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); @@ -571,7 +571,7 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); + wake_up_var(&whole->bd_claiming); } /** From 4208c562a27899212e8046080555e0f204e0579a Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 17 Sep 2024 10:24:57 +0530 Subject: [PATCH 11/19] block: remove bogus union The union around bi_integrity field is pointless. Remove it. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240917045457.429698-1-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 36ed96133217..6d3a0fff2a1d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -248,11 +248,9 @@ struct bio { struct bio_crypt_ctx *bi_crypt_context; #endif - union { #if defined(CONFIG_BLK_DEV_INTEGRITY) - struct bio_integrity_payload *bi_integrity; /* data integrity */ + struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif - }; unsigned short bi_vcnt; /* how many bio_vec's */ From e3accac1a976e65491a9b9fba82ce8ddbd3d2389 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 17 Sep 2024 22:32:31 +0900 Subject: [PATCH 12/19] block: Fix elv_iosched_local_module handling of "none" scheduler Commit 734e1a860312 ("block: Prevent deadlocks when switching elevators") introduced the function elv_iosched_load_module() to allow loading an elevator module outside of elv_iosched_store() with the target device queue not frozen, to avoid deadlocks. However, the "none" scheduler does not have a module and as a result, elv_iosched_load_module() always returns an error when trying to switch to this valid scheduler. Fix this by ignoring the return value of the request_module() call done by elv_iosched_load_module(). This restores the behavior before commit 734e1a860312, which was to ignore the request_module() result and instead rely on elevator_change() to handle the "none" scheduler case. Reported-by: Shin'ichiro Kawasaki Fixes: 734e1a860312 ("block: Prevent deadlocks when switching elevators") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240917133231.134806-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/elevator.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index c355b55d0107..4122026b11f1 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -715,7 +715,9 @@ int elv_iosched_load_module(struct gendisk *disk, const char *buf, strscpy(elevator_name, buf, sizeof(elevator_name)); - return request_module("%s-iosched", strstrip(elevator_name)); + request_module("%s-iosched", strstrip(elevator_name)); + + return 0; } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, From 2f02b5af3a4482b216e6a466edecf6ba8450fa45 Mon Sep 17 00:00:00 2001 From: Qiu-ji Chen Date: Fri, 13 Sep 2024 16:35:04 +0800 Subject: [PATCH 13/19] drbd: Fix atomicity violation in drbd_uuid_set_bm() The violation of atomicity occurs when the drbd_uuid_set_bm function is executed simultaneously with modifying the value of device->ldev->md.uuid[UI_BITMAP]. Consider a scenario where, while device->ldev->md.uuid[UI_BITMAP] passes the validity check when its value is not zero, the value of device->ldev->md.uuid[UI_BITMAP] is written to zero. In this case, the check in drbd_uuid_set_bm might refer to the old value of device->ldev->md.uuid[UI_BITMAP] (before locking), which allows an invalid value to pass the validity check, resulting in inconsistency. To address this issue, it is recommended to include the data validity check within the locked section of the function. This modification ensures that the value of device->ldev->md.uuid[UI_BITMAP] does not change during the validation process, thereby maintaining its integrity. This possible bug is found by an experimental static analysis tool developed by our team. This tool analyzes the locking APIs to extract function pairs that can be concurrently executed, and then analyzes the instructions in the paired functions to identify possible concurrency bugs including data races and atomicity violations. Fixes: 9f2247bb9b75 ("drbd: Protect accesses to the uuid set with a spinlock") Cc: stable@vger.kernel.org Signed-off-by: Qiu-ji Chen Reviewed-by: Philipp Reisner Link: https://lore.kernel.org/r/20240913083504.10549-1-chenqiuji666@gmail.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 449123eb54bf..0d74d75260ef 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3399,10 +3399,12 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) { unsigned long flags; - if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) - return; - spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) { + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); + return; + } + if (val == 0) { drbd_uuid_move_history(device); device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; From 9ba5dcc722de4390a1d3211b2ee3c864f84f5461 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Sep 2024 01:48:17 +0100 Subject: [PATCH 14/19] block: Remove unused blk_limits_io_{min,opt} blk_limits_io_min and blk_limits_io_opt are unused since the recent commit 0a94a469a4f0 ("dm: stop using blk_limits_io_{min,opt}") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20240920004817.676216-1-linux@treblig.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 42 ------------------------------------------ include/linux/blkdev.h | 2 -- 2 files changed, 44 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index cd8a8eabc9a5..a446654ddee5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -437,48 +437,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_limits_io_min - set minimum request size for a device - * @limits: the queue limits - * @min: smallest I/O size in bytes - * - * Description: - * Some devices have an internal block size bigger than the reported - * hardware sector size. This function can be used to signal the - * smallest I/O the device can perform without incurring a performance - * penalty. - */ -void blk_limits_io_min(struct queue_limits *limits, unsigned int min) -{ - limits->io_min = min; - - if (limits->io_min < limits->logical_block_size) - limits->io_min = limits->logical_block_size; - - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; -} -EXPORT_SYMBOL(blk_limits_io_min); - -/** - * blk_limits_io_opt - set optimal request size for a device - * @limits: the queue limits - * @opt: smallest I/O size in bytes - * - * Description: - * Storage devices may report an optimal I/O size, which is the - * device's preferred unit for sustained I/O. This is rarely reported - * for disk drives. For RAID arrays it is usually the stripe width or - * the internal track size. A properly aligned multiple of - * optimal_io_size is the preferred request size for workloads where - * sustained throughput is desired. - */ -void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) -{ - limits->io_opt = opt; -} -EXPORT_SYMBOL(blk_limits_io_opt); - static int queue_limit_alignment_offset(const struct queue_limits *lim, sector_t sector) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 643c9020a35a..50c3b959da28 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -968,8 +968,6 @@ static inline void blk_queue_disable_write_zeroes(struct request_queue *q) /* * Access functions for manipulating queue properties */ -extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, From 65f666c6203600053478ce8e34a1db269a8701c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 19 Sep 2024 10:17:09 +0800 Subject: [PATCH 15/19] lib/sbitmap: define swap_lock as raw_spinlock_t When called from sbitmap_queue_get(), sbitmap_deferred_clear() may be run with preempt disabled. In RT kernel, spin_lock() can sleep, then warning of "BUG: sleeping function called from invalid context" can be triggered. Fix it by replacing it with raw_spin_lock. Cc: Yang Yang Fixes: 72d04bdcf3f7 ("sbitmap: fix io hung due to race on sbitmap_word::cleared") Signed-off-by: Ming Lei Reviewed-by: Yang Yang Link: https://lore.kernel.org/r/20240919021709.511329-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- lib/sbitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index c09cdcc99471..189140bf11fc 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -40,7 +40,7 @@ struct sbitmap_word { /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ - spinlock_t swap_lock; + raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 5e2e93307f0d..d3412984170c 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -65,7 +65,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map, { unsigned long mask, word_mask; - guard(spinlock_irqsave)(&map->swap_lock); + guard(raw_spinlock_irqsave)(&map->swap_lock); if (!map->cleared) { if (depth == 0) @@ -136,7 +136,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, } for (i = 0; i < sb->map_nr; i++) - spin_lock_init(&sb->map[i].swap_lock); + raw_spin_lock_init(&sb->map[i].swap_lock); return 0; } From 63bcf9014e95a7d279d10d8e2caa5d88db2b1855 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Sat, 14 Sep 2024 14:01:22 +0200 Subject: [PATCH 16/19] nvme-multipath: system fails to create generic nvme device NVME_NSHEAD_DISK_LIVE is a flag for struct nvme_ns_head, not nvme_ns. The current code has a typo causing NVME_NSHEAD_DISK_LIVE never to be cleared once device_add_disk_fails, causing the system never to create the 'generic' character device. Even several rescan attempts will change the situation and the system has to be rebooted to fix the issue. Fixes: 11384580e332 ("nvme-multipath: add error handling support for add_disk()") Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 518e22dd4f9b..6d97058cde7a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -648,7 +648,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) rc = device_add_disk(&head->subsys->dev, head->disk, nvme_ns_attr_groups); if (rc) { - clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags); + clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags); return; } nvme_add_ns_head_cdev(head); From 3b97f5a05cfc55e7729ff3769f63eef64e2178bb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Sat, 14 Sep 2024 14:01:23 +0200 Subject: [PATCH 17/19] nvme-multipath: avoid hang on inaccessible namespaces During repetitive namespace remapping operations on the target the namespace might have changed between the time the initial scan was performed, and partition scan was invoked by device_add_disk() in nvme_mpath_set_live(). We then end up with a stuck scanning process: [<0>] folio_wait_bit_common+0x12a/0x310 [<0>] filemap_read_folio+0x97/0xd0 [<0>] do_read_cache_folio+0x108/0x390 [<0>] read_part_sector+0x31/0xa0 [<0>] read_lba+0xc5/0x160 [<0>] efi_partition+0xd9/0x8f0 [<0>] bdev_disk_changed+0x23d/0x6d0 [<0>] blkdev_get_whole+0x78/0xc0 [<0>] bdev_open+0x2c6/0x3b0 [<0>] bdev_file_open_by_dev+0xcb/0x120 [<0>] disk_scan_partitions+0x5d/0x100 [<0>] device_add_disk+0x402/0x420 [<0>] nvme_mpath_set_live+0x4f/0x1f0 [nvme_core] [<0>] nvme_mpath_add_disk+0x107/0x120 [nvme_core] [<0>] nvme_alloc_ns+0xac6/0xe60 [nvme_core] [<0>] nvme_scan_ns+0x2dd/0x3e0 [nvme_core] [<0>] nvme_scan_work+0x1a3/0x490 [nvme_core] This happens when we have several paths, some of which are inaccessible, and the active paths are removed first. Then nvme_find_path() will requeue I/O in the ns_head (as paths are present), but the requeue list is never triggered as all remaining paths are inactive. This patch checks for NVME_NSHEAD_DISK_LIVE in nvme_available_path(), and requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared once the last path has been removed to properly terminate pending I/O. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 6d97058cde7a..48e7a8906d01 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -421,6 +421,9 @@ static bool nvme_available_path(struct nvme_ns_head *head) { struct nvme_ns *ns; + if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + return NULL; + list_for_each_entry_rcu(ns, &head->list, siblings) { if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) continue; @@ -969,11 +972,16 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) { if (!head->disk) return; - kblockd_schedule_work(&head->requeue_work); - if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); } + /* + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared + * to allow multipath to fail all I/O. + */ + synchronize_srcu(&head->srcu); + kblockd_schedule_work(&head->requeue_work); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) From 83340d9c6178107df581c3ebbae0e28d0b15e879 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 24 Sep 2024 18:01:34 +0900 Subject: [PATCH 18/19] nvme: null terminate nvme_tls_attrs Commit 1e48b34c9bc7 ("nvme: split off TLS sysfs attributes into a separate group") introduced the struct attribute array nvme_tls_attrs. However, the array was not null terminated and caused BUG KASAN global- out-of-bounds. To avoid the BUG, null terminate the array. Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-nvme/jhllwfxcedrcxcnbajwl4x2l2ujcqowqcd4ps574zrafrqhjna@f4icvecutekm/ Fixes: 1e48b34c9bc7 ("nvme: split off TLS sysfs attributes into a separate group") Signed-off-by: Shin'ichiro Kawasaki Tested-by: Yi Zhang Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index eb345551d6fe..b68a9e5f1ea3 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -767,6 +767,7 @@ static struct attribute *nvme_tls_attrs[] = { &dev_attr_tls_key.attr, &dev_attr_tls_configured_key.attr, &dev_attr_tls_keyring.attr, + NULL, }; static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, From 9064610348b16356d43e59e286aedfec31825541 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 4 Sep 2024 14:48:50 -0700 Subject: [PATCH 19/19] nvme: remove CC register read-back during enabling Any non-posted read should flush the previous write, so we don't necessarily need to read back the value we just wrote. I've found at least some controllers that respond with 0 for short moments after writing the CC register with EN (enable) cleared, so the read-back is overwriting our valid ctrl_config value and ends up breaking on the subsequent enabling. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ca9959a8fb9e..ba6508455e18 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2468,11 +2468,6 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) if (ret) return ret; - /* Flush write to device (required if transport is PCI) */ - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config); - if (ret) - return ret; - /* CAP value may change after initial CC write */ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); if (ret)