From dc12502905b7a3de9097ea6b98870470c2921e09 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Tue, 27 Aug 2024 19:08:08 +0300 Subject: [PATCH 01/34] vdpa/mlx5: Fix invalid mr resource destroy Certain error paths from mlx5_vdpa_dev_add() can end up releasing mr resources which never got initialized in the first place. This patch adds the missing check in mlx5_vdpa_destroy_mr_resources() to block releasing non-initialized mr resources. Reference trace: mlx5_core 0000:08:00.2: mlx5_vdpa_dev_add:3274:(pid 2700) warning: No mac address provisioned? BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 140216067 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 8 PID: 2700 Comm: vdpa Kdump: loaded Not tainted 5.14.0-496.el9.x86_64 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb] Code: [...] RSP: 0018:ff1c823ac23077f0 EFLAGS: 00010246 RAX: ffffffffc1a21a60 RBX: ffffffff899567a0 RCX: 0000000000000000 RDX: ffffffffffffffff RSI: 0000000000000000 RDI: 0000000000000000 RBP: ff1bda1f7c21e800 R08: 0000000000000000 R09: ff1c823ac2307670 R10: ff1c823ac2307668 R11: ffffffff8a9e7b68 R12: 0000000000000000 R13: 0000000000000000 R14: ff1bda1f43e341a0 R15: 00000000ffffffea FS: 00007f56eba7c740(0000) GS:ff1bda269f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104d90001 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa] ? __die_body.cold+0x8/0xd ? page_fault_oops+0x134/0x170 ? __irq_work_queue_local+0x2b/0xc0 ? irq_work_queue+0x2c/0x50 ? exc_page_fault+0x62/0x150 ? asm_exc_page_fault+0x22/0x30 ? __pfx_mlx5_vdpa_free+0x10/0x10 [mlx5_vdpa] ? vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb] mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa] vdpa_release_dev+0x1e/0x50 [vdpa] device_release+0x31/0x90 kobject_cleanup+0x37/0x130 mlx5_vdpa_dev_add+0x2d2/0x7a0 [mlx5_vdpa] vdpa_nl_cmd_dev_add_set_doit+0x277/0x4c0 [vdpa] genl_family_rcv_msg_doit+0xd9/0x130 genl_family_rcv_msg+0x14d/0x220 ? __pfx_vdpa_nl_cmd_dev_add_set_doit+0x10/0x10 [vdpa] ? _copy_to_user+0x1a/0x30 ? move_addr_to_user+0x4b/0xe0 genl_rcv_msg+0x47/0xa0 ? __import_iovec+0x46/0x150 ? __pfx_genl_rcv_msg+0x10/0x10 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x245/0x370 netlink_sendmsg+0x206/0x440 __sys_sendto+0x1dc/0x1f0 ? do_read_fault+0x10c/0x1d0 ? do_pte_missing+0x10d/0x190 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x5c/0xf0 ? __count_memcg_events+0x4f/0xb0 ? mm_account_fault+0x6c/0x100 ? handle_mm_fault+0x116/0x270 ? do_user_addr_fault+0x1d6/0x6a0 ? do_syscall_64+0x6b/0xf0 ? clear_bhb_loop+0x25/0x80 ? clear_bhb_loop+0x25/0x80 ? clear_bhb_loop+0x25/0x80 ? clear_bhb_loop+0x25/0x80 ? clear_bhb_loop+0x25/0x80 entry_SYSCALL_64_after_hwframe+0x78/0x80 Fixes: 512c0cdd80c1 ("vdpa/mlx5: Decouple cvq iotlb handling from hw mapping code") Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240827160808.2448017-2-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Si-Wei Liu Acked-by: Jason Wang Reviewed-by: Shannon Nelson --- drivers/vdpa/mlx5/core/mr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 4758914ccf86..bf56f3d69625 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -581,6 +581,9 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) { + if (!mvdev->res.valid) + return; + for (int i = 0; i < MLX5_VDPA_NUM_AS; i++) mlx5_vdpa_update_mr(mvdev, NULL, i);

From 02e9e9366fefe461719da5d173385b6685f70319 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 16 Aug 2024 11:19:00 +0800 Subject: [PATCH 02/34] vhost_vdpa: assign irq bypass producer token correctly

We used to call irq_bypass_unregister_producer() in vhost_vdpa_setup_vq_irq(), which is problematic as we don't know whether the token pointer is still valid. Actually, we use the eventfd_ctx as the token, so the life cycle of the token should be bound to VHOST_SET_VRING_CALL instead of vhost_vdpa_setup_vq_irq(), which could be called by set_status(). Fix this by setting up the irq bypass producer's token when handling VHOST_SET_VRING_CALL, and by unregistering the producer before calling vhost_vring_ioctl() to prevent a possible use after free, as the eventfd could have been released in vhost_vring_ioctl(). Such registering and unregistering is only done if DRIVER_OK is set.

Reported-by: Dragos Tatulea Tested-by: Dragos Tatulea Reviewed-by: Dragos Tatulea Fixes: 2cf1ba9a4d15 ("vhost_vdpa: implement IRQ offloading in vhost_vdpa") Signed-off-by: Jason Wang Message-Id: <20240816031900.18013-1-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 478cd46a49ed..5a49b5a6d496 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) if (irq < 0) return; - irq_bypass_unregister_producer(&vq->call_ctx.producer); if (!vq->call_ctx.ctx) return; - vq->call_ctx.producer.token = vq->call_ctx.ctx; vq->call_ctx.producer.irq = irq; ret = irq_bypass_register_producer(&vq->call_ctx.producer); if (unlikely(ret)) @@ -709,6 +707,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, vq->last_avail_idx = vq_state.split.avail_index; } break; + case VHOST_SET_VRING_CALL: + if (vq->call_ctx.ctx) { + if (ops->get_status(vdpa) & + VIRTIO_CONFIG_S_DRIVER_OK) + vhost_vdpa_unsetup_vq_irq(v, idx); + vq->call_ctx.producer.token = NULL; + } + break; } r = vhost_vring_ioctl(&v->vdev, cmd, argp); @@ -747,13 +753,16 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; cb.trigger = vq->call_ctx.ctx; + vq->call_ctx.producer.token = vq->call_ctx.ctx; + if (ops->get_status(vdpa) & + VIRTIO_CONFIG_S_DRIVER_OK) + vhost_vdpa_setup_vq_irq(v, idx); } else { cb.callback = NULL; cb.private = NULL; cb.trigger = NULL; } ops->set_vq_cb(vdpa, idx, &cb); - vhost_vdpa_setup_vq_irq(v, idx); break; case VHOST_SET_VRING_NUM: @@ -1419,6 +1428,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) for (i = 0; i < nvqs; i++) { vqs[i] = &v->vqs[i]; vqs[i]->handle_kick = handle_vq_kick; + vqs[i]->call_ctx.ctx = NULL; } vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
vhost_vdpa_process_iotlb_msg);

From e25fbcd97cf52c3c9824d44b5c56c19673c3dd50 Mon Sep 17 00:00:00 2001 From: Philip Chen Date: Mon, 26 Aug 2024 21:53:13 +0000 Subject: [PATCH 03/34] virtio_pmem: Check device status before requesting flush

If a pmem device is in a bad status, the driver side could wait for host ack forever in virtio_pmem_flush(), causing the system to hang. So add a status check at the beginning of virtio_pmem_flush() to return early if the device is not activated.

Signed-off-by: Philip Chen Message-Id: <20240826215313.2673566-1-philipchen@chromium.org> Signed-off-by: Michael S. Tsirkin Acked-by: Pankaj Gupta --- drivers/nvdimm/nd_virtio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c --- a/drivers/nvdimm/nd_virtio.c +++ b/drivers/nvdimm/nd_virtio.c @@ ... @@ static int virtio_pmem_flush(struct nd_region *nd_region) + if (vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_NEEDS_RESET) { + dev_info(&vdev->dev, "virtio pmem device needs a reset\n"); + return -EIO; + } + might_sleep(); req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); if (!req_data)

From 6cf1c97dad2ebc4de03105cc444b3dfaa83f3dc2 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 23 Apr 2024 11:41:07 +0800 Subject: [PATCH 04/34] virtio_balloon: introduce oom-kill invocations

When the guest OS runs under critical memory pressure, the guest starts to kill processes. A guest monitor agent may scan 'oom_kill' from /proc/vmstat and report the OOM-kill event. However, the agent may itself be killed, and we would then lose this critical event (and later events). For now we can also grep for magic words in the guest kernel log from the host side, but rather than relying on that unstable method, have the virtio balloon report OOM-kill invocations instead.

Acked-by: David Hildenbrand Signed-off-by: zhenwei pi Message-Id: <20240423034109.1552866-3-pizhenwei@bytedance.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_balloon.c | 1 + include/uapi/linux/virtio_balloon.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 54469277ca30..b54392366142 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -363,6 +363,7 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index ddaa45e723c4..b17bbe033697 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -71,7 +71,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ -#define VIRTIO_BALLOON_S_NR 10 +#define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ +#define VIRTIO_BALLOON_S_NR 11 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -83,7 +84,8 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \ VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ - VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \ + VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
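The VIRTIO_BALLOON_S_* indices and the VIRTIO_BALLOON_S_NAMES list form a parallel pair that must stay in sync, which is why each new stat touches both. A minimal userspace sketch of that pairing (illustrative only, not part of the patch):

	#include <stdio.h>
	#include <linux/virtio_balloon.h>

	static const char *stat_names[VIRTIO_BALLOON_S_NR] = VIRTIO_BALLOON_S_NAMES;

	int main(void)
	{
		/* Prints: stat 10 reports "oom-kills" */
		printf("stat %d reports \"%s\"\n", VIRTIO_BALLOON_S_OOM_KILL,
		       stat_names[VIRTIO_BALLOON_S_OOM_KILL]);
		return 0;
	}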
From c5b70a26aac39f09a23fd72f44cfbb3d4d5a14d5 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 23 Apr 2024 11:41:08 +0800 Subject: [PATCH 05/34] virtio_balloon: introduce memory allocation stall counter

The memory allocation stall counter represents the performance/latency of memory allocation. Expose this counter to the host side through the virtio balloon device, as an out-of-band metric.

Acked-by: David Hildenbrand Signed-off-by: zhenwei pi Message-Id: <20240423034109.1552866-4-pizhenwei@bytedance.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_balloon.c | 8 ++++++++ include/uapi/linux/virtio_balloon.h | 6 ++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b54392366142..6f108b2977de 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -355,6 +355,8 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; unsigned int idx = 0; + unsigned int zid; + unsigned long stall = 0; all_vm_events(events); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, @@ -365,6 +367,12 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]); + /* sum all the stall events */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) + stall += events[ALLOCSTALL_NORMAL - ZONE_NORMAL + zid]; + + update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index b17bbe033697..487b893a160e 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -72,7 +72,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ -#define VIRTIO_BALLOON_S_NR 11 +#define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocation */ +#define VIRTIO_BALLOON_S_NR 12 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -85,7 +86,8 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \ VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ - VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \ + VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
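The zone arithmetic in the hunk above relies on the per-zone ALLOCSTALL_* vm events being declared in the same order as the zone enum, so a zone id can be rebased onto its stall counter. A sketch of the mapping being assumed (illustrative, not from the patch):

	/* Assumed layout: ALLOCSTALL_* entries mirror the zone enum order. */
	static int stall_event_for_zone(int zid)
	{
		/* e.g. zid == ZONE_DMA yields ALLOCSTALL_DMA */
		return ALLOCSTALL_NORMAL - ZONE_NORMAL + zid;
	}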
From 74c025c5d7e4ac7c7ad269c1ee64da4bdfe4770c Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 23 Apr 2024 11:41:09 +0800 Subject: [PATCH 06/34] virtio_balloon: introduce memory scan/reclaim info

Expose memory scan/reclaim information to the host side via the virtio balloon device. Now we have a metric to analyze memory performance:

y: counter increases
n: counter does not change
h: the rate of counter change is high
l: the rate of counter change is low

OOM: VIRTIO_BALLOON_S_OOM_KILL
STALL: VIRTIO_BALLOON_S_ALLOC_STALL
ASCAN: VIRTIO_BALLOON_S_ASYNC_SCAN
DSCAN: VIRTIO_BALLOON_S_DIRECT_SCAN
ARCLM: VIRTIO_BALLOON_S_ASYNC_RECLAIM
DRCLM: VIRTIO_BALLOON_S_DIRECT_RECLAIM

- OOM[y], STALL[*], ASCAN[*], DSCAN[*], ARCLM[*], DRCLM[*]: the guest runs under really critical memory pressure

- OOM[n], STALL[h], ASCAN[*], DSCAN[l], ARCLM[*], DRCLM[l]: the memory allocation stalls due to cgroup limits, not global memory pressure.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[h]: the memory allocation stalls due to global memory pressure. Performance gets hurt a lot. A high DRCLM/DSCAN ratio shows quite effective memory reclaim.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[l]: the memory allocation stalls due to global memory pressure, and the DRCLM/DSCAN ratio is low: the guest OS is thrashing heavily. (For example, if DSCAN grows by 4 GB over an interval while DRCLM grows by only 400 MB, only about 10% of scanned memory was actually reclaimed.) In serious cases this leads to poor performance and difficult troubleshooting; for example, sshd may block on memory allocation when accepting new connections, so a user can't even log in to the VM over ssh.

- OOM[n], STALL[n], ASCAN[h], DSCAN[n], ARCLM[l], DRCLM[n]: the low ARCLM/ASCAN ratio shows that the guest tries to reclaim more memory but can't. Once more memory is required in the future, it will struggle to reclaim memory.

Acked-by: David Hildenbrand Signed-off-by: zhenwei pi Message-Id: <20240423034109.1552866-5-pizhenwei@bytedance.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_balloon.c | 9 +++++++++ include/uapi/linux/virtio_balloon.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 6f108b2977de..b36d2803674e 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -373,6 +373,15 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN, + pages_to_bytes(events[PGSCAN_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN, + pages_to_bytes(events[PGSCAN_DIRECT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM, + pages_to_bytes(events[PGSTEAL_KSWAPD])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM, + pages_to_bytes(events[PGSTEAL_DIRECT])); + #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 487b893a160e..ee35a372805d 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -73,7 +73,11 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ #define VIRTIO_BALLOON_S_OOM_KILL 10 /* OOM killer invocations */ #define VIRTIO_BALLOON_S_ALLOC_STALL 11 /* Stall count of memory allocation */ -#define VIRTIO_BALLOON_S_NR 12 +#define VIRTIO_BALLOON_S_ASYNC_SCAN 12 /* Amount of memory scanned asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_SCAN 13 /* Amount of memory scanned directly */ +#define VIRTIO_BALLOON_S_ASYNC_RECLAIM 14 /* Amount of memory reclaimed asynchronously */ +#define VIRTIO_BALLOON_S_DIRECT_RECLAIM 15 /* Amount of memory reclaimed directly */ +#define VIRTIO_BALLOON_S_NR 16
#define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \ VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \ @@ -87,7 +91,11 @@ struct virtio_balloon_config { VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \ VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \ VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \ - VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \ + VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-scans", \ + VIRTIO_BALLOON_S_NAMES_prefix "async-reclaims", \ + VIRTIO_BALLOON_S_NAMES_prefix "direct-reclaims" \ } #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")

From a8927f69e85ec3508085e1042ca8ffe1c1ededae Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 24 Jul 2024 00:41:08 -0700 Subject: [PATCH 07/34] tools/virtio: Fix the wrong format specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

The unsigned int should use "%u" instead of "%d".

Signed-off-by: Zhu Jun Message-Id: <20240724074108.9530-1-zhujun2@cmss.chinamobile.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Eugenio Pérez Reviewed-by: Xuan Zhuo --- tools/virtio/ringtest/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtio/ringtest/main.c b/tools/virtio/ringtest/main.c index 5a18b2301a63..e471d8e7cfaa 100644 --- a/tools/virtio/ringtest/main.c +++ b/tools/virtio/ringtest/main.c @@ -276,7 +276,7 @@ static void help(void) fprintf(stderr, "Usage: [--help]" " [--host-affinity H]" " [--guest-affinity G]" - " [--ring-size R (default: %d)]" + " [--ring-size R (default: %u)]" " [--run-cycles C (default: %d)]" " [--batch b]" " [--outstanding o]"
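A quick standalone illustration of the mismatch being fixed (sketch, not from the patch): passing an unsigned int larger than INT_MAX through %d is undefined behavior, while %u prints it correctly.

	#include <stdio.h>

	int main(void)
	{
		unsigned int ring_size = 3000000000u; /* > INT_MAX */

		printf("%u\n", ring_size); /* prints 3000000000 */
		printf("%d\n", ring_size); /* undefined: mismatched conversion */
		return 0;
	}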
From 2f87e9cf0c9e21ab9be1fb2ba8520a1525359497 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 31 Jul 2024 11:16:01 +0800 Subject: [PATCH 08/34] vdpa: support set mac address from vdpa tool

Add a new UAPI to support setting the MAC address from the vdpa tool. The function vdpa_nl_cmd_dev_attr_set_doit() will get the new MAC address from the vdpa tool and then set it to the device.

The usage is: vdpa dev set name vdpa_name mac **:**:**:**:**:**

Here is an example: root@L1# vdpa -jp dev config show vdpa0 { "config": { "vdpa0": { "mac": "82:4d:e9:5d:d7:e6", "link ": "up", "link_announce ": false, "mtu": 1500 } } } root@L1# vdpa dev set name vdpa0 mac 00:11:22:33:44:55 root@L1# vdpa -jp dev config show vdpa0 { "config": { "vdpa0": { "mac": "00:11:22:33:44:55", "link ": "up", "link_announce ": false, "mtu": 1500 } } }

Signed-off-by: Cindy Lu Message-Id: <20240731031653.1047692-2-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa.c | 79 +++++++++++++++++++++++++++++++++++++++ include/linux/vdpa.h | 9 +++++ include/uapi/linux/vdpa.h | 1 + 3 files changed, 89 insertions(+) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 4dbd2e55a288..8a372b51c21a 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -1361,6 +1361,80 @@ dev_err: return err; } +static int vdpa_dev_net_device_attr_set(struct vdpa_device *vdev, + struct genl_info *info) +{ + struct vdpa_dev_set_config set_config = {}; + struct vdpa_mgmt_dev *mdev = vdev->mdev; + struct nlattr **nl_attrs = info->attrs; + const u8 *macaddr; + int err = -EOPNOTSUPP; + + down_write(&vdev->cf_lock); + if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { + set_config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); + macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); + + if (is_valid_ether_addr(macaddr)) { + ether_addr_copy(set_config.net.mac, macaddr); + if (mdev->ops->dev_set_attr) { + err = mdev->ops->dev_set_attr(mdev, vdev, + &set_config); + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Operation not supported by the device."); + } + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "Invalid MAC address"); + } + } + up_write(&vdev->cf_lock); + return err; +} + +static int vdpa_nl_cmd_dev_attr_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct vdpa_device *vdev; + struct device *dev; + const char *name; + u64 classes; + int err = 0; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + + down_write(&vdpa_dev_lock); + dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + classes = vdpa_mgmtdev_get_classes(vdev->mdev, NULL); + if (classes & BIT_ULL(VIRTIO_ID_NET)) { + err = vdpa_dev_net_device_attr_set(vdev, info); + } else { + NL_SET_ERR_MSG_FMT_MOD(info->extack, "%s device not supported", + name); + } + +mdev_err: + put_device(dev); +dev_err: + up_write(&vdpa_dev_lock); + return err; +} + static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); @@ -1497,6 +1571,11 @@ static const struct genl_ops vdpa_nl_ops[] = { .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, + { + .cmd = VDPA_CMD_DEV_ATTR_SET, + .doit = vdpa_nl_cmd_dev_attr_set_doit, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 7977ca03ac7a..2e7a30fe6b92 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -582,11 +582,20 @@ void vdpa_set_status(struct vdpa_device *vdev, u8 status); * @dev: vdpa device to remove * Driver need to remove the specified device by calling * _vdpa_unregister_device(). + * @dev_set_attr: change a vdpa device's attr after it was created + * @mdev: parent device to use for device + * @dev: vdpa device structure + * @config: Attributes to be set for the device. + * The driver needs to check the mask of the structure and then set + * the related information to the vdpa device. The driver must return 0 + * if set successfully. */ struct vdpa_mgmtdev_ops { int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config); void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev); + int (*dev_set_attr)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *config); }; /** diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 842bf1201ac4..71edf2c70cc3 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -19,6 +19,7 @@ enum vdpa_command { VDPA_CMD_DEV_GET, /* can dump */ VDPA_CMD_DEV_CONFIG_GET, /* can dump */ VDPA_CMD_DEV_VSTATS_GET, + VDPA_CMD_DEV_ATTR_SET, }; enum vdpa_attr {
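To make the new callback's contract concrete, here is a minimal hypothetical skeleton (the "foo" name is invented for illustration) that follows the kernel-doc above: check the mask, apply the attribute, and return 0 only on success:

	/* Hypothetical dev_set_attr implementation sketch. */
	static int foo_dev_set_attr(struct vdpa_mgmt_dev *mdev,
				    struct vdpa_device *dev,
				    const struct vdpa_dev_set_config *config)
	{
		if (config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
			/* write config->net.mac into the device config space */
			return 0;
		}

		return -EOPNOTSUPP;	/* attribute not supported */
	}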
From 218bb7ec17f1f66a63cb7421fb8a1d48032988e8 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 31 Jul 2024 11:16:02 +0800 Subject: [PATCH 09/34] vdpa_sim_net: Add the support of set mac address

Add the function to support setting the MAC address. For vdpa_sim_net, the driver will write the MAC address to the config space; other devices can implement their own functions to support this.

Signed-off-by: Cindy Lu Message-Id: <20240731031653.1047692-3-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index cfe962911804..6caf09a1907b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -414,6 +414,24 @@ static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); } +static int vdpasim_net_set_attr(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *config) +{ + struct vdpasim *vdpasim = container_of(dev, struct vdpasim, vdpa); + struct virtio_net_config *vio_config = vdpasim->config; + + mutex_lock(&vdpasim->mutex); + + if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + ether_addr_copy(vio_config->mac, config->net.mac); + mutex_unlock(&vdpasim->mutex); + return 0; + } + + mutex_unlock(&vdpasim->mutex); + return -EOPNOTSUPP; +} + static void vdpasim_net_setup_config(struct vdpasim *vdpasim, const struct vdpa_dev_set_config *config) { @@ -510,7 +528,8 @@ static void vdpasim_net_dev_del(struct vdpa_mgmt_dev *mdev, static const struct vdpa_mgmtdev_ops vdpasim_net_mgmtdev_ops = { .dev_add = vdpasim_net_dev_add, - .dev_del = vdpasim_net_dev_del + .dev_del = vdpasim_net_dev_del, + .dev_set_attr = vdpasim_net_set_attr }; static struct virtio_device_id id_table[] = {
From 6d17035a74028f0b0e77affefbfb5d71e6d32713 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 31 Jul 2024 11:16:03 +0800 Subject: [PATCH 10/34] vdpa/mlx5: Add the support of set mac address

Add the function to support setting the MAC address. For vdpa/mlx5, the function uses mlx5_mpfs_add_mac() to set the MAC address. Tested on a ConnectX-6 Dx device.

Signed-off-by: Cindy Lu Message-Id: <20240731031653.1047692-4-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index fa78e8288ebb..7862dfc29107 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3937,9 +3937,37 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mgtdev->ndev = NULL; } +static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *add_config) +{ + struct virtio_net_config *config; + struct mlx5_core_dev *pfmdev; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + struct mlx5_core_dev *mdev; + int err = -EOPNOTSUPP; + + mvdev = to_mvdev(dev); + ndev = to_mlx5_vdpa_ndev(mvdev); + mdev = mvdev->mdev; + config = &ndev->config; + + down_write(&ndev->reslock); + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); + err = mlx5_mpfs_add_mac(pfmdev, config->mac); + if (!err) + ether_addr_copy(config->mac, add_config->net.mac); + } + + up_write(&ndev->reslock); + return err; +} + static const struct vdpa_mgmtdev_ops mdev_ops = { .dev_add = mlx5_vdpa_dev_add, .dev_del = mlx5_vdpa_dev_del, + .dev_set_attr = mlx5_vdpa_set_attr, }; static struct virtio_device_id id_table[] = {

From 7d627137dc1062aba6276a7d2ebe7f5ff8d542c5 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:50 +0300 Subject: [PATCH 11/34] net/mlx5: Support throttled commands from async API

Currently, commands that qualify as throttled can't be used via the async API because the throttle semaphore can sleep while the async API can't. This patch allows throttling in the async API by using the trylock variant of the semaphore; upon failure (semaphore at 0) it returns -EBUSY to signal to the caller that they need to wait for the completion of previously issued commands. Furthermore, make sure that the semaphore is released in the callback.

Signed-off-by: Dragos Tatulea Cc: Leon Romanovsky Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-2-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 20768ef2e9d2..f69c977c1569 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1882,10 +1882,12 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, throttle_op = mlx5_cmd_is_throttle_opcode(opcode); if (throttle_op) { - /* atomic context may not sleep */ - if (callback) - return -EINVAL; - down(&dev->cmd.vars.throttle_sem); + if (callback) { + if (down_trylock(&dev->cmd.vars.throttle_sem)) + return -EBUSY; + } else { + down(&dev->cmd.vars.throttle_sem); + } } pages_queue = is_manage_pages(in); @@ -2091,10 +2093,19 @@ static void mlx5_cmd_exec_cb_handler(int status, void *_work) { struct mlx5_async_work *work = _work; struct mlx5_async_ctx *ctx; + struct mlx5_core_dev *dev; + u16 opcode; ctx = work->ctx; - status = cmd_status_err(ctx->dev, status, work->opcode, work->op_mod, work->out); + dev = ctx->dev; + opcode = work->opcode; + status = cmd_status_err(dev, status, work->opcode, work->op_mod, work->out); work->user_callback(status, work); + /* Can't access "work" from this point on. It could have been freed in + * the callback. + */ + if (mlx5_cmd_is_throttle_opcode(opcode)) + up(&dev->cmd.vars.throttle_sem); if (atomic_dec_and_test(&ctx->num_inflight)) complete(&ctx->inflight_done); }
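For illustration, a minimal caller-side sketch of the contract this introduces (async_ctx, my_callback, oldest_cmd_done and work are hypothetical names; the real retry logic arrives with the async wrapper later in this series):

	/* Sketch: -EBUSY from a throttled async command means "wait for an
	 * earlier command of yours to complete, then try again". */
	err = mlx5_cmd_exec_cb(&async_ctx, in, inlen, out, outlen,
			       my_callback, &work);
	if (err == -EBUSY) {
		wait_for_completion(&oldest_cmd_done); /* signaled from my_callback */
		err = mlx5_cmd_exec_cb(&async_ctx, in, inlen, out, outlen,
				       my_callback, &work);
	}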
From de2cd39fc11b2f55b7f40f2a3036ca27327e4461 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:51 +0300 Subject: [PATCH 12/34] vdpa/mlx5: Introduce error logging function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

mlx5_vdpa_err() was missing. This patch adds it and uses it in the necessary places.

Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Acked-by: Eugenio Pérez Message-Id: <20240816090159.1967650-3-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 5 +++++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 50aac8fe57ef..424d445ebee4 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -135,6 +135,11 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev, int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev); int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid); +#define mlx5_vdpa_err(__dev, format, ...) \ + dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, __func__, __LINE__, \ + current->pid, ##__VA_ARGS__) + + #define mlx5_vdpa_warn(__dev, format, ...)
\ dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \ current->pid, ##__VA_ARGS__) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 7862dfc29107..d4d94cf11ace 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1538,13 +1538,13 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mv err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed, err: %d\n", err); + mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: %d\n", err); return err; } err = query_virtqueue(ndev, mvq, &attr); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue, err: %d\n", err); + mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: %d\n", err); return err; } @@ -1585,7 +1585,7 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq */ err = modify_virtqueue(ndev, mvq, 0); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, + mlx5_vdpa_err(&ndev->mvdev, "modify vq properties failed for vq %u, err: %d\n", mvq->index, err); return err; @@ -1600,15 +1600,15 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: return 0; default: - mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad state %d\n", + mlx5_vdpa_err(&ndev->mvdev, "resume vq %u called from bad state %d\n", mvq->index, mvq->fw_state); return -EINVAL; } err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); if (err) - mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq %u, err: %d\n", - mvq->index, err); + mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, err: %d\n", + mvq->index, err); return err; } @@ -2002,13 +2002,13 @@ static int setup_steering(struct mlx5_vdpa_net *ndev) ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS); if (!ns) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n"); + mlx5_vdpa_err(&ndev->mvdev, "failed to get flow namespace\n"); return -EOPNOTSUPP; } ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ndev->rxft)) { - mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n"); + mlx5_vdpa_err(&ndev->mvdev, "failed to create flow table\n"); return PTR_ERR(ndev->rxft); } mlx5_vdpa_add_rx_flow_table(ndev); @@ -2530,7 +2530,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa err = query_virtqueue(ndev, mvq, &attr); if (err) { - mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); + mlx5_vdpa_err(mvdev, "failed to query virtqueue\n"); return err; } state->split.avail_index = attr.used_index; @@ -3189,7 +3189,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags) if ((flags & VDPA_RESET_F_CLEAN_MAP) && MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { if (mlx5_vdpa_create_dma_mr(mvdev)) - mlx5_vdpa_warn(mvdev, "create MR failed\n"); + mlx5_vdpa_err(mvdev, "create MR failed\n"); } if (vq_reset) setup_vq_resources(ndev, false); @@ -3244,7 +3244,7 @@ static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, new_mr = mlx5_vdpa_create_mr(mvdev, iotlb); if (IS_ERR(new_mr)) { err = PTR_ERR(new_mr); - mlx5_vdpa_warn(mvdev, "create map failed(%d)\n", err); + mlx5_vdpa_err(mvdev, "create map failed(%d)\n", err); return err; } } else { @@ -3257,7 +3257,7 @@ static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct 
vhost_iotlb *iotlb, } else { err = mlx5_vdpa_change_map(mvdev, new_mr, asid); if (err) { - mlx5_vdpa_warn(mvdev, "change map failed(%d)\n", err); + mlx5_vdpa_err(mvdev, "change map failed(%d)\n", err); goto out_err; } }

From d89d58f4888cde693e7707e13623eb50bb6435c2 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:52 +0300 Subject: [PATCH 13/34] vdpa/mlx5: Introduce async fw command wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Introduce a new function mlx5_vdpa_exec_async_cmds() which wraps the mlx5_core async firmware command API in a way that will be used to parallelize certain operations in this driver. The wrapper deals with the case when mlx5_cmd_exec_cb() returns -EBUSY due to the command being throttled.

Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-4-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 15 ++++++ drivers/vdpa/mlx5/core/resources.c | 73 ++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 424d445ebee4..b34e9b93d56e 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -105,6 +105,18 @@ struct mlx5_vdpa_dev { bool suspended; }; +struct mlx5_vdpa_async_cmd { + int err; + struct mlx5_async_work cb_work; + struct completion cmd_done; + + void *in; + size_t inlen; + + void *out; + size_t outlen; +}; + int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn); void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn); int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn); @@ -134,6 +146,9 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev, unsigned int asid); int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev); int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid); +int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int num_cmds); #define mlx5_vdpa_err(__dev, format, ...) \ dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, __func__, __LINE__, \ diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 5c5a41b64bfc..22ea32fe007b 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -321,3 +321,76 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) mutex_destroy(&mvdev->mr_mtx); res->valid = false; } + +static void virtqueue_cmd_callback(int status, struct mlx5_async_work *context) +{ + struct mlx5_vdpa_async_cmd *cmd = + container_of(context, struct mlx5_vdpa_async_cmd, cb_work); + + cmd->err = mlx5_cmd_check(context->ctx->dev, status, cmd->in, cmd->out); + complete(&cmd->cmd_done); +} + +static int issue_async_cmd(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int issued, + int *completed) + +{ + struct mlx5_vdpa_async_cmd *cmd = &cmds[issued]; + int err; + +retry: + err = mlx5_cmd_exec_cb(&mvdev->async_ctx, + cmd->in, cmd->inlen, + cmd->out, cmd->outlen, + virtqueue_cmd_callback, + &cmd->cb_work); + if (err == -EBUSY) { + if (*completed < issued) { + /* Throttled by own commands: wait for oldest completion. */ + wait_for_completion(&cmds[*completed].cmd_done); + (*completed)++; + + goto retry; + } else { + /* Throttled by external commands: switch to sync api.
*/ + err = mlx5_cmd_exec(mvdev->mdev, + cmd->in, cmd->inlen, + cmd->out, cmd->outlen); + if (!err) + (*completed)++; + } + } + + return err; +} + +int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_async_cmd *cmds, + int num_cmds) +{ + int completed = 0; + int issued = 0; + int err = 0; + + for (int i = 0; i < num_cmds; i++) + init_completion(&cmds[i].cmd_done); + + while (issued < num_cmds) { + + err = issue_async_cmd(mvdev, cmds, issued, &completed); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing command %d of %d: %d\n", + issued, num_cmds, err); + break; + } + + issued++; + } + + while (completed < issued) + wait_for_completion(&cmds[completed++].cmd_done); + + return err; +} From 1fcdf43ea69e976aae4f2d76ebb199cc0d4c5a88 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:53 +0300 Subject: [PATCH 14/34] vdpa/mlx5: Use async API for vq query command Switch firmware vq query command to be issued via the async API to allow future parallelization. For now the command is still serial but the infrastructure is there to issue commands in parallel, including ratelimiting the number of issued async commands to firmware. A later patch will switch to issuing more commands at a time. Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-5-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 + drivers/vdpa/mlx5/net/mlx5_vnet.c | 101 ++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 25 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index b34e9b93d56e..24fa00afb24f 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -103,6 +103,8 @@ struct mlx5_vdpa_dev { struct workqueue_struct *wq; unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; bool suspended; + + struct mlx5_async_ctx async_ctx; }; struct mlx5_vdpa_async_cmd { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index d4d94cf11ace..7debf85ffb84 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1184,40 +1184,87 @@ struct mlx5_virtq_attr { u16 used_index; }; -static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, - struct mlx5_virtq_attr *attr) +struct mlx5_virtqueue_query_mem { + u8 in[MLX5_ST_SZ_BYTES(query_virtio_net_q_in)]; + u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)]; +}; + +static void fill_query_virtqueue_cmd(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + struct mlx5_virtqueue_query_mem *cmd) { - int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out); - u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {}; - void *out; - void *obj_context; - void *cmd_hdr; - int err; - - out = kzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; - - cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr); + void *cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, cmd->in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); - err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen); - if (err) - goto err_cmd; +} + +static void query_virtqueue_end(struct mlx5_vdpa_net *ndev, + struct 
mlx5_virtqueue_query_mem *cmd, + struct mlx5_virtq_attr *attr) +{ + void *obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, cmd->out, obj_context); - obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context); memset(attr, 0, sizeof(*attr)); attr->state = MLX5_GET(virtio_net_q_object, obj_context, state); attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index); attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index); - kfree(out); - return 0; +} -err_cmd: - kfree(out); +static int query_virtqueues(struct mlx5_vdpa_net *ndev, + int start_vq, + int num_vqs, + struct mlx5_virtq_attr *attrs) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_virtqueue_query_mem *cmd_mem; + struct mlx5_vdpa_async_cmd *cmds; + int err = 0; + + WARN(start_vq + num_vqs > mvdev->max_vqs, "query vq range invalid [%d, %d), max_vqs: %u\n", + start_vq, start_vq + num_vqs, mvdev->max_vqs); + + cmds = kvcalloc(num_vqs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(num_vqs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) { + err = -ENOMEM; + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + cmds[i].in = &cmd_mem[i].in; + cmds[i].inlen = sizeof(cmd_mem[i].in); + cmds[i].out = &cmd_mem[i].out; + cmds[i].outlen = sizeof(cmd_mem[i].out); + fill_query_virtqueue_cmd(ndev, &ndev->vqs[start_vq + i], &cmd_mem[i]); + } + + err = mlx5_vdpa_exec_async_cmds(&ndev->mvdev, cmds, num_vqs); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing query cmd for vq range [%d, %d): %d\n", + start_vq, start_vq + num_vqs, err); + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + int vq_idx = start_vq + i; + + if (cmd->err) { + mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, err); + if (!err) + err = cmd->err; + continue; + } + + query_virtqueue_end(ndev, &cmd_mem[i], &attrs[i]); + } + +done: + kvfree(cmd_mem); + kvfree(cmds); return err; } @@ -1542,7 +1589,7 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mv return err; } - err = query_virtqueue(ndev, mvq, &attr); + err = query_virtqueues(ndev, mvq->index, 1, &attr); if (err) { mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: %d\n", err); return err; @@ -2528,7 +2575,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa return 0; } - err = query_virtqueue(ndev, mvq, &attr); + err = query_virtqueues(ndev, mvq->index, 1, &attr); if (err) { mlx5_vdpa_err(mvdev, "failed to query virtqueue\n"); return err; @@ -2879,7 +2926,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu int err; if (mvq->initialized) { - err = query_virtqueue(ndev, mvq, &attr); + err = query_virtqueues(ndev, mvq->index, 1, &attr); if (err) return err; } @@ -3854,6 +3901,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, ndev->rqt_size = 1; } + mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); + ndev->mvdev.mlx_features = device_features; mvdev->vdev.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); @@ -3935,6 +3984,8 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mvdev->wq = NULL; destroy_workqueue(wq); mgtdev->ndev = NULL; + + mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); } static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, From 61674c154bb7f19fad612242022276e8bd9e10d2 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 
12:01:54 +0300 Subject: [PATCH 15/34] vdpa/mlx5: Use async API for vq modify commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch firmware vq modify command to be issued via the async API to allow future parallelization. The new refactored function applies the modify on a range of vqs and waits for their execution to complete. For now the command is still used in a serial fashion. A later patch will switch to modifying multiple vqs in parallel. Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-6-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 154 ++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 48 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 7debf85ffb84..de05b941e9d1 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1189,6 +1189,11 @@ struct mlx5_virtqueue_query_mem { u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)]; }; +struct mlx5_virtqueue_modify_mem { + u8 in[MLX5_ST_SZ_BYTES(modify_virtio_net_q_in)]; + u8 out[MLX5_ST_SZ_BYTES(modify_virtio_net_q_out)]; +}; + static void fill_query_virtqueue_cmd(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, struct mlx5_virtqueue_query_mem *cmd) @@ -1298,51 +1303,30 @@ static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq) return true; } -static int modify_virtqueue(struct mlx5_vdpa_net *ndev, - struct mlx5_vdpa_virtqueue *mvq, - int state) +static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + int state, + struct mlx5_virtqueue_modify_mem *cmd) { - int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); - u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; struct mlx5_vdpa_mr *desc_mr = NULL; struct mlx5_vdpa_mr *vq_mr = NULL; - bool state_change = false; void *obj_context; void *cmd_hdr; void *vq_ctx; - void *in; - int err; - if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) - return 0; - - if (!modifiable_virtqueue_fields(mvq)) - return -EINVAL; - - in = kzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - - cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr); + cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); - obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); + obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, obj_context); vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); - if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) { - if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) { - err = -EINVAL; - goto done; - } - + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) MLX5_SET(virtio_net_q_object, obj_context, state, state); - state_change = true; - } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) { MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); @@ -1388,38 +1372,36 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, } MLX5_SET64(virtio_net_q_object, 
obj_context, modify_field_select, mvq->modified_fields); - err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); - if (err) - goto done; +} - if (state_change) - mvq->fw_state = state; +static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + int state) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { + unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]; + struct mlx5_vdpa_mr *vq_mr = mvdev->mr[asid]; + mlx5_vdpa_put_mr(mvdev, mvq->vq_mr); mlx5_vdpa_get_mr(mvdev, vq_mr); mvq->vq_mr = vq_mr; } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { + unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]; + struct mlx5_vdpa_mr *desc_mr = mvdev->mr[asid]; + mlx5_vdpa_put_mr(mvdev, mvq->desc_mr); mlx5_vdpa_get_mr(mvdev, desc_mr); mvq->desc_mr = desc_mr; } + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) + mvq->fw_state = state; + mvq->modified_fields = 0; - -done: - kfree(in); - return err; -} - -static int modify_virtqueue_state(struct mlx5_vdpa_net *ndev, - struct mlx5_vdpa_virtqueue *mvq, - unsigned int state) -{ - mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE; - return modify_virtqueue(ndev, mvq, state); } static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) @@ -1572,6 +1554,82 @@ err_fwqp: return err; } +static int modify_virtqueues(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs, int state) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_virtqueue_modify_mem *cmd_mem; + struct mlx5_vdpa_async_cmd *cmds; + int err = 0; + + WARN(start_vq + num_vqs > mvdev->max_vqs, "modify vq range invalid [%d, %d), max_vqs: %u\n", + start_vq, start_vq + num_vqs, mvdev->max_vqs); + + cmds = kvcalloc(num_vqs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(num_vqs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) { + err = -ENOMEM; + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + struct mlx5_vdpa_virtqueue *mvq; + int vq_idx = start_vq + i; + + mvq = &ndev->vqs[vq_idx]; + + if (!modifiable_virtqueue_fields(mvq)) { + err = -EINVAL; + goto done; + } + + if (mvq->fw_state != state) { + if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) { + err = -EINVAL; + goto done; + } + + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE; + } + + cmd->in = &cmd_mem[i].in; + cmd->inlen = sizeof(cmd_mem[i].in); + cmd->out = &cmd_mem[i].out; + cmd->outlen = sizeof(cmd_mem[i].out); + fill_modify_virtqueue_cmd(ndev, mvq, state, &cmd_mem[i]); + } + + err = mlx5_vdpa_exec_async_cmds(&ndev->mvdev, cmds, num_vqs); + if (err) { + mlx5_vdpa_err(mvdev, "error issuing modify cmd for vq range [%d, %d)\n", + start_vq, start_vq + num_vqs); + goto done; + } + + for (int i = 0; i < num_vqs; i++) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i]; + struct mlx5_vdpa_virtqueue *mvq; + int vq_idx = start_vq + i; + + mvq = &ndev->vqs[vq_idx]; + + if (cmd->err) { + mlx5_vdpa_err(mvdev, "modify vq %d failed, state: %d -> %d, err: %d\n", + vq_idx, mvq->fw_state, state, err); + if (!err) + err = cmd->err; + continue; + } + + modify_virtqueue_end(ndev, mvq, state); + } + +done: + kvfree(cmd_mem); + kvfree(cmds); + return err; +} + static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { struct mlx5_virtq_attr attr; @@ -1583,7 +1641,7 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mv if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) return 0; - err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); + err = modify_virtqueues(ndev, mvq->index, 1, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); if (err) { mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: %d\n", err); return err; @@ -1630,7 +1688,7 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq /* Due to a FW quirk we need to modify the VQ fields first then change state. * This should be fixed soon. After that, a single command can be used. */ - err = modify_virtqueue(ndev, mvq, 0); + err = modify_virtqueues(ndev, mvq->index, 1, mvq->fw_state); if (err) { mlx5_vdpa_err(&ndev->mvdev, "modify vq properties failed for vq %u, err: %d\n", @@ -1652,7 +1710,7 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq return -EINVAL; } - err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + err = modify_virtqueues(ndev, mvq->index, 1, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); if (err) mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, err: %d\n", mvq->index, err);

From dcf3eac01f063df0a60ea779399331d2ac535784 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:55 +0300 Subject: [PATCH 16/34] vdpa/mlx5: Parallelize device suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Currently device suspend works on vqs serially. Building on previous changes that converted vq operations to the async api, this patch parallelizes the device suspend: 1) Suspend all active vqs in parallel. 2) Query suspended vqs in parallel. For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM, 32 CPUs x 2 threads per core), the device suspend time is reduced from ~37 ms to ~13 ms. A later patch will remove the link unregister operation which will make it even faster.

Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Acked-by: Eugenio Pérez Message-Id: <20240816090159.1967650-7-dtatulea@nvidia.com> Signed-off-by: Michael S.
Tsirkin Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index de05b941e9d1..17f74b1f0644 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1630,47 +1630,49 @@ done: return err; } -static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +static int suspend_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs) { - struct mlx5_virtq_attr attr; + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_virtq_attr *attrs; + int vq_idx, i; int err; + if (start_vq >= ndev->cur_num_vqs) + return -EINVAL; + + mvq = &ndev->vqs[start_vq]; if (!mvq->initialized) return 0; if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) return 0; - err = modify_virtqueues(ndev, mvq->index, 1, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); - if (err) { - mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: %d\n", err); + err = modify_virtqueues(ndev, start_vq, num_vqs, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND); + if (err) return err; + + attrs = kcalloc(num_vqs, sizeof(struct mlx5_virtq_attr), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + err = query_virtqueues(ndev, start_vq, num_vqs, attrs); + if (err) + goto done; + + for (i = 0, vq_idx = start_vq; i < num_vqs; i++, vq_idx++) { + mvq = &ndev->vqs[vq_idx]; + mvq->avail_idx = attrs[i].available_index; + mvq->used_idx = attrs[i].used_index; } - err = query_virtqueues(ndev, mvq->index, 1, &attr); - if (err) { - mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: %d\n", err); - return err; - } - - mvq->avail_idx = attr.available_index; - mvq->used_idx = attr.used_index; - - return 0; +done: + kfree(attrs); + return err; } -static int suspend_vqs(struct mlx5_vdpa_net *ndev) +static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { - int err = 0; - int i; - - for (i = 0; i < ndev->cur_num_vqs; i++) { - int local_err = suspend_vq(ndev, &ndev->vqs[i]); - - err = local_err ? local_err : err; - } - - return err; + return suspend_vqs(ndev, mvq->index, 1); } static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) @@ -3053,7 +3055,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, bool teardown = !is_resumable(ndev); int err; - suspend_vqs(ndev); + suspend_vqs(ndev, 0, ndev->cur_num_vqs); if (teardown) { err = save_channels_info(ndev); if (err) @@ -3606,7 +3608,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) down_write(&ndev->reslock); unregister_link_notifier(ndev); - err = suspend_vqs(ndev); + err = suspend_vqs(ndev, 0, ndev->cur_num_vqs); mlx5_vdpa_cvq_suspend(mvdev); mvdev->suspended = true; up_write(&ndev->reslock); From 5eb8c7eb1ec74ac6b9e7337674cb7a33e82a1e68 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:56 +0300 Subject: [PATCH 17/34] vdpa/mlx5: Parallelize device resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently device resume works on vqs serially. Building up on previous changes that converted vq operations to the async api, this patch parallelizes the device resume. For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM, 32 CPUs x 2 threads per core), the device resume time is reduced from ~16 ms to ~4.5 ms. 
Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Acked-by: Eugenio Pérez Message-Id: <20240816090159.1967650-8-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 40 +++++++++++-------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 17f74b1f0644..fc9fb8d9ac4f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1675,10 +1675,15 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mv return suspend_vqs(ndev, mvq->index, 1); } -static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +static int resume_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs) { + struct mlx5_vdpa_virtqueue *mvq; int err; + if (start_vq >= ndev->mvdev.max_vqs) + return -EINVAL; + + mvq = &ndev->vqs[start_vq]; if (!mvq->initialized) return 0; @@ -1690,13 +1695,9 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq /* Due to a FW quirk we need to modify the VQ fields first then change state. * This should be fixed soon. After that, a single command can be used. */ - err = modify_virtqueues(ndev, mvq->index, 1, mvq->fw_state); - if (err) { - mlx5_vdpa_err(&ndev->mvdev, - "modify vq properties failed for vq %u, err: %d\n", - mvq->index, err); + err = modify_virtqueues(ndev, start_vq, num_vqs, mvq->fw_state); + if (err) return err; - } break; case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: if (!is_resumable(ndev)) { @@ -1712,25 +1713,12 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq return -EINVAL; } - err = modify_virtqueues(ndev, mvq->index, 1, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); - if (err) - mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, err: %d\n", - mvq->index, err); - - return err; + return modify_virtqueues(ndev, start_vq, num_vqs, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); } -static int resume_vqs(struct mlx5_vdpa_net *ndev) +static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { - int err = 0; - - for (int i = 0; i < ndev->cur_num_vqs; i++) { - int local_err = resume_vq(ndev, &ndev->vqs[i]); - - err = local_err ? local_err : err; - } - - return err; + return resume_vqs(ndev, mvq->index, 1); } static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) @@ -3080,7 +3068,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, return err; } - resume_vqs(ndev); + resume_vqs(ndev, 0, ndev->cur_num_vqs); return 0; } @@ -3204,7 +3192,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) teardown_vq_resources(ndev); if (ndev->setup) { - err = resume_vqs(ndev); + err = resume_vqs(ndev, 0, ndev->cur_num_vqs); if (err) { mlx5_vdpa_warn(mvdev, "failed to resume VQs\n"); goto err_driver; @@ -3628,7 +3616,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev) down_write(&ndev->reslock); mvdev->suspended = false; - err = resume_vqs(ndev); + err = resume_vqs(ndev, 0, ndev->cur_num_vqs); register_link_notifier(ndev); up_write(&ndev->reslock); From 55a7cb05b0a6c6cd6e3f482551cf93c398f1b4c9 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:57 +0300 Subject: [PATCH 18/34] vdpa/mlx5: Keep notifiers during suspend but ignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unregistering notifiers is a costly operation. 
Instead of removing the notifiers during device suspend and adding them back at resume, simply ignore the call when the device is suspended. At resume time call queue_link_work() to make sure that the device state is propagated in case there were changes. For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM, 32 CPUs x 2 threads per core), the device suspend time is reduced from ~13 ms to ~2.5 ms. Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Acked-by: Eugenio Pérez Message-Id: <20240816090159.1967650-9-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index fc9fb8d9ac4f..6d1207d7ae75 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2850,6 +2850,9 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p struct mlx5_eqe *eqe = param; int ret = NOTIFY_DONE; + if (ndev->mvdev.suspended) + return NOTIFY_DONE; + if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { switch (eqe->sub_type) { case MLX5_PORT_CHANGE_SUBTYPE_DOWN: @@ -3595,7 +3598,6 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) mlx5_vdpa_info(mvdev, "suspending device\n"); down_write(&ndev->reslock); - unregister_link_notifier(ndev); err = suspend_vqs(ndev, 0, ndev->cur_num_vqs); mlx5_vdpa_cvq_suspend(mvdev); mvdev->suspended = true; @@ -3617,7 +3619,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev) down_write(&ndev->reslock); mvdev->suspended = false; err = resume_vqs(ndev, 0, ndev->cur_num_vqs); - register_link_notifier(ndev); + queue_link_work(ndev); up_write(&ndev->reslock); return err; From 74c89072f22600cc3d83fc70617b1b6c2f500013 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:58 +0300 Subject: [PATCH 19/34] vdpa/mlx5: Small improvement for change_num_qps() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit change_num_qps() has a lot of multiplications by 2 to convert the number of VQ pairs to number of VQs. This patch simplifies the code by doing the VQP -> VQ count conversion at the beginning in a variable. Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-10-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 6d1207d7ae75..8138fba7ae94 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2219,16 +2219,17 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - int cur_qps = ndev->cur_num_vqs / 2; + int cur_vqs = ndev->cur_num_vqs; + int new_vqs = newqps * 2; int err; int i; - if (cur_qps > newqps) { - err = modify_rqt(ndev, 2 * newqps); + if (cur_vqs > new_vqs) { + err = modify_rqt(ndev, new_vqs); if (err) return err; - for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) { + for (i = cur_vqs - 1; i >= new_vqs; i--) { struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i]; if (is_resumable(ndev)) @@ -2237,27 +2238,27 @@ static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) teardown_vq(ndev, mvq); } - ndev->cur_num_vqs = 2 * newqps; + ndev->cur_num_vqs = new_vqs; } else { - ndev->cur_num_vqs = 2 * newqps; - for (i = cur_qps * 2; i < 2 * newqps; i++) { + ndev->cur_num_vqs = new_vqs; + for (i = cur_vqs; i < new_vqs; i++) { struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i]; err = mvq->initialized ? resume_vq(ndev, mvq) : setup_vq(ndev, mvq, true); if (err) goto clean_added; } - err = modify_rqt(ndev, 2 * newqps); + err = modify_rqt(ndev, new_vqs); if (err) goto clean_added; } return 0; clean_added: - for (--i; i >= 2 * cur_qps; --i) + for (--i; i >= cur_vqs; --i) teardown_vq(ndev, &ndev->vqs[i]); - ndev->cur_num_vqs = 2 * cur_qps; + ndev->cur_num_vqs = cur_vqs; return err; } From 9dba41951ab64596c58f170f79a696c2cf83ff4a Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 16 Aug 2024 12:01:59 +0300 Subject: [PATCH 20/34] vdpa/mlx5: Parallelize VQ suspend/resume for CVQ MQ command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit change_num_qps() is still suspending/resuming VQs one by one. This change switches to parallel suspend/resume. When increasing the number of queues the flow has changed a bit for simplicity: the setup_vq() function will always be called before resume_vqs(). If the VQ is initialized, setup_vq() will exit early. If the VQ is not initialized, setup_vq() will create it and resume_vqs() will resume it. Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Message-Id: <20240816090159.1967650-11-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 8138fba7ae94..02b06957e0f3 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2229,25 +2229,27 @@ static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) if (err) return err; - for (i = cur_vqs - 1; i >= new_vqs; i--) { - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i]; - - if (is_resumable(ndev)) - suspend_vq(ndev, mvq); - else - teardown_vq(ndev, mvq); + if (is_resumable(ndev)) { + suspend_vqs(ndev, new_vqs, cur_vqs - new_vqs); + } else { + for (i = new_vqs; i < cur_vqs; i++) + teardown_vq(ndev, &ndev->vqs[i]); } ndev->cur_num_vqs = new_vqs; } else { ndev->cur_num_vqs = new_vqs; - for (i = cur_vqs; i < new_vqs; i++) { - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i]; - err = mvq->initialized ? resume_vq(ndev, mvq) : setup_vq(ndev, mvq, true); + for (i = cur_vqs; i < new_vqs; i++) { + err = setup_vq(ndev, &ndev->vqs[i], false); if (err) goto clean_added; } + + err = resume_vqs(ndev, cur_vqs, new_vqs - cur_vqs); + if (err) + goto clean_added; + err = modify_rqt(ndev, new_vqs); if (err) goto clean_added; From 561a16366ef57caad66d0dfe49275cd3f809c138 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 19 Aug 2024 22:09:30 +0800 Subject: [PATCH 21/34] vdpa: Remove unused declarations There is no caller or implementation in the tree. Signed-off-by: Yue Haibing Message-Id: <20240819140930.122019-1-yuehaibing@huawei.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Zhu Lingshan --- drivers/vdpa/ifcvf/ifcvf_base.h | 3 --- drivers/vdpa/pds/cmds.h | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 0f347717021a..aa36de361c10 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -112,15 +112,12 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 offset, const void *src, int length); u8 ifcvf_get_status(struct ifcvf_hw *hw); void ifcvf_set_status(struct ifcvf_hw *hw, u8 status); -void io_write64_twopart(u64 val, u32 *lo, u32 *hi); void ifcvf_reset(struct ifcvf_hw *hw); u64 ifcvf_get_dev_features(struct ifcvf_hw *hw); u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); -struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); -int ifcvf_probed_virtio_net(struct ifcvf_hw *hw); u32 ifcvf_get_config_size(struct ifcvf_hw *hw); u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector); u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector); diff --git a/drivers/vdpa/pds/cmds.h b/drivers/vdpa/pds/cmds.h index e24d85cb8f1c..6b1bc33356b0 100644 --- a/drivers/vdpa/pds/cmds.h +++ b/drivers/vdpa/pds/cmds.h @@ -14,5 +14,4 @@ int pds_vdpa_cmd_init_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, struct pds_vdpa_vq_info *vq_info); int pds_vdpa_cmd_reset_vq(struct pds_vdpa_device *pdsv, u16 qid, u16 invert_idx, struct pds_vdpa_vq_info *vq_info); -int pds_vdpa_cmd_set_features(struct pds_vdpa_device *pdsv, u64 features); #endif /* _VDPA_CMDS_H_ */ From 4045b6429874e07f14b5b41e326d4e6f866f8bbf Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 25 Aug 2024 16:07:15 +0300 Subject: [PATCH 22/34] virtio_fs: introduce virtio_fs_put_locked helper Introduce a new helper function virtio_fs_put_locked to encapsulate the common pattern of releasing a virtio_fs reference while holding a lock. The existing virtio_fs_put helper will be used to release a virtio_fs reference while not holding a lock. Also add an assertion in case the lock is not taken when it should. Reviewed-by: Idan Zach Reviewed-by: Shai Malin Signed-off-by: Max Gurtovoy Message-Id: <20240825130716.9506-1-mgurtovoy@nvidia.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi --- fs/fuse/virtio_fs.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index dd5260141615..43f7be1d7887 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -201,18 +201,25 @@ static const struct kobj_type virtio_fs_ktype = { }; /* Make sure virtiofs_mutex is held */ +static void virtio_fs_put_locked(struct virtio_fs *fs) +{ + lockdep_assert_held(&virtio_fs_mutex); + + kobject_put(&fs->kobj); +} + static void virtio_fs_put(struct virtio_fs *fs) { - kobject_put(&fs->kobj); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put_locked(fs); + mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) { struct virtio_fs *vfs = fiq->priv; - mutex_lock(&virtio_fs_mutex); virtio_fs_put(vfs); - mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) @@ -1052,7 +1059,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) vdev->priv = NULL; /* Put device reference on virtio_fs object */ - virtio_fs_put(fs); + virtio_fs_put_locked(fs); mutex_unlock(&virtio_fs_mutex); } @@ -1596,9 +1603,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) out_err: kfree(fc); - mutex_lock(&virtio_fs_mutex); virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); return err; } From 87cbdc396a31ce29b0849705e565c81564d5ed4b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 25 Aug 2024 16:07:16 +0300 Subject: [PATCH 23/34] virtio_fs: add sysfs entries for queue information Introduce sysfs entries to provide visibility to the multiple queues used by the Virtio FS device. This enhancement allows users to query information about these queues. Specifically, add two sysfs entries: 1. Queue name: Provides the name of each queue (e.g. hiprio/requests.8). 2. CPU list: Shows the list of CPUs that can process requests for each queue. The CPU list feature is inspired by similar functionality in the block MQ layer, which provides analogous sysfs entries for block devices. These new sysfs entries will improve observability and aid in debugging and performance tuning of Virtio FS devices. Reviewed-by: Idan Zach Reviewed-by: Shai Malin Signed-off-by: Max Gurtovoy Message-Id: <20240825130716.9506-2-mgurtovoy@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- fs/fuse/virtio_fs.c | 147 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 139 insertions(+), 8 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 43f7be1d7887..78f579463cca 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -56,12 +56,14 @@ struct virtio_fs_vq { bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ + struct kobject *kobj; char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ struct virtio_fs { struct kobject kobj; + struct kobject *mqs_kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -200,6 +202,74 @@ static const struct kobj_type virtio_fs_ktype = { .default_groups = virtio_fs_groups, }; +static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs, + struct kobject *kobj) +{ + int i; + + for (i = 0; i < fs->nvqs; i++) { + if (kobj == fs->vqs[i].kobj) + return &fs->vqs[i]; + } + return NULL; +} + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + + if (!fsvq) + return -EINVAL; + return sysfs_emit(buf, "%s\n", fsvq->name); +} + +static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name); + +static ssize_t cpu_list_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + unsigned int cpu, qid; + const size_t size = PAGE_SIZE - 1; + bool first = true; + int ret = 0, pos = 0; + + if (!fsvq) + return -EINVAL; + + qid = fsvq->vq->index; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid - VQ_REQUEST)) { + if (first) + ret = snprintf(buf + pos, size - pos, "%u", cpu); + else + ret = snprintf(buf + pos, size - pos, ", %u", cpu); + + if (ret >= size - pos) + break; + first = false; + pos += ret; + } + } + ret = snprintf(buf + pos, size + 1 - pos, "\n"); + return pos + ret; +} + +static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list); + +static struct attribute *virtio_fs_vq_attrs[] = { + &virtio_fs_vq_name_attr.attr, + &virtio_fs_vq_cpu_list_attr.attr, + NULL +}; + +static struct attribute_group virtio_fs_vq_attr_group = { + .attrs = virtio_fs_vq_attrs, +}; + /* Make sure virtiofs_mutex is held */ static void virtio_fs_put_locked(struct virtio_fs *fs) { @@ -280,6 +350,50 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } } +static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + kobject_put(fsvq->kobj); + } +} + +static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + char buff[12]; + int i, j, ret; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + + sprintf(buff, "%d", i); + fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; + } + + ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group); + if (ret) { + kobject_put(fsvq->kobj); + goto out_del; + } + } + + return 0; + +out_del: + for (j = 0; j < i; j++) { + fsvq = &fs->vqs[j]; + kobject_put(fsvq->kobj); + } + return ret; +} + /* Add a new instance to the list or return -EEXIST if 
tag name exists*/ static int virtio_fs_add_instance(struct virtio_device *vdev, struct virtio_fs *fs) @@ -303,17 +417,22 @@ */ fs->kobj.kset = virtio_fs_kset; ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); - if (ret < 0) { - mutex_unlock(&virtio_fs_mutex); - return ret; + if (ret < 0) + goto out_unlock; + + fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; } ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); - if (ret < 0) { - kobject_del(&fs->kobj); - mutex_unlock(&virtio_fs_mutex); - return ret; - } + if (ret < 0) + goto out_put; + + ret = virtio_fs_add_queues_sysfs(fs); + if (ret) + goto out_remove; list_add_tail(&fs->list, &virtio_fs_instances); @@ -322,6 +441,16 @@ static int virtio_fs_add_instance(struct virtio_device *vdev, kobject_uevent(&fs->kobj, KOBJ_ADD); return 0; + +out_remove: + sysfs_remove_link(&fs->kobj, "device"); +out_put: + kobject_put(fs->mqs_kobj); +out_del: + kobject_del(&fs->kobj); +out_unlock: + mutex_unlock(&virtio_fs_mutex); + return ret; } /* Return the virtio_fs with a given tag, or NULL */ @@ -1050,7 +1179,9 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + virtio_fs_delete_queues_sysfs(fs); sysfs_remove_link(&fs->kobj, "device"); + kobject_put(fs->mqs_kobj); kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); From db0a314f845abf9572d5826f4cfdecb93b838952 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 29 Aug 2024 16:37:57 +0200 Subject: [PATCH 24/34] MAINTAINERS: add virtio-vsock driver in the VIRTIO CORE section The virtio-vsock driver is already under VM SOCKETS (AF_VSOCK), managed principally with the net tree, and VIRTIO AND VHOST VSOCK DRIVER. However, changes that only affect the virtio part usually go with Michael's tree, so let's also put the driver in the VIRTIO CORE section to have its maintainers in CC for changes to the virtio-vsock driver. Cc: "Michael S. Tsirkin" Cc: Jason Wang Signed-off-by: Stefano Garzarella Message-Id: <20240829143757.85844-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Acked-by: Jason Wang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe83ba7194ea..e06bb4cc62bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24221,6 +24221,7 @@ F: include/linux/vdpa.h F: include/linux/virtio*.h F: include/linux/vringh.h F: include/uapi/linux/virtio_*.h +F: net/vmw_vsock/virtio* F: tools/virtio/ F: tools/testing/selftests/drivers/net/virtio_net/ From 0071b138d44af4296bf871e6624369ce697b4b15 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:32 +0300 Subject: [PATCH 25/34] vdpa/mlx5: Create direct MKEYs in parallel Use the async interface to issue MTT MKEY creation. Extra care is taken at the allocation of FW input commands due to the MTT tables having variable sizes depending on MR. The indirect MKEY is still created synchronously at the end as the direct MKEYs need to be filled in. This makes create_user_mr() 3-5x faster, depending on the size of the MR. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240830105838.2666587-3-dtatulea@nvidia.com> Signed-off-by: Michael S.
Tsirkin --- drivers/vdpa/mlx5/core/mr.c | 122 +++++++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 23 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index bf56f3d69625..6cca41972332 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -49,17 +49,18 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt) } } -static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) -{ - int inlen; - void *mkc; - void *in; - int err; +struct mlx5_create_mkey_mem { + u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)]; + u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; + __be64 mtt[]; +}; - inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; +static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_create_mkey_mem *mem) +{ + void *in = &mem->in; + void *mkc; MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -76,18 +77,25 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct MLX5_SET(create_mkey_in, in, translations_octword_actual_size, get_octo_len(mr->end - mr->start, mr->log_size)); populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt)); - err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen); - kvfree(in); - if (err) { - mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n"); - return err; - } - return 0; + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid); +} + +static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_create_mkey_mem *mem) +{ + u32 mkey_index = MLX5_GET(create_mkey_out, mem->out, mkey_index); + + mr->mr = mlx5_idx_to_mkey(mkey_index); } static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { + if (!mr->mr) + return; + mlx5_vdpa_destroy_mkey(mvdev, mr->mr); } @@ -179,6 +187,76 @@ static int klm_byte_size(int nklms) return 16 * ALIGN(nklms, 4); } +#define MLX5_VDPA_MTT_ALIGN 16 + +static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + struct mlx5_vdpa_async_cmd *cmds; + struct mlx5_vdpa_direct_mr *dmr; + int err = 0; + int i = 0; + + cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_create_mkey_mem *cmd_mem; + int mttlen, mttcount; + + mttlen = roundup(MLX5_ST_SZ_BYTES(mtt) * dmr->nsg, MLX5_VDPA_MTT_ALIGN); + mttcount = mttlen / sizeof(cmd_mem->mtt[0]); + cmd_mem = kvcalloc(1, struct_size(cmd_mem, mtt, mttcount), GFP_KERNEL); + if (!cmd_mem) { + err = -ENOMEM; + goto done; + } + + cmds[i].out = cmd_mem->out; + cmds[i].outlen = sizeof(cmd_mem->out); + cmds[i].in = cmd_mem->in; + cmds[i].inlen = struct_size(cmd_mem, mtt, mttcount); + + fill_create_direct_mr(mvdev, dmr, cmd_mem); + + i++; + } + + err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs); + if (err) { + + mlx5_vdpa_err(mvdev, "error issuing MTT mkey creation for direct mrs: %d\n", err); + goto done; + } + + i = 0; + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i++]; + struct mlx5_create_mkey_mem *cmd_mem; + + cmd_mem = container_of(cmd->out, struct mlx5_create_mkey_mem, out); + + if (!cmd->err) { + create_direct_mr_end(mvdev, 
dmr, cmd_mem); + } else { + err = err ? err : cmd->err; + mlx5_vdpa_err(mvdev, "error creating MTT mkey [0x%llx, 0x%llx]: %d\n", + dmr->start, dmr->end, cmd->err); + } + } + +done: + for (i = i-1; i >= 0; i--) { + struct mlx5_create_mkey_mem *cmd_mem; + + cmd_mem = container_of(cmds[i].out, struct mlx5_create_mkey_mem, out); + kvfree(cmd_mem); + } + + kvfree(cmds); + return err; +} + static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { int inlen; @@ -279,14 +357,8 @@ done: goto err_map; } - err = create_direct_mr(mvdev, mr); - if (err) - goto err_direct; - return 0; -err_direct: - dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); err_map: sg_free_table(&mr->sg_head); return err; @@ -401,6 +473,10 @@ static int create_user_mr(struct mlx5_vdpa_dev *mvdev, if (err) goto err_chain; + err = create_direct_keys(mvdev, mr); + if (err) + goto err_chain; + /* Create the memory key that defines the guests's address space. This * memory key refers to the direct keys that contain the MTT * translations From e1ba5c947e56ccb09773eebfb730cae458b6a4fd Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:33 +0300 Subject: [PATCH 26/34] vdpa/mlx5: Delete direct MKEYs in parallel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the async interface to issue MTT MKEY deletion. This makes destroy_user_mr() on average 8x faster. This number is also dependent on the size of the MR being deleted. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Acked-by: Eugenio Pérez Message-Id: <20240830105838.2666587-4-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mr.c | 64 +++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 6cca41972332..a7c3cf9281b2 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -55,6 +55,11 @@ struct mlx5_create_mkey_mem { __be64 mtt[]; }; +struct mlx5_destroy_mkey_mem { + u8 out[MLX5_ST_SZ_BYTES(destroy_mkey_out)]; + u8 in[MLX5_ST_SZ_BYTES(destroy_mkey_in)]; +}; + static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr, struct mlx5_create_mkey_mem *mem) @@ -91,6 +96,17 @@ static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev, mr->mr = mlx5_idx_to_mkey(mkey_index); } +static void fill_destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_direct_mr *mr, + struct mlx5_destroy_mkey_mem *mem) +{ + void *in = &mem->in; + + MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid); + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mr->mr)); +} + static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { if (!mr->mr) @@ -257,6 +273,53 @@ done: return err; } +DEFINE_FREE(free_cmds, struct mlx5_vdpa_async_cmd *, kvfree(_T)) +DEFINE_FREE(free_cmd_mem, struct mlx5_destroy_mkey_mem *, kvfree(_T)) + +static int destroy_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) +{ + struct mlx5_destroy_mkey_mem *cmd_mem __free(free_cmd_mem) = NULL; + struct mlx5_vdpa_async_cmd *cmds __free(free_cmds) = NULL; + struct mlx5_vdpa_direct_mr *dmr; + int err = 0; + int i = 0; + + cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL); + cmd_mem = kvcalloc(mr->num_directs, sizeof(*cmd_mem), GFP_KERNEL); + if (!cmds || !cmd_mem) + return -ENOMEM; +
list_for_each_entry(dmr, &mr->head, list) { + cmds[i].out = cmd_mem[i].out; + cmds[i].outlen = sizeof(cmd_mem[i].out); + cmds[i].in = cmd_mem[i].in; + cmds[i].inlen = sizeof(cmd_mem[i].in); + fill_destroy_direct_mr(mvdev, dmr, &cmd_mem[i]); + i++; + } + + err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs); + if (err) { + + mlx5_vdpa_err(mvdev, "error issuing MTT mkey deletion for direct mrs: %d\n", err); + return err; + } + + i = 0; + list_for_each_entry(dmr, &mr->head, list) { + struct mlx5_vdpa_async_cmd *cmd = &cmds[i++]; + + dmr->mr = 0; + if (cmd->err) { + err = err ? err : cmd->err; + mlx5_vdpa_err(mvdev, "error deleting MTT mkey [0x%llx, 0x%llx]: %d\n", + dmr->start, dmr->end, cmd->err); + } + } + + return err; +} + static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { int inlen; @@ -565,6 +628,7 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr struct mlx5_vdpa_direct_mr *n; destroy_indirect_key(mvdev, mr); + destroy_direct_keys(mvdev, mr); list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { list_del_init(&dmr->list); unmap_direct_mr(mvdev, dmr); From 0b916a9c45d92c69270f2b44a35468fe6e331c2f Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:34 +0300 Subject: [PATCH 27/34] vdpa/mlx5: Rename function A followup patch will use this name for something else. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240830105838.2666587-5-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 +- drivers/vdpa/mlx5/core/mr.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 24fa00afb24f..4d217d18239c 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -135,7 +135,7 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb); -void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev); void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr); void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index a7c3cf9281b2..a52d7c3bf7e7 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -719,7 +719,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) } -void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) +void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev) { if (!mvdev->res.valid) return; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 02b06957e0f3..7ec2b7ba830b 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3223,7 +3223,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) err_driver: unregister_link_notifier(ndev); err_setup: - mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); + mlx5_vdpa_clean_mrs(&ndev->mvdev); ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; err_clear: up_write(&ndev->reslock); @@ -3275,7 +3275,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags) } if (flags & VDPA_RESET_F_CLEAN_MAP) - mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); 
+ mlx5_vdpa_clean_mrs(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.suspended = false; ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT; @@ -3433,7 +3433,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) ndev = to_mlx5_vdpa_ndev(mvdev); free_fixed_resources(ndev); - mlx5_vdpa_destroy_mr_resources(mvdev); + mlx5_vdpa_clean_mrs(mvdev); if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); @@ -4008,7 +4008,7 @@ err_reg: err_res2: free_fixed_resources(ndev); err_mr: - mlx5_vdpa_destroy_mr_resources(mvdev); + mlx5_vdpa_clean_mrs(mvdev); err_res: mlx5_vdpa_free_resources(&ndev->mvdev); err_mpfs: From 5fc85679076623a5c39ec09277144fb0bbf0c6ed Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:35 +0300 Subject: [PATCH 28/34] vdpa/mlx5: Extract mr members in own resource struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Group all mapping related resources into their own structure. Upcoming patches will add more members in this new structure. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Acked-by: Eugenio Pérez Message-Id: <20240830105838.2666587-6-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 13 ++++++----- drivers/vdpa/mlx5/core/mr.c | 30 ++++++++++++------------- drivers/vdpa/mlx5/core/resources.c | 6 ++--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 36 +++++++++++++++--------------- 4 files changed, 44 insertions(+), 41 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 4d217d18239c..5ae6deea2a8a 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -83,10 +83,18 @@ enum { MLX5_VDPA_NUM_AS = 2 }; +struct mlx5_vdpa_mr_resources { + struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; + unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; + struct list_head mr_list_head; + struct mutex mr_mtx; +}; + struct mlx5_vdpa_dev { struct vdpa_device vdev; struct mlx5_core_dev *mdev; struct mlx5_vdpa_resources res; + struct mlx5_vdpa_mr_resources mres; u64 mlx_features; u64 actual_features; @@ -95,13 +103,8 @@ struct mlx5_vdpa_dev { u16 max_idx; u32 generation; - struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; - struct list_head mr_list_head; - /* serialize mr access */ - struct mutex mr_mtx; struct mlx5_control_vq cvq; struct workqueue_struct *wq; - unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; bool suspended; struct mlx5_async_ctx async_ctx; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index a52d7c3bf7e7..70311a41a5ba 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - mutex_lock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); _mlx5_vdpa_put_mr(mvdev, mr); - mutex_unlock(&mvdev->mr_mtx); + mutex_unlock(&mvdev->mres.mr_mtx); } static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, @@ -683,39 +683,39 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - mutex_lock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); _mlx5_vdpa_get_mr(mvdev, mr); - mutex_unlock(&mvdev->mr_mtx); + mutex_unlock(&mvdev->mres.mr_mtx); } void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *new_mr, unsigned int asid) { - 
struct mlx5_vdpa_mr *old_mr = mvdev->mr[asid]; + struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid]; - mutex_lock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); _mlx5_vdpa_put_mr(mvdev, old_mr); - mvdev->mr[asid] = new_mr; + mvdev->mres.mr[asid] = new_mr; - mutex_unlock(&mvdev->mr_mtx); + mutex_unlock(&mvdev->mres.mr_mtx); } static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr *mr; - mutex_lock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); - list_for_each_entry(mr, &mvdev->mr_list_head, mr_list) { + list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) { mlx5_vdpa_warn(mvdev, "mkey still alive after resource delete: " "mr: %p, mkey: 0x%x, refcount: %u\n", mr, mr->mkey, refcount_read(&mr->refcount)); } - mutex_unlock(&mvdev->mr_mtx); + mutex_unlock(&mvdev->mres.mr_mtx); } @@ -756,7 +756,7 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, if (err) goto err_iotlb; - list_add_tail(&mr->mr_list, &mvdev->mr_list_head); + list_add_tail(&mr->mr_list, &mvdev->mres.mr_list_head); return 0; @@ -782,9 +782,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, if (!mr) return ERR_PTR(-ENOMEM); - mutex_lock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb); - mutex_unlock(&mvdev->mr_mtx); + mutex_unlock(&mvdev->mres.mr_mtx); if (err) goto out_err; @@ -804,7 +804,7 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev, { int err; - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid) + if (mvdev->mres.group2asid[MLX5_VDPA_CVQ_GROUP] != asid) return 0; spin_lock(&mvdev->cvq.iommu_lock); diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 22ea32fe007b..3e3b3049cb08 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -256,7 +256,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) mlx5_vdpa_warn(mvdev, "resources already allocated\n"); return -EINVAL; } - mutex_init(&mvdev->mr_mtx); + mutex_init(&mvdev->mres.mr_mtx); res->uar = mlx5_get_uars_page(mdev); if (IS_ERR(res->uar)) { err = PTR_ERR(res->uar); @@ -301,7 +301,7 @@ err_pd: err_uctx: mlx5_put_uars_page(mdev, res->uar); err_uars: - mutex_destroy(&mvdev->mr_mtx); + mutex_destroy(&mvdev->mres.mr_mtx); return err; } @@ -318,7 +318,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) dealloc_pd(mvdev, res->pdn, res->uid); destroy_uctx(mvdev, res->uid); mlx5_put_uars_page(mvdev->mdev, res->uar); - mutex_destroy(&mvdev->mr_mtx); + mutex_destroy(&mvdev->mres.mr_mtx); res->valid = false; } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 7ec2b7ba830b..0332ad577ce4 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -941,11 +941,11 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); - vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; if (vq_mr) MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey); - vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + vq_desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey); @@ -953,11 +953,11 @@ static int 
create_virtqueue(struct mlx5_vdpa_net *ndev, /* If there is no mr update, make sure that the existing ones are set * modify to ready. */ - vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; if (vq_mr) mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY; - vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + vq_desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; if (vq_desc_mr) mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; } @@ -1354,7 +1354,7 @@ static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev, } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { - vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + vq_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]]; if (vq_mr) MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey); @@ -1363,7 +1363,7 @@ static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev, } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { - desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + desc_mr = mvdev->mres.mr[mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey); @@ -1381,8 +1381,8 @@ static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { - unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]; - struct mlx5_vdpa_mr *vq_mr = mvdev->mr[asid]; + unsigned int asid = mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_GROUP]; + struct mlx5_vdpa_mr *vq_mr = mvdev->mres.mr[asid]; mlx5_vdpa_put_mr(mvdev, mvq->vq_mr); mlx5_vdpa_get_mr(mvdev, vq_mr); @@ -1390,8 +1390,8 @@ static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev, } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { - unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]; - struct mlx5_vdpa_mr *desc_mr = mvdev->mr[asid]; + unsigned int asid = mvdev->mres.group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]; + struct mlx5_vdpa_mr *desc_mr = mvdev->mres.mr[asid]; mlx5_vdpa_put_mr(mvdev, mvq->desc_mr); mlx5_vdpa_get_mr(mvdev, desc_mr); @@ -3235,7 +3235,7 @@ static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev) /* default mapping all groups are mapped to asid 0 */ for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++) - mvdev->group2asid[i] = 0; + mvdev->mres.group2asid[i] = 0; } static bool needs_vqs_reset(const struct mlx5_vdpa_dev *mvdev) @@ -3353,7 +3353,7 @@ static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, new_mr = NULL; } - if (!mvdev->mr[asid]) { + if (!mvdev->mres.mr[asid]) { mlx5_vdpa_update_mr(mvdev, new_mr, asid); } else { err = mlx5_vdpa_change_map(mvdev, new_mr, asid); @@ -3637,12 +3637,12 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, if (group >= MLX5_VDPA_NUMVQ_GROUPS) return -EINVAL; - mvdev->group2asid[group] = asid; + mvdev->mres.group2asid[group] = asid; - mutex_lock(&mvdev->mr_mtx); - if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mr[asid]) - err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mr[asid]->iotlb, asid); - mutex_unlock(&mvdev->mr_mtx); + mutex_lock(&mvdev->mres.mr_mtx); + if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mres.mr[asid]) + err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mres.mr[asid]->iotlb, asid); + 
mutex_unlock(&mvdev->mres.mr_mtx); return err; } @@ -3962,7 +3962,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, if (err) goto err_mpfs; - INIT_LIST_HEAD(&mvdev->mr_list_head); + INIT_LIST_HEAD(&mvdev->mres.mr_list_head); if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { err = mlx5_vdpa_create_dma_mr(mvdev); From 58d4d50e758ab1e880b30ba815d733d46f5cbfac Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:36 +0300 Subject: [PATCH 29/34] vdpa/mlx5: Rename mr_mtx -> lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the mr resources have their own namespace in the struct, give the lock a clearer name. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Acked-by: Eugenio Pérez Message-Id: <20240830105838.2666587-7-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 +- drivers/vdpa/mlx5/core/mr.c | 20 ++++++++++---------- drivers/vdpa/mlx5/core/resources.c | 6 +++--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 5ae6deea2a8a..89b564cecddf 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -87,7 +87,7 @@ struct mlx5_vdpa_mr_resources { struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; struct list_head mr_list_head; - struct mutex mr_mtx; + struct mutex lock; }; struct mlx5_vdpa_dev { diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 70311a41a5ba..cac470125612 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); _mlx5_vdpa_put_mr(mvdev, mr); - mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); } static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, @@ -683,9 +683,9 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); _mlx5_vdpa_get_mr(mvdev, mr); - mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); } void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, @@ -694,19 +694,19 @@ void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, { struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid]; - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); _mlx5_vdpa_put_mr(mvdev, old_mr); mvdev->mres.mr[asid] = new_mr; - mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); } static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr *mr; - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) { @@ -715,7 +715,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) mr, mr->mkey, refcount_read(&mr->refcount)); } - mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); } @@ -782,9 +782,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, if (!mr) return ERR_PTR(-ENOMEM); - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb); - 
mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); if (err) goto out_err; diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index 3e3b3049cb08..fe2ca3458f6c 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -256,7 +256,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) mlx5_vdpa_warn(mvdev, "resources already allocated\n"); return -EINVAL; } - mutex_init(&mvdev->mres.mr_mtx); + mutex_init(&mvdev->mres.lock); res->uar = mlx5_get_uars_page(mdev); if (IS_ERR(res->uar)) { err = PTR_ERR(res->uar); @@ -301,7 +301,7 @@ err_pd: err_uctx: mlx5_put_uars_page(mdev, res->uar); err_uars: - mutex_destroy(&mvdev->mres.mr_mtx); + mutex_destroy(&mvdev->mres.lock); return err; } @@ -318,7 +318,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) dealloc_pd(mvdev, res->pdn, res->uid); destroy_uctx(mvdev, res->uid); mlx5_put_uars_page(mvdev->mdev, res->uar); - mutex_destroy(&mvdev->mres.mr_mtx); + mutex_destroy(&mvdev->mres.lock); res->valid = false; } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 0332ad577ce4..fc915c834f56 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3639,10 +3639,10 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, mvdev->mres.group2asid[group] = asid; - mutex_lock(&mvdev->mres.mr_mtx); + mutex_lock(&mvdev->mres.lock); if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mres.mr[asid]) err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mres.mr[asid]->iotlb, asid); - mutex_unlock(&mvdev->mres.mr_mtx); + mutex_unlock(&mvdev->mres.lock); return err; } From f30a1232b6979c7fc14e821cb349c40073c6191d Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:37 +0300 Subject: [PATCH 30/34] vdpa/mlx5: Introduce init/destroy for MR resources There's currently not a lot of action happening during the init/destroy of MR resources. But more will be added in the upcoming patches. As the mr mutex lock init/destroy has been moved to these new functions, the lifetime has now shifted away from mlx5_vdpa_alloc_resources() / mlx5_vdpa_free_resources() into these new functions. However, the lifetime at the outer scope remains the same: mlx5_vdpa_dev_add() / mlx5_vdpa_dev_free() Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240830105838.2666587-8-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 ++ drivers/vdpa/mlx5/core/mr.c | 17 +++++++++++++++++ drivers/vdpa/mlx5/core/resources.c | 3 --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +++++++-- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 89b564cecddf..c3e17bc888e8 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -138,6 +138,8 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb); +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev); +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev); void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev); void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr); diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index cac470125612..64683a39d3db 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -846,3 +846,20 @@ int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) return 0; } + +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + + INIT_LIST_HEAD(&mres->mr_list_head); + mutex_init(&mres->lock); + + return 0; +} + +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + + mutex_destroy(&mres->lock); +} diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index fe2ca3458f6c..aeae31d0cefa 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -256,7 +256,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) mlx5_vdpa_warn(mvdev, "resources already allocated\n"); return -EINVAL; } - mutex_init(&mvdev->mres.lock); res->uar = mlx5_get_uars_page(mdev); if (IS_ERR(res->uar)) { err = PTR_ERR(res->uar); @@ -301,7 +300,6 @@ err_pd: err_uctx: mlx5_put_uars_page(mdev, res->uar); err_uars: - mutex_destroy(&mvdev->mres.lock); return err; } @@ -318,7 +316,6 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) dealloc_pd(mvdev, res->pdn, res->uid); destroy_uctx(mvdev, res->uid); mlx5_put_uars_page(mvdev->mdev, res->uar); - mutex_destroy(&mvdev->mres.lock); res->valid = false; } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index fc915c834f56..4b7dcfcba444 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3434,6 +3434,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) free_fixed_resources(ndev); mlx5_vdpa_clean_mrs(mvdev); + mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); @@ -3962,12 +3963,14 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, if (err) goto err_mpfs; - INIT_LIST_HEAD(&mvdev->mres.mr_list_head); + err = mlx5_vdpa_init_mr_resources(mvdev); + if (err) + goto err_res; if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { err = mlx5_vdpa_create_dma_mr(mvdev); if (err) - goto err_res; + goto err_mr_res; } err = alloc_fixed_resources(ndev); @@ -4009,6 +4012,8 @@ err_res2: free_fixed_resources(ndev); err_mr: mlx5_vdpa_clean_mrs(mvdev); +err_mr_res: + 
mlx5_vdpa_destroy_mr_resources(mvdev); err_res: mlx5_vdpa_free_resources(&ndev->mvdev); err_mpfs: From 62111654481d5df4be3776a898cb88b5e4974103 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 30 Aug 2024 13:58:38 +0300 Subject: [PATCH 31/34] vdpa/mlx5: Postpone MR deletion Currently, when a new MR is set up, the old MR is deleted. MR deletion takes about 30-40% of the time of MR creation. As deleting the old MR is not important for the process of setting up the new MR, this operation can be postponed. This series adds a workqueue that does MR garbage collection at a later point. If the MR lock is taken, the handler will back off and reschedule. The exception is during shutdown, when the handler must not postpone the work. Note that this is only a speculative optimization: if there is some mapping operation that is triggered while the garbage collector handler has the lock taken, this operation will have to wait for the handler to finish. Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Message-Id: <20240830105838.2666587-9-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 10 ++++++ drivers/vdpa/mlx5/core/mr.c | 55 ++++++++++++++++++++++++++++-- drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 +-- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index c3e17bc888e8..2cedf7e2dbc4 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -86,8 +86,18 @@ enum { struct mlx5_vdpa_mr_resources { struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; + + /* Pre-deletion mr list */ struct list_head mr_list_head; + + /* Deferred mr list */ + struct list_head mr_gc_list_head; + struct workqueue_struct *wq_gc; + struct delayed_work gc_dwork_ent; + struct mutex lock; + + atomic_t shutdown; }; struct mlx5_vdpa_dev { diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 64683a39d3db..2dd21e0b399e 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -653,14 +653,50 @@ static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_ kfree(mr); } +/* There can be multiple .set_map() operations in quick succession. + * This large delay is a simple way to prevent the MR cleanup from blocking + * .set_map() MR creation in this scenario.
+ */ +#define MLX5_VDPA_MR_GC_TRIGGER_MS 2000 + +static void mlx5_vdpa_mr_gc_handler(struct work_struct *work) +{ + struct mlx5_vdpa_mr_resources *mres; + struct mlx5_vdpa_mr *mr, *tmp; + struct mlx5_vdpa_dev *mvdev; + + mres = container_of(work, struct mlx5_vdpa_mr_resources, gc_dwork_ent.work); + + if (atomic_read(&mres->shutdown)) { + mutex_lock(&mres->lock); + } else if (!mutex_trylock(&mres->lock)) { + queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent, + msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS)); + return; + } + + mvdev = container_of(mres, struct mlx5_vdpa_dev, mres); + + list_for_each_entry_safe(mr, tmp, &mres->mr_gc_list_head, mr_list) { + _mlx5_vdpa_destroy_mr(mvdev, mr); + } + + mutex_unlock(&mres->lock); +} + static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { + struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + if (!mr) return; - if (refcount_dec_and_test(&mr->refcount)) - _mlx5_vdpa_destroy_mr(mvdev, mr); + if (refcount_dec_and_test(&mr->refcount)) { + list_move_tail(&mr->mr_list, &mres->mr_gc_list_head); + queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent, + msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS)); + } } void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, @@ -851,9 +887,17 @@ int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; - INIT_LIST_HEAD(&mres->mr_list_head); + mres->wq_gc = create_singlethread_workqueue("mlx5_vdpa_mr_gc"); + if (!mres->wq_gc) + return -ENOMEM; + + INIT_DELAYED_WORK(&mres->gc_dwork_ent, mlx5_vdpa_mr_gc_handler); + mutex_init(&mres->lock); + INIT_LIST_HEAD(&mres->mr_list_head); + INIT_LIST_HEAD(&mres->mr_gc_list_head); + return 0; } @@ -861,5 +905,10 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + atomic_set(&mres->shutdown, 1); + + flush_delayed_work(&mres->gc_dwork_ent); + destroy_workqueue(mres->wq_gc); + mres->wq_gc = NULL; mutex_destroy(&mres->lock); } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 4b7dcfcba444..dee019977716 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3435,6 +3435,8 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) free_fixed_resources(ndev); mlx5_vdpa_clean_mrs(mvdev); mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); + mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); + if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); @@ -4042,8 +4044,6 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mvdev->wq = NULL; destroy_workqueue(wq); mgtdev->ndev = NULL; - - mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); } static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, From 4a21d31d7bcb4c245783119252b0389255964cd2 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 4 Sep 2024 09:17:43 +0800 Subject: [PATCH 32/34] fw_cfg: Constify struct kobj_type This 'struct kobj_type' is not modified. It is only used in kobject_init_and_add() which takes a 'const struct kobj_type *ktype' parameter. Constifying this structure moves it to a read-only section, which can increase overall security.
```
[Before]
   text	   data	    bss	    dec	    hex	filename
   5974	   1008	     96	   7078	   1ba6	drivers/firmware/qemu_fw_cfg.o

[After]
   text	   data	    bss	    dec	    hex	filename
   6038	    944	     96	   7078	   1ba6	drivers/firmware/qemu_fw_cfg.o
```

Signed-off-by: Hongbo Li
Message-Id: <20240904011743.2010319-1-lihongbo22@huawei.com>
Signed-off-by: Michael S. Tsirkin
---
 drivers/firmware/qemu_fw_cfg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index 5f43dfa22f79..85c525745b31 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -452,7 +452,7 @@ static void fw_cfg_sysfs_release_entry(struct kobject *kobj)
 }
 
 /* kobj_type: ties together all properties required to register an entry */
-static struct kobj_type fw_cfg_sysfs_entry_ktype = {
+static const struct kobj_type fw_cfg_sysfs_entry_ktype = {
 	.default_groups = fw_cfg_sysfs_entry_groups,
 	.sysfs_ops = &fw_cfg_sysfs_attr_ops,
 	.release = fw_cfg_sysfs_release_entry,

From 26618da3b2f3d510a3082a1cb0abafc0f92e8362 Mon Sep 17 00:00:00 2001
From: Marco Pinna
Date: Tue, 30 Jul 2024 21:47:31 +0200
Subject: [PATCH 33/34] vsock/virtio: refactor virtio_transport_send_pkt_work

Preliminary patch to introduce an optimization in the enqueue system.

All the code used to enqueue a packet into the virtqueue is removed
from virtio_transport_send_pkt_work() and moved into the new
virtio_transport_send_skb() function.

Co-developed-by: Luigi Leonardi
Signed-off-by: Luigi Leonardi
Signed-off-by: Marco Pinna
Reviewed-by: Stefano Garzarella
Message-Id: <20240730-pinna-v4-1-5c9179164db5@outlook.com>
Signed-off-by: Michael S. Tsirkin
---
 net/vmw_vsock/virtio_transport.c | 105 +++++++++++++++++--------------
 1 file changed, 59 insertions(+), 46 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 64a07acfef12..f641e906f351 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -94,6 +94,63 @@ out_rcu:
 	return ret;
 }
 
+/* The caller needs to hold vsock->tx_lock while using the vq */
+static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq,
+				     struct virtio_vsock *vsock)
+{
+	int ret, in_sg = 0, out_sg = 0;
+	struct scatterlist **sgs;
+
+	sgs = vsock->out_sgs;
+	sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
+		    sizeof(*virtio_vsock_hdr(skb)));
+	out_sg++;
+
+	if (!skb_is_nonlinear(skb)) {
+		if (skb->len > 0) {
+			sg_init_one(sgs[out_sg], skb->data, skb->len);
+			out_sg++;
+		}
+	} else {
+		struct skb_shared_info *si;
+		int i;
+
+		/* If skb is nonlinear, then its buffer must contain
+		 * only the header and nothing more. Data is stored in
+		 * the fragged part.
+		 */
+		WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb)));
+
+		si = skb_shinfo(skb);
+
+		for (i = 0; i < si->nr_frags; i++) {
+			skb_frag_t *skb_frag = &si->frags[i];
+			void *va;
+
+			/* We will use 'page_to_virt()' for the userspace page
+			 * here, because virtio or dma-mapping layers will call
+			 * 'virt_to_phys()' later to fill the buffer descriptor.
+			 * We don't touch memory at the "virtual" address of this page.
+			 */
+			va = page_to_virt(skb_frag_page(skb_frag));
+			sg_init_one(sgs[out_sg],
+				    va + skb_frag_off(skb_frag),
+				    skb_frag_size(skb_frag));
+			out_sg++;
+		}
+	}
+
+	ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
+	/* Usually this means that there is no more space available in
+	 * the vq
+	 */
+	if (ret < 0)
+		return ret;
+
+	virtio_transport_deliver_tap_pkt(skb);
+	return 0;
+}
+
 static void
 virtio_transport_send_pkt_work(struct work_struct *work)
 {
@@ -111,66 +168,22 @@ virtio_transport_send_pkt_work(struct work_struct *work)
 	vq = vsock->vqs[VSOCK_VQ_TX];
 
 	for (;;) {
-		int ret, in_sg = 0, out_sg = 0;
-		struct scatterlist **sgs;
 		struct sk_buff *skb;
 		bool reply;
+		int ret;
 
 		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
 		if (!skb)
 			break;
 
 		reply = virtio_vsock_skb_reply(skb);
-		sgs = vsock->out_sgs;
-		sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
-			    sizeof(*virtio_vsock_hdr(skb)));
-		out_sg++;
 
-		if (!skb_is_nonlinear(skb)) {
-			if (skb->len > 0) {
-				sg_init_one(sgs[out_sg], skb->data, skb->len);
-				out_sg++;
-			}
-		} else {
-			struct skb_shared_info *si;
-			int i;
-
-			/* If skb is nonlinear, then its buffer must contain
-			 * only the header and nothing more. Data is stored in
-			 * the fragged part.
-			 */
-			WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb)));
-
-			si = skb_shinfo(skb);
-
-			for (i = 0; i < si->nr_frags; i++) {
-				skb_frag_t *skb_frag = &si->frags[i];
-				void *va;
-
-				/* We will use 'page_to_virt()' for the userspace page
-				 * here, because virtio or dma-mapping layers will call
-				 * 'virt_to_phys()' later to fill the buffer descriptor.
-				 * We don't touch memory at the "virtual" address of this page.
-				 */
-				va = page_to_virt(skb_frag_page(skb_frag));
-				sg_init_one(sgs[out_sg],
-					    va + skb_frag_off(skb_frag),
-					    skb_frag_size(skb_frag));
-				out_sg++;
-			}
-		}
-
-		ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
-		/* Usually this means that there is no more space available in
-		 * the vq
-		 */
+		ret = virtio_transport_send_skb(skb, vq, vsock);
 		if (ret < 0) {
 			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
 			break;
 		}
 
-		virtio_transport_deliver_tap_pkt(skb);
-
 		if (reply) {
 			struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
 			int val;

From efcd71af38be403fa52223092f79ada446e121ba Mon Sep 17 00:00:00 2001
From: Luigi Leonardi
Date: Tue, 30 Jul 2024 21:47:32 +0200
Subject: [PATCH 34/34] vsock/virtio: avoid queuing packets when intermediate
 queue is empty

When the driver needs to send new packets to the device, it always
queues the new sk_buffs into an intermediate queue (send_pkt_queue)
and schedules a worker (send_pkt_work) to then queue them into the
virtqueue exposed to the device.

This increases the chance of batching, but also introduces a lot of
latency into the communication. So we can optimize this path by adding
a fast path to be taken when there is no element in the intermediate
queue, there is space available in the virtqueue, and no other process
is sending packets (i.e. the tx_lock can be taken); the decision is
sketched after the benchmark setup below.

The following benchmarks were run to check improvements in latency and
throughput. The test bed is a host with an Intel i7-10700KF CPU
@ 3.80GHz and an L1 guest running on QEMU/KVM, with the vhost process
and all vCPUs pinned individually to pCPUs.

- Latency
   Tool: Fio version 3.37-56
   Mode: pingpong (h-g-h)
   Test runs: 50
   Runtime per test: 50s
   Type: SOCK_STREAM

In the following fio benchmark (pingpong mode) the host sends a payload
to the guest and waits for the same payload back. The fio process is
pinned both inside the host and inside the guest.
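Distilled, the fast-path decision described above amounts to roughly the
following (a sketch only, reusing the driver's identifiers; the
authoritative code is in the diff further below):

```c
/* Sketch, not the actual patch: bypass the intermediate queue only when
 * it is observed empty, so packet ordering is preserved.
 */
static void send_pkt_sketch(struct virtio_vsock *vsock, struct sk_buff *skb)
{
	if (skb_queue_empty_lockless(&vsock->send_pkt_queue) &&
	    virtio_transport_send_skb_fast_path(vsock, skb) == 0)
		return;	/* skb went straight onto the TX virtqueue */

	/* Slow path, as before: queue the skb and wake the worker. */
	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
	queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
}
```

A failed trylock on tx_lock or a full virtqueue simply falls back to the
existing worker path, so correctness does not depend on the fast path
succeeding.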
Before: Linux 6.9.8

Payload 64B:

            1st perc.  overall  99th perc.
  Before      12.91     16.78      42.24    us
  After        9.77     13.57      39.17    us

Payload 512B:

            1st perc.  overall  99th perc.
  Before      13.35     17.35      41.52    us
  After       10.25     14.11      39.58    us

Payload 4K:

            1st perc.  overall  99th perc.
  Before      14.71     19.87      41.52    us
  After       10.51     14.96      40.81    us

- Throughput
   Tool: iperf-vsock

The size represents the buffer length (-l) to read/write.
P represents the number of parallel streams.

P=1
          4K     64K    128K
  Before  6.87   29.3   29.5   Gb/s
  After   10.5   39.4   39.9   Gb/s

P=2
          4K     64K    128K
  Before  10.5   32.8   33.2   Gb/s
  After   17.8   47.7   48.5   Gb/s

P=4
          4K     64K    128K
  Before  12.7   33.6   34.2   Gb/s
  After   16.9   48.1   50.5   Gb/s

To confirm that the performance improvement comes from this
optimization, I used an eBPF kretprobe on virtio_transport_send_skb to
check that each packet was sent directly to the virtqueue.

Co-developed-by: Marco Pinna
Signed-off-by: Marco Pinna
Signed-off-by: Luigi Leonardi
Message-Id: <20240730-pinna-v4-2-5c9179164db5@outlook.com>
Signed-off-by: Michael S. Tsirkin
Reviewed-by: Stefano Garzarella
---
 net/vmw_vsock/virtio_transport.c | 39 ++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f641e906f351..f992f9a216f0 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -208,6 +208,28 @@ out:
 		queue_work(virtio_vsock_workqueue, &vsock->rx_work);
 }
 
+/* The caller needs to hold RCU for vsock.
+ * Returns 0 if the packet is successfully put on the vq.
+ */
+static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struct sk_buff *skb)
+{
+	struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+	int ret;
+
+	/* Inside RCU, can't sleep! */
+	ret = mutex_trylock(&vsock->tx_lock);
+	if (unlikely(ret == 0))
+		return -EBUSY;
+
+	ret = virtio_transport_send_skb(skb, vq, vsock);
+	if (ret == 0)
+		virtqueue_kick(vq);
+
+	mutex_unlock(&vsock->tx_lock);
+
+	return ret;
+}
+
 static int
 virtio_transport_send_pkt(struct sk_buff *skb)
 {
@@ -231,11 +253,20 @@ virtio_transport_send_pkt(struct sk_buff *skb)
 		goto out_rcu;
 	}
 
-	if (virtio_vsock_skb_reply(skb))
-		atomic_inc(&vsock->queued_replies);
+	/* If send_pkt_queue is empty, we can safely bypass this queue
+	 * because packet order is maintained, and try to put the packet
+	 * directly on the virtqueue using virtio_transport_send_skb_fast_path.
+	 * If this fails, we simply put the packet on the intermediate
+	 * queue and schedule the worker.
+	 */
+	if (!skb_queue_empty_lockless(&vsock->send_pkt_queue) ||
+	    virtio_transport_send_skb_fast_path(vsock, skb)) {
+		if (virtio_vsock_skb_reply(skb))
+			atomic_inc(&vsock->queued_replies);
 
-	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-	queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+		virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+		queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+	}
 
 out_rcu:
 	rcu_read_unlock();
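For context, this TX path is driven by ordinary AF_VSOCK sockets. A
minimal guest-side sender (hypothetical port number, error handling
trimmed) that reaches virtio_transport_send_pkt() on every send() might
look like:

```c
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_HOST,	/* talk to the host from a guest */
		.svm_port = 1234,		/* hypothetical listening port */
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("vsock");
		return 1;
	}

	/* With an empty intermediate queue, this send() can land directly
	 * on the TX virtqueue via the fast path added above.
	 */
	send(fd, "ping", 4, 0);
	close(fd);
	return 0;
}
```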