mlxsw: pci: Use page pool for Rx buffers allocation

As part of driver init, all Rx queues are filled with buffers for hardware usage. Later, when a packet is received, a new buffer should be allocated to be used by hardware instead of the received buffer. Packet's processing time includes allocation time, which can be improved using page pool. Using page pool, DMA mapping is done only for first allocation of buffers. As subsequent buffers allocation avoid DMA mapping, it results in performance improvement. The purpose of page pool is to allocate pages fast from cache without locking. This lockless guarantee naturally comes from running under a NAPI. Use page pool to allocate the data buffer only, so hardware will use it to fill the packet. At completion time, attach the data buffer (now filled with packet payload) to new SKB which is allocated around the received buffer. SKB building at completion time prevents cache miss for each packet, as now the SKB is allocated right before packets will be handled by networking stack. Page pool for each Rx queue enhances Rx side performance by reclaiming buffers back to each queue specific pool. This change significantly improves driver performance, CPU can handle about 345% of the packets per second it previously handled. Signed-off-by: Amit Cohen <amcohen@nvidia.com> Reviewed-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Link: https://lore.kernel.org/r/1cf788a8f43c70aae6d526018ef77becb27ad6d3.1718709196.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-18 13:34:43 +02:00 · 2024-06-18 13:34:43 +02:00 · b5b60bb491
commit b5b60bb491
parent 5642c6a086
1 changed files with 66 additions and 41 deletions
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@ -62,6 +62,7 @@ struct mlxsw_pci_mem_item {
 };

 struct mlxsw_pci_queue_elem_info {
+	struct page *page;
 	char *elem; /* pointer to actual dma mapped element mem chunk */
 	union {
 		struct {
@ -346,6 +347,19 @@ static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
 		(MLXSW_PCI_SKB_HEADROOM +	\
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

+static void
+mlxsw_pci_wqe_rx_frag_set(struct mlxsw_pci *mlxsw_pci, struct page *page,
+			  char *wqe, int index, size_t frag_len)
+{
+	dma_addr_t mapaddr;
+
+	mapaddr = page_pool_get_dma_addr(page);
+	mapaddr += MLXSW_PCI_SKB_HEADROOM;
+
+	mlxsw_pci_wqe_address_set(wqe, index, mapaddr);
+	mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len);
+}
+
 static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
 				  int index, char *frag_data, size_t frag_len,
 				  int direction)
@ -375,43 +389,46 @@ static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe,
 	dma_unmap_single(&pdev->dev, mapaddr, frag_len, direction);
 }

-static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
-				   struct mlxsw_pci_queue_elem_info *elem_info,
-				   gfp_t gfp)
+static struct sk_buff *mlxsw_pci_rdq_build_skb(struct page *page,
+					       u16 byte_count)
 {
-	size_t buf_len = MLXSW_PORT_MAX_MTU;
-	char *wqe = elem_info->elem;
+	void *data = page_address(page);
+	unsigned int allocated_size;
 	struct sk_buff *skb;
-	int err;

-	skb = __netdev_alloc_skb_ip_align(NULL, buf_len, gfp);
-	if (!skb)
-		return -ENOMEM;
+	allocated_size = page_size(page);
+	skb = napi_build_skb(data, allocated_size);
+	if (unlikely(!skb))
+		return ERR_PTR(-ENOMEM);

-	err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
-				     buf_len, DMA_FROM_DEVICE);
-	if (err)
-		goto err_frag_map;
-
-	elem_info->u.rdq.skb = skb;
-	return 0;
-
-err_frag_map:
-	dev_kfree_skb_any(skb);
-	return err;
+	skb_reserve(skb, MLXSW_PCI_SKB_HEADROOM);
+	skb_put(skb, byte_count);
+	return skb;
 }

-static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci,
-				   struct mlxsw_pci_queue_elem_info *elem_info)
+static int mlxsw_pci_rdq_page_alloc(struct mlxsw_pci_queue *q,
+				    struct mlxsw_pci_queue_elem_info *elem_info)
 {
-	struct sk_buff *skb;
-	char *wqe;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
+	size_t buf_len = MLXSW_PORT_MAX_MTU;
+	char *wqe = elem_info->elem;
+	struct page *page;

-	skb = elem_info->u.rdq.skb;
-	wqe = elem_info->elem;
+	page = page_pool_dev_alloc_pages(cq->u.cq.page_pool);
+	if (unlikely(!page))
+		return -ENOMEM;

-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
-	dev_kfree_skb_any(skb);
+	mlxsw_pci_wqe_rx_frag_set(q->pci, page, wqe, 0, buf_len);
+	elem_info->page = page;
+	return 0;
+}
+
+static void mlxsw_pci_rdq_page_free(struct mlxsw_pci_queue *q,
+				    struct mlxsw_pci_queue_elem_info *elem_info)
+{
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
+
+	page_pool_put_page(cq->u.cq.page_pool, elem_info->page, -1, false);
 }

 static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
@ -452,7 +469,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
 		BUG_ON(!elem_info);
-		err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_KERNEL);
+		err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 		if (err)
 			goto rollback;
 		/* Everything is set up, ring doorbell to pass elem to HW */
@ -465,7 +482,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 rollback:
 	for (i--; i >= 0; i--) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 	q->u.rdq.cq = NULL;
 	cq->u.cq.dq = NULL;
@ -483,7 +500,7 @@ static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
 	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 }

@ -618,26 +635,38 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 {
 	struct pci_dev *pdev = mlxsw_pci->pdev;
 	struct mlxsw_pci_queue_elem_info *elem_info;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 	struct mlxsw_rx_info rx_info = {};
-	char wqe[MLXSW_PCI_WQE_SIZE];
 	struct sk_buff *skb;
+	struct page *page;
 	u16 byte_count;
 	int err;

 	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-	skb = elem_info->u.rdq.skb;
-	memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);

 	if (q->consumer_counter++ != consumer_counter_limit)
 		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");

-	err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_ATOMIC);
+	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
+	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
+		byte_count -= ETH_FCS_LEN;
+
+	page = elem_info->page;
+
+	err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 	if (err) {
-		dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+		dev_err_ratelimited(&pdev->dev, "Failed to alloc page\n");
 		goto out;
 	}

-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+	skb = mlxsw_pci_rdq_build_skb(page, byte_count);
+	if (IS_ERR(skb)) {
+		dev_err_ratelimited(&pdev->dev, "Failed to build skb for RDQ\n");
+		page_pool_recycle_direct(cq->u.cq.page_pool, page);
+		goto out;
+	}
+
+	skb_mark_for_recycle(skb);

 	if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
 		rx_info.is_lag = true;
@ -670,10 +699,6 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,

 	mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe);

-	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
-	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
-		byte_count -= ETH_FCS_LEN;
-	skb_put(skb, byte_count);
 	mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);

 out: