accel/habanalabs/gaudi2: add eq health check using irq
This is the second patch applying the EQ health check mechanism; it adds support for the interrupt flow on the Gaudi2 ASIC. More info about the interrupt mechanism: the driver sets a dedicated MSI-X vector for the EQ error interrupt and adds an interrupt handler for it. When the FW detects an issue with the EQ, such as EQ_FULL, it raises that interrupt and the driver should reset the device. The driver informs the FW which MSI-X index to use through the already existing handshake mechanism, which sends an MSI-X info message to the FW. Signed-off-by: farah kassabri <fkassabri@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
7c4130e6dd
commit
764bfd138f
@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
|
||||
irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
|
||||
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
|
||||
irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
|
||||
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
|
||||
u32 hl_cq_inc_ptr(u32 ptr);
|
||||
|
||||
int hl_asid_init(struct hl_device *hdev);
|
||||
|
@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/**
 * hl_irq_eq_error_interrupt_thread_handler - threaded irq handler for the
 *                                            EQ error interrupt
 *
 * @irq: irq number (not used by this handler)
 * @arg: pointer to the hl_device structure this interrupt belongs to
 *
 * Logs that an EQ error interrupt was received and calls
 * hl_device_cond_reset() with the hard-reset flag.
 *
 * Return: IRQ_HANDLED, unconditionally.
 */
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
|
||||
{
|
||||
/* Notifier events handed to the reset call below */
u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
|
||||
struct hl_device *hdev = arg;
|
||||
|
||||
dev_err(hdev->dev, "EQ error interrupt received\n");
|
||||
|
||||
/*
 * Per the commit description, the FW raises this interrupt when it
 * detects an EQ problem (e.g. EQ_FULL), so the device is reset.
 */
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
|
||||
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/**
|
||||
* hl_irq_handler_eq - irq handler for event queue
|
||||
*
|
||||
|
@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
|
||||
return "gaudi2 unexpected error";
|
||||
case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
|
||||
return "gaudi2 user completion";
|
||||
case GAUDI2_IRQ_NUM_EQ_ERROR:
|
||||
return "gaudi2 eq error";
|
||||
default:
|
||||
return "invalid";
|
||||
}
|
||||
@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
|
||||
}
|
||||
}
|
||||
|
||||
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
|
||||
rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
|
||||
IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
|
||||
hdev);
|
||||
if (rc) {
|
||||
dev_err(hdev->dev, "Failed to request IRQ %d", irq);
|
||||
goto free_user_irq;
|
||||
}
|
||||
|
||||
gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
|
||||
|
||||
return 0;
|
||||
@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
|
||||
}
|
||||
|
||||
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
|
||||
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
|
||||
}
|
||||
|
||||
static void gaudi2_disable_msix(struct hl_device *hdev)
|
||||
@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
|
||||
cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
|
||||
free_irq(irq, cq);
|
||||
|
||||
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
|
||||
free_irq(irq, hdev);
|
||||
|
||||
pci_free_irq_vectors(hdev->pdev);
|
||||
|
||||
gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
|
||||
@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
|
||||
/**
 * gaudi2_get_msi_info - fill in the Gaudi2 MSI-X index table
 *
 * @table: output array; the entries written here are stored as
 *         little-endian 32-bit values (cpu_to_le32)
 *
 * Only the event-queue entry and the event-queue-error entry are set here;
 * other entries of @table are left untouched.
 * NOTE(review): per the commit description this table is sent to the FW in
 * the MSI-X info handshake message - confirm against the caller.
 */
static void gaudi2_get_msi_info(__le32 *table)
|
||||
{
|
||||
table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
|
||||
/* MSI-X index of the dedicated EQ error interrupt */
table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
|
||||
}
|
||||
|
||||
static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)
|
||||
|
@ -419,6 +419,7 @@ enum gaudi2_irq_num {
|
||||
GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
|
||||
GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
|
||||
GAUDI2_IRQ_NUM_TPC_ASSERT,
|
||||
GAUDI2_IRQ_NUM_EQ_ERROR,
|
||||
GAUDI2_IRQ_NUM_RESERVED_FIRST,
|
||||
GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
|
||||
GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
|
||||
|
@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
|
||||
CPUCP_NIC_PORT5_MSI_TYPE,
|
||||
CPUCP_NIC_PORT7_MSI_TYPE,
|
||||
CPUCP_NIC_PORT9_MSI_TYPE,
|
||||
CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
|
||||
CPUCP_NUM_OF_MSI_TYPES
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user