nvscic2c-pcie: add support for error handling

Enable AER reporting for the endpoint.

Provide an error handler for AER reception and
update the error code so it can be read by the
user-space process. Mark the PCIe link as down.

Implement error reporting for runtime eDMA xfer errors.

Bug 4088959
Jira NVIPC-334

Change-Id: I74871e5226eab1b708c72aa71216cd160c6ebf68
Signed-off-by: dbadgaiyan <dbadgaiyan@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/2915753
(cherry picked from commit 7b24941fb486ada70229c42ae1deec12f75028c2)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/2921480
Reviewed-by: Arihant Jejani <ajejani@nvidia.com>
Reviewed-by: Mikko Perttunen <mperttunen@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
dbadgaiyan
2023-06-05 13:30:40 +00:00
committed by mobile promotions
parent f19d5372b4
commit 3067ceeb00
8 changed files with 313 additions and 58 deletions
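How user space is expected to consume these errors (a hypothetical sketch, not part of this change): the endpoint fd wakes poll() on link, AER and eDMA-error events alike, and the mmap()'d words carry the NVSCIC2C_PCIE_* codes added to the uapi header below. The device-node path and the numeric LINK/EDMA_ERR mmap indices used here are assumptions; a real client would take the offsets and sizes from the endpoint get-info ioctl (get_info->link / get_info->edma_err).

#include <fcntl.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Mirrors the uapi additions in this commit. */
#define NVSCIC2C_PCIE_NO_ERROR                   (0x00U)
#define NVSCIC2C_PCIE_EDMA_XFER_ERROR            (0x01U)
#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL    (0x02U)
#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL (0x04U)

struct nvscic2c_pcie_link_mem {
	uint32_t link_status;	/* enum nvscic2c_pcie_link: 0 == LINK_DOWN */
	uint32_t aer_err;	/* OR-mask of NVSCIC2C_PCIE_AER_* codes */
};

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);

	/* Assumed device node; assumed mmap indices (LINK=2, EDMA_ERR=3). */
	int fd = open("/dev/nvscic2c-pcie-ep0", O_RDWR);
	if (fd < 0)
		return 1;

	volatile struct nvscic2c_pcie_link_mem *link =
		mmap(NULL, (size_t)pg, PROT_READ, MAP_SHARED, fd, 2 * pg);
	volatile uint32_t *edma_err =
		mmap(NULL, (size_t)pg, PROT_READ, MAP_SHARED, fd, 3 * pg);
	if (link == MAP_FAILED || edma_err == MAP_FAILED)
		return 1;

	/* The driver wakes pollers with POLLPRI|POLLIN|POLLOUT for link,
	 * AER and eDMA events alike; the shared words tell them apart. */
	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLIN };
	while (poll(&pfd, 1, -1) > 0) {
		if (link->aer_err & NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL)
			fprintf(stderr, "fatal AER: hot-replug required\n");
		if (link->aer_err & NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL)
			fprintf(stderr, "non-fatal AER reported\n");
		if (*edma_err == NVSCIC2C_PCIE_EDMA_XFER_ERROR)
			fprintf(stderr, "eDMA transfer error on endpoint\n");
		if (link->link_status == 0)
			fprintf(stderr, "PCIe link is down\n");
	}
	return 0;
}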


@@ -40,6 +40,8 @@ enum mem_mmap_type {
 	SELF_MEM_MMAP,
 	/* Map Link memory segment to query link status with Peer.*/
 	LINK_MEM_MMAP,
+	/* Map eDMA error memory segment to query eDMA xfer errors.*/
+	EDMA_ERR_MEM_MMAP,
 	/* Maximum. */
 	MEM_MAX_MMAP,
 };
@@ -110,11 +112,14 @@ struct endpoint_t {
 	/* msi irq to x86 RP */
 	u16 msi_irq;
-	/* book-keeping of peer notifications.*/
-	atomic_t dataevent_count;
-
-	/* book-keeping of PCIe link event.*/
-	atomic_t linkevent_count;
+	/*
+	 * book-keeping of:
+	 * peer notifications.
+	 * PCIe link event.
+	 * eDMA xfer error event.
+	 */
+	atomic_t event_count;
 	u32 linkevent_id;
 
 	/* propagate events when endpoint was initialized.*/
@@ -168,7 +173,7 @@ struct endpoint_drv_ctx_t {
  * in PCIe link status(up->down OR down->up).
  */
 static void
-link_event_callback(void *event_type, void *ctx);
+event_callback(void *event_type, void *ctx);
 
 /* prototype. */
 static void
@@ -320,6 +325,10 @@ endpoint_fops_mmap(struct file *filp, struct vm_area_struct *vma)
 		}
 		ret = pci_client_mmap_link_mem(endpoint->pci_client_h, vma);
 		goto exit;
+	case EDMA_ERR_MEM_MMAP:
+		ret = pci_client_mmap_edma_err_mem(endpoint->pci_client_h,
+						   endpoint->minor, vma);
+		goto exit;
 	default:
 		pr_err("(%s): unrecognised mmap type: (%llu)\n",
 		       endpoint->name, mmap_type);
@@ -376,13 +385,10 @@ endpoint_fops_poll(struct file *filp, poll_table *wait)
 	/*
 	 * wake up read, write (& exception - those who want to use) fd on
-	 * getting Link + peer notifications.
+	 * getting Link + peer notifications + eDMA xfer error notifications.
 	 */
-	if (atomic_read(&endpoint->linkevent_count)) {
-		atomic_dec(&endpoint->linkevent_count);
-		mask = (__force __poll_t)(POLLPRI | POLLIN | POLLOUT);
-	} else if (atomic_read(&endpoint->dataevent_count)) {
-		atomic_dec(&endpoint->dataevent_count);
+	if (atomic_read(&endpoint->event_count)) {
+		atomic_dec(&endpoint->event_count);
 		mask = (__force __poll_t)(POLLPRI | POLLIN | POLLOUT);
 	}
@@ -465,6 +471,8 @@ ioctl_get_info_impl(struct endpoint_t *endpoint,
 	get_info->self.size = endpoint->self_mem.size;
 	get_info->link.offset = (LINK_MEM_MMAP << PAGE_SHIFT);
 	get_info->link.size = PAGE_ALIGN(sizeof(enum nvscic2c_pcie_link));
+	get_info->edma_err.offset = (EDMA_ERR_MEM_MMAP << PAGE_SHIFT);
+	get_info->edma_err.size = PAGE_ALIGN(sizeof(u32));
 
 	return 0;
 }
@@ -518,8 +526,7 @@ enable_event_handling(struct endpoint_t *endpoint)
 	 * propagate link and state change events that occur after the device
 	 * is opened and not the stale ones.
 	 */
-	atomic_set(&endpoint->dataevent_count, 0);
-	atomic_set(&endpoint->linkevent_count, 0);
+	atomic_set(&endpoint->event_count, 0);
 	atomic_set(&endpoint->event_handling, 1);
 }
@@ -532,14 +539,13 @@ disable_event_handling(struct endpoint_t *endpoint)
 		return ret;
 
 	atomic_set(&endpoint->event_handling, 0);
-	atomic_set(&endpoint->linkevent_count, 0);
-	atomic_set(&endpoint->dataevent_count, 0);
+	atomic_set(&endpoint->event_count, 0);
 
 	return ret;
 }
 
 static void
-link_event_callback(void *data, void *ctx)
+event_callback(void *data, void *ctx)
 {
 	struct endpoint_t *endpoint = NULL;
@@ -550,9 +556,9 @@ link_event_callback(void *data, void *ctx)
 	endpoint = (struct endpoint_t *)(ctx);
 
-	/* notify only if the endpoint was openend.*/
+	/* notify only if the endpoint was opened.*/
 	if (atomic_read(&endpoint->event_handling)) {
-		atomic_inc(&endpoint->linkevent_count);
+		atomic_inc(&endpoint->event_count);
 		wake_up_interruptible_all(&endpoint->poll_waitq);
 	}
 }
@@ -634,13 +640,7 @@ syncpt_callback(void *data)
 {
 	/* Skip args check, trusting host1x. */
-	struct endpoint_t *endpoint = (struct endpoint_t *)(data);
-
-	/* notify only if the endpoint was opened - else drain.*/
-	if (atomic_read(&endpoint->event_handling)) {
-		atomic_inc(&endpoint->dataevent_count);
-		wake_up_interruptible_all(&endpoint->poll_waitq);
-	}
+	event_callback(NULL, data);
 }
 
 /*
@@ -992,7 +992,7 @@ create_endpoint_device(struct endpoint_drv_ctx_t *eps_ctx,
 	}
 
 	/* Register for link events.*/
-	ops.callback = &(link_event_callback);
+	ops.callback = &(event_callback);
 	ops.ctx = (void *)(endpoint);
 	ret = pci_client_register_for_link_event(endpoint->pci_client_h, &ops,
 						 &endpoint->linkevent_id);


@@ -3,6 +3,7 @@
 #define pr_fmt(fmt)	"nvscic2c-pcie: epc: " fmt
 
+#include <linux/aer.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -222,8 +223,13 @@ nvscic2c_pcie_epc_remove(struct pci_dev *pdev)
 		pr_err("(%s): Error waiting for endpoints to close\n",
 		       drv_ctx->drv_name);
 
-	/* if PCIe EP SoC went away abruptly already, jump to local deinit. */
-	if (!pci_device_is_present(pdev))
+	/*
+	 * Jump to local deinit if any of the conditions below is true:
+	 * => PCIe EP SoC already went away abruptly.
+	 * => a PCIe AER was received.
+	 */
+	if (!pci_device_is_present(pdev) ||
+	    atomic_read(&drv_ctx->epc_ctx->aer_received))
 		goto deinit;
 
 	/*
@@ -252,15 +258,19 @@ nvscic2c_pcie_epc_remove(struct pci_dev *pdev)
 		/*
 		 * continue wait only if PCIe EP SoC is still there. It can
 		 * go away abruptly waiting for its own endpoints to close.
+		 * Also check that no PCIe AER was received.
 		 */
-		if (pci_device_is_present(pdev)) {
-			pr_err("(%s): Still waiting for nvscic2c-pcie-epf to close\n",
-			       drv_ctx->drv_name);
-		} else {
+		if (!pci_device_is_present(pdev)) {
 			pr_debug("(%s): nvscic2c-pcie-epf went away\n",
 				 drv_ctx->drv_name);
 			break;
+		} else if (atomic_read(&drv_ctx->epc_ctx->aer_received)) {
+			pr_debug("(%s): PCIe AER received\n",
+				 drv_ctx->drv_name);
+			break;
 		}
+		pr_err("(%s): Still waiting for nvscic2c-pcie-epf to close\n",
+		       drv_ctx->drv_name);
 	} else if (timeout > 0) {
 		pr_debug("(%s): nvscic2c-pcie-epf closed\n",
 			 drv_ctx->drv_name);
@@ -280,6 +290,7 @@ deinit:
 	pci_release_region(pdev, 0);
 	pci_clear_master(pdev);
+	pci_disable_pcie_error_reporting(pdev);
 	pci_disable_device(pdev);
 
 	dt_release(&drv_ctx->drv_param);
@@ -324,6 +335,7 @@ nvscic2c_pcie_epc_probe(struct pci_dev *pdev,
 	}
 	init_completion(&epc_ctx->epf_ready_cmpl);
 	init_completion(&epc_ctx->epf_shutdown_cmpl);
+	atomic_set(&epc_ctx->aer_received, 0);
 
 	drv_ctx->drv_mode = DRV_MODE_EPC;
 	drv_ctx->drv_name = name;
@@ -338,6 +350,7 @@
 	ret = pcim_enable_device(pdev);
 	if (ret)
 		goto err_enable_device;
+	pci_enable_pcie_error_reporting(pdev);
 	pci_set_master(pdev);
 	ret = pci_request_region(pdev, 0, MODULE_NAME);
 	if (ret)
@@ -477,6 +490,61 @@ err_dt_parse:
 	return ret;
 }
 
+/*
+ * Hot-replug is required to recover from both types of errors,
+ * hence return PCI_ERS_RESULT_DISCONNECT in both cases.
+ */
+static pci_ers_result_t
+nvscic2c_pcie_error_detected(struct pci_dev *pdev,
+			     pci_channel_state_t state)
+{
+	struct driver_ctx_t *drv_ctx = NULL;
+
+	if (WARN_ON(!pdev))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	drv_ctx = pci_get_drvdata(pdev);
+	if (WARN_ON(!drv_ctx))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	atomic_set(&drv_ctx->epc_ctx->aer_received, 1);
+
+	if (state == pci_channel_io_normal) {
+		pr_err("AER(NONFATAL) detected for dev %04x:%02x:%02x.%x\n",
+		       pci_domain_nr(pdev->bus),
+		       pdev->bus->number,
+		       PCI_SLOT(pdev->devfn),
+		       PCI_FUNC(pdev->devfn));
+		(void)pci_client_set_link_aer_error(drv_ctx->pci_client_h,
+						    NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL);
+	} else {
+		if (state == pci_channel_io_frozen) {
+			pr_err("AER: FATAL detected for dev %04x:%02x:%02x.%x\n",
+			       pci_domain_nr(pdev->bus),
+			       pdev->bus->number,
+			       PCI_SLOT(pdev->devfn),
+			       PCI_FUNC(pdev->devfn));
+		} else {
+			pr_err("Unknown error for dev %04x:%02x:%02x.%x, treating as AER: FATAL\n",
+			       pci_domain_nr(pdev->bus),
+			       pdev->bus->number,
+			       PCI_SLOT(pdev->devfn),
+			       PCI_FUNC(pdev->devfn));
+		}
+		(void)pci_client_set_link_aer_error(drv_ctx->pci_client_h,
+						    NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL);
+	}
+
+	/* Mark PCIe link down and notify all subscribers. */
+	pci_client_change_link_status(drv_ctx->pci_client_h,
+				      NVSCIC2C_PCIE_LINK_DOWN);
+
+	return PCI_ERS_RESULT_DISCONNECT;
+}
+
+static struct pci_error_handlers nvscic2c_pcie_error_handlers = {
+	.error_detected = nvscic2c_pcie_error_detected,
+};
+
 MODULE_DEVICE_TABLE(pci, nvscic2c_pcie_epc_tbl);
 static struct pci_driver nvscic2c_pcie_epc_driver = {
 	.name = DRIVER_NAME_EPC,
@@ -484,6 +552,7 @@ static struct pci_driver nvscic2c_pcie_epc_driver = {
 	.probe = nvscic2c_pcie_epc_probe,
 	.remove = nvscic2c_pcie_epc_remove,
 	.shutdown = nvscic2c_pcie_epc_remove,
+	.err_handler = &nvscic2c_pcie_error_handlers,
 };
 module_pci_driver(nvscic2c_pcie_epc_driver);


@@ -413,6 +413,7 @@ shutdown_msg_cb(void *data, void *ctx)
 		return;
 	}
 
+	atomic_set(&drv_ctx->epf_ctx->shutdown_msg_received, 1);
 	/* schedule deinitialization of epf interfaces. */
 	schedule_work(&drv_ctx->epf_ctx->deinitialization_work);
 }
@@ -460,16 +461,19 @@ deinit_work(struct work_struct *work)
 	/*
 	 * Acknowledge @DRV_MODE_EPC that @DRV_MODE_EPF(this) endpoints are
-	 * closed. If PCIe RP SoC went abnormally away(halt/reset/kernel oops)
-	 * signal anyway (sending signal will not cause local SoC fault when
-	 * PCIe RP SoC (@DRV_MODE_EPC) went abnormally away).
+	 * closed, but only if the shutdown message was received from
+	 * @DRV_MODE_EPC. If @DRV_MODE_EPC went away abruptly or an AER was
+	 * raised, it will not send the shutdown message.
 	 */
+	if (atomic_read(&drv_ctx->epf_ctx->shutdown_msg_received)) {
 		msg.type = COMM_MSG_TYPE_LINK;
 		msg.u.link.status = NVSCIC2C_PCIE_LINK_DOWN;
 		ret = comm_channel_ctrl_msg_send(drv_ctx->comm_channel_h, &msg);
 		if (ret)
 			pr_err("(%s): Failed to send LINK (DOWN) message\n",
 			       drv_ctx->drv_name);
+		atomic_set(&drv_ctx->epf_ctx->shutdown_msg_received, 0);
+	}
 
 	endpoints_release(&drv_ctx->endpoints_h);
 	edma_module_deinit(drv_ctx);
@@ -788,6 +792,9 @@ nvscic2c_pcie_epf_probe(struct pci_epf *epf)
 	atomic_set(&drv_ctx->epf_ctx->epf_initialized, 0);
 	init_waitqueue_head(&epf_ctx->core_initialized_waitq);
 
+	/* to check whether a shutdown-message response is required. */
+	atomic_set(&epf_ctx->shutdown_msg_received, 0);
+
 	return ret;
 
 err_alloc_epf_ctx:


@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 
 /*
  * Internal to nvscic2c-pcie module. This file is not supposed to be included
@@ -83,6 +83,7 @@ struct epf_context_t {
 	struct work_struct deinitialization_work;
 	atomic_t core_initialized;
 	atomic_t epf_initialized;
+	atomic_t shutdown_msg_received;
 	wait_queue_head_t core_initialized_waitq;
 };
@@ -90,6 +91,7 @@ struct epf_context_t {
 struct epc_context_t {
 	struct completion epf_ready_cmpl;
 	struct completion epf_shutdown_cmpl;
+	atomic_t aer_received;
 };
 
 /*


@@ -8,6 +8,7 @@
 #include <linux/errno.h>
 #include <linux/iommu.h>
 #include <linux/kernel.h>
+#include <linux/libnvdimm.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/pci-epc.h>
@@ -17,8 +18,6 @@
 #include <linux/types.h>
 #include <linux/version.h>
 
-#include <asm/cacheflush.h>
-
 #include <uapi/misc/nvscic2c-pcie-ioctl.h>
 #include <linux/tegra-pcie-edma.h>
@@ -85,6 +84,8 @@ struct pci_client_t {
 	 */
 	void *mem_mngr_h;
 
+	/* eDMA error memory for each endpoint. */
+	struct cpu_buff_t ep_edma_err_mem[MAX_LINK_EVENT_USERS];
+
 	/*
 	 * the context of DRV_MODE_EPC/DRV_MODE_EPF
 	 */
@@ -92,6 +93,47 @@ };
 };
 
+static void
+free_ep_edma_err_mem(struct pci_client_t *ctx)
+{
+	uint32_t i = 0U;
+	struct cpu_buff_t *edma_err_mem = NULL;
+
+	if (ctx != NULL)
+		for (i = 0U; i < MAX_LINK_EVENT_USERS; i++) {
+			edma_err_mem = &ctx->ep_edma_err_mem[i];
+			kfree(edma_err_mem->pva);
+			edma_err_mem->pva = NULL;
+		}
+}
+
+static int
+allocate_ep_edma_err_mem(struct pci_client_t *ctx)
+{
+	int ret = 0;
+	uint32_t i = 0U;
+	struct cpu_buff_t *edma_err_mem = NULL;
+
+	for (i = 0U; i < MAX_LINK_EVENT_USERS; i++) {
+		edma_err_mem = &ctx->ep_edma_err_mem[i];
+		edma_err_mem->size = PAGE_ALIGN(sizeof(u32));
+		edma_err_mem->pva = kzalloc(edma_err_mem->size, GFP_KERNEL);
+		if (WARN_ON(!edma_err_mem->pva)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/* physical address to be mmap() in user-space.*/
+		edma_err_mem->phys_addr = virt_to_phys(edma_err_mem->pva);
+		*(u32 *)edma_err_mem->pva = NVSCIC2C_PCIE_NO_ERROR;
+	}
+
+	return ret;
+err:
+	free_ep_edma_err_mem(ctx);
+	return ret;
+}
+
 static void
 free_link_status_mem(struct pci_client_t *ctx)
 {
@@ -106,6 +148,7 @@ static int
 allocate_link_status_mem(struct pci_client_t *ctx)
 {
 	int ret = 0;
+	struct nvscic2c_pcie_link_mem *link_mem = NULL;
 	struct cpu_buff_t *mem = &ctx->link_status_mem;
 
 	mem->size = PAGE_ALIGN(sizeof(enum nvscic2c_pcie_link));
@@ -114,7 +157,10 @@ allocate_link_status_mem(struct pci_client_t *ctx)
 		return -ENOMEM;
 
 	atomic_set(&ctx->link_status, NVSCIC2C_PCIE_LINK_DOWN);
-	*((enum nvscic2c_pcie_link *)mem->pva) = NVSCIC2C_PCIE_LINK_DOWN;
+	link_mem = ((struct nvscic2c_pcie_link_mem *)mem->pva);
+	link_mem->link_status = NVSCIC2C_PCIE_LINK_DOWN;
+	link_mem->aer_err = NVSCIC2C_PCIE_NO_ERROR;
 
 	/* physical address to be mmap() in user-space.*/
 	mem->phys_addr = virt_to_phys(mem->pva);
@@ -250,6 +296,10 @@ pci_client_init(struct pci_client_params *params, void **pci_client_h)
 	if (ret)
 		goto err;
 
+	ret = allocate_ep_edma_err_mem(ctx);
+	if (ret)
+		goto err;
+
 	/*
 	 * for mapping application objs and endpoint physical memory to remote
 	 * visible area.
@@ -317,6 +367,7 @@ pci_client_deinit(void **pci_client_h)
 		ctx->mem_mngr_h = NULL;
 	}
 
+	free_ep_edma_err_mem(ctx);
 	free_link_status_mem(ctx);
 	mutex_destroy(&ctx->event_tbl_lock);
 	kfree(ctx);
@@ -439,6 +490,73 @@ pci_client_mmap_link_mem(void *pci_client_h, struct vm_area_struct *vma)
 	return ret;
 }
 
+/* Helper function to mmap eDMA error memory to user-space.*/
+int
+pci_client_mmap_edma_err_mem(void *pci_client_h,
+			     u32 ep_id, struct vm_area_struct *vma)
+{
+	int ret = 0;
+	struct cpu_buff_t *edma_err_mem = NULL;
+	struct pci_client_t *ctx = (struct pci_client_t *)pci_client_h;
+
+	if (WARN_ON(!vma || !ctx))
+		return -EINVAL;
+
+	if (WARN_ON(ep_id >= MAX_LINK_EVENT_USERS))
+		return -EINVAL;
+
+	edma_err_mem = &ctx->ep_edma_err_mem[ep_id];
+	if (WARN_ON(!edma_err_mem->pva))
+		return -EINVAL;
+
+	if ((vma->vm_end - vma->vm_start) != edma_err_mem->size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	ret = remap_pfn_range(vma,
+			      vma->vm_start,
+			      PFN_DOWN(edma_err_mem->phys_addr),
+			      edma_err_mem->size,
+			      vma->vm_page_prot);
+	if (ret)
+		pr_err("remap_pfn_range returned error: (%d) for eDMA err mem\n", ret);
+
+	return ret;
+}
+
+/* Update eDMA xfer error code.*/
+int
+pci_client_set_edma_error(void *pci_client_h, u32 ep_id, u32 err)
+{
+	int ret = 0;
+	struct event_t *event = NULL;
+	struct callback_ops *ops = NULL;
+	struct cpu_buff_t *edma_err_mem = NULL;
+	struct pci_client_t *ctx = (struct pci_client_t *)pci_client_h;
+
+	if (WARN_ON(!ctx))
+		return -EINVAL;
+
+	if (WARN_ON(ep_id >= MAX_LINK_EVENT_USERS ||
+		    err != NVSCIC2C_PCIE_EDMA_XFER_ERROR))
+		return -EINVAL;
+
+	edma_err_mem = &ctx->ep_edma_err_mem[ep_id];
+	*(u32 *)edma_err_mem->pva = err;
+	arch_invalidate_pmem(edma_err_mem->pva, edma_err_mem->size);
+
+	mutex_lock(&ctx->event_tbl_lock);
+	/* notify user. */
+	event = &ctx->event_tbl[ep_id];
+	if (atomic_read(&event->in_use)) {
+		ops = &event->cb_ops;
+		ops->callback(NULL, ops->ctx);
+	}
+	mutex_unlock(&ctx->event_tbl_lock);
+
+	return ret;
+}
+
 /* Query PCI link status. */
 enum nvscic2c_pcie_link
 pci_client_query_link_status(void *pci_client_h)
@@ -526,9 +644,9 @@ pci_client_change_link_status(void *pci_client_h,
 {
 	u32 i = 0;
 	int ret = 0;
-	struct page *page = NULL;
 	struct event_t *event = NULL;
 	struct callback_ops *ops = NULL;
+	struct nvscic2c_pcie_link_mem *link_mem = NULL;
 	struct pci_client_t *ctx = (struct pci_client_t *)pci_client_h;
 
 	if (WARN_ON(!ctx))
@@ -544,9 +662,9 @@ pci_client_change_link_status(void *pci_client_h,
 	 * Call is arm64 specific.
 	 */
 	atomic_set(&ctx->link_status, status);
-	*((enum nvscic2c_pcie_link *)ctx->link_status_mem.pva) = status;
-	page = virt_to_page(ctx->link_status_mem.pva);
-	flush_dcache_page(page);
+	link_mem = ((struct nvscic2c_pcie_link_mem *)ctx->link_status_mem.pva);
+	link_mem->link_status = status;
+	arch_invalidate_pmem(ctx->link_status_mem.pva, ctx->link_status_mem.size);
 
 	/* interrupt registered users. */
 	mutex_lock(&ctx->event_tbl_lock);
@@ -562,6 +680,32 @@
 	return ret;
 }
 
+/* Update PCIe error offset with error. */
+int
+pci_client_set_link_aer_error(void *pci_client_h, u32 err)
+{
+	int ret = 0;
+	struct nvscic2c_pcie_link_mem *link_mem = NULL;
+	struct pci_client_t *ctx = (struct pci_client_t *)pci_client_h;
+
+	if (WARN_ON(!ctx))
+		return -EINVAL;
+
+	if (WARN_ON((err != NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL) &&
+		    (err != NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL)))
+		return -EINVAL;
+
+	link_mem = ((struct nvscic2c_pcie_link_mem *)ctx->link_status_mem.pva);
+	/*
+	 * More than one type of AER can be raised before system recovery
+	 * completes, hence OR the masked error codes into the offset.
+	 */
+	link_mem->aer_err |= err;
+	arch_invalidate_pmem(ctx->link_status_mem.pva, ctx->link_status_mem.size);
+
+	return ret;
+}
+
 /*
  * Helper functions to set and get driver context from pci_client
  *
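Since pci_client_set_link_aer_error() ORs new codes into the same word, aer_err must be read as a bit mask rather than a single value: both severities can be latched before recovery. A minimal decode sketch, reusing only the uapi names added by this commit:

#include <stdint.h>
#include <stdio.h>

#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL    (0x02U)
#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL (0x04U)

/* aer_err accumulates: 0x02 | 0x04 == 0x06 means both severities were
 * raised before recovery, so test bits, never compare for equality. */
static void decode_aer(uint32_t aer_err)
{
	if (aer_err & NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL)
		puts("uncorrectable FATAL AER latched");
	if (aer_err & NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL)
		puts("uncorrectable NONFATAL AER latched");
}

int main(void)
{
	decode_aer(0x06U);	/* prints both lines */
	return 0;
}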


@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 
 #ifndef __PCI_CLIENT_H__
 #define __PCI_CLIENT_H__
@@ -85,10 +85,22 @@
 int
 pci_client_change_link_status(void *pci_client_h,
 			      enum nvscic2c_pcie_link status);
 
+/* Update PCIe error offset with error. */
+int
+pci_client_set_link_aer_error(void *pci_client_h, u32 err);
+
 /* Helper function to mmap the PCI link status memory to user-space.*/
 int
 pci_client_mmap_link_mem(void *pci_client_h, struct vm_area_struct *vma);
 
+/* Helper function to mmap eDMA error memory to user-space.*/
+int
+pci_client_mmap_edma_err_mem(void *pci_client_h,
+			     u32 ep_id, struct vm_area_struct *vma);
+
+/* Update eDMA xfer error code.*/
+int
+pci_client_set_edma_error(void *pci_client_h, u32 ep_id, u32 err);
+
 /* Query PCI link status. */
 enum nvscic2c_pcie_link
 pci_client_query_link_status(void *pci_client_h);


@@ -609,6 +609,9 @@ ioctl_submit_copy_request(struct stream_ext_ctx_t *ctx,
 		ret = -EIO;
 		atomic_dec(&ctx->transfer_count);
 		release_copy_request_handles(cr);
+		/* Scheduling the eDMA job failed: update eDMA error and notify user. */
+		(void)pci_client_set_edma_error(ctx->pci_client_h, ctx->ep_id,
+						NVSCIC2C_PCIE_EDMA_XFER_ERROR);
 		goto reclaim_cr;
 	}
@@ -959,6 +962,11 @@ callback_edma_xfer(void *priv, edma_xfer_status_t status,
 	if (status == EDMA_XFER_SUCCESS) {
 		signal_remote_post_fences(cr);
 		signal_local_post_fences(cr);
+	} else {
+		/* eDMA xfer failed: update eDMA error and notify user. */
+		(void)pci_client_set_edma_error(cr->ctx->pci_client_h,
+						cr->ctx->ep_id,
+						NVSCIC2C_PCIE_EDMA_XFER_ERROR);
 	}
 
 	/* releases the references of the submit-copy handles.*/


@@ -15,12 +15,24 @@
 #define MAX_NAME_SZ	(32)
 
+/* Represents PCIe runtime errors reported to user space. */
+#define NVSCIC2C_PCIE_NO_ERROR				(0x00U)
+#define NVSCIC2C_PCIE_EDMA_XFER_ERROR			(0x01U)
+#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_FATAL		(0x02U)
+#define NVSCIC2C_PCIE_AER_UNCORRECTABLE_NONFATAL	(0x04U)
+
 /* Link status between the two peers - encapsulates PCIe link also.*/
 enum nvscic2c_pcie_link {
 	NVSCIC2C_PCIE_LINK_DOWN = 0,
 	NVSCIC2C_PCIE_LINK_UP,
 };
 
+/* Represents layout of link status memory. */
+struct nvscic2c_pcie_link_mem {
+	enum nvscic2c_pcie_link link_status;
+	__u32 aer_err;
+};
+
 /**
  * stream extensions - object type.
  */
@@ -70,6 +82,7 @@ struct nvscic2c_pcie_endpoint_info {
 	struct nvscic2c_pcie_endpoint_mem_info peer;
 	struct nvscic2c_pcie_endpoint_mem_info self;
 	struct nvscic2c_pcie_endpoint_mem_info link;
+	struct nvscic2c_pcie_endpoint_mem_info edma_err;
 };
 
 /**