Open source GPL/LGPL release

This commit is contained in:
svcmobrel-release
2022-07-21 16:03:29 -07:00
commit f338182221
2260 changed files with 576813 additions and 0 deletions

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report a Copy Engine (CE) error to safety services via CIC.
 *
 * g         - The GPU driver struct.
 * hw_unit   - HW unit ID; must be NVGPU_ERR_MODULE_CE.
 * inst      - CE instance that raised the interrupt.
 * err_id    - Error index within the CE error look-up table.
 * intr_info - Raw interrupt status; forwarded as sub_err_type so the
 *             consumer can decode it against the CE interrupt register
 *             definition.
 *
 * If the error message cannot be constructed or delivered, SW quiesce
 * is initiated.
 */
void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
		u32 inst, u32 err_id, u32 intr_info)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if (hw_unit != NVGPU_ERR_MODULE_CE) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_ce_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.ce_info.header.sub_unit_id = inst;
	err_pkt.err_desc = err_desc;
	/* sub_err_type can be decoded using intr_info by referring
	 * to the interrupt status register definition corresponding
	 * to the error that is being reported.
	 */
	err_pkt.err_info.ce_info.header.sub_err_type = intr_info;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.ce_info));

	/* report_err was NULL-checked on entry; the redundant re-check
	 * has been removed.
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report CE error: "
				"inst=%u err_id=%u intr_info=%u",
				inst, err_id, intr_info);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a SW-triggered CE error (test hook).
 *
 * Forwards to nvgpu_report_ce_err() with instance 0, passing
 * sub_err_type through as the raw intr_info payload.
 */
void nvgpu_inject_ce_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 sub_err_type)
{
	nvgpu_report_ce_err(g, hw_unit, 0U, err_index, sub_err_type);
}

View File

@@ -0,0 +1,161 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/kmem.h>
#include <nvgpu/log.h>
#include <nvgpu/cic.h>
#include <nvgpu/nvgpu_err_info.h>
#include "cic_priv.h"
/*
 * Allocate and initialize the common CIC state for this GPU.
 *
 * Idempotent: returns 0 immediately if g->cic already exists.
 * Delegates chip-specific LUT setup to g->ops.cic.init when present;
 * otherwise leaves the LUT empty (no error reporting possible).
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the
 * chip-specific init error code.
 */
int nvgpu_cic_init_common(struct gk20a *g)
{
	struct nvgpu_cic *cic;
	int err;

	if (g->cic != NULL) {
		cic_dbg(g, "CIC unit already initialized");
		return 0;
	}

	cic = nvgpu_kzalloc(g, sizeof(*cic));
	if (cic == NULL) {
		nvgpu_err(g, "Failed to allocate memory "
				"for struct nvgpu_cic");
		return -ENOMEM;
	}

	if (g->ops.cic.init != NULL) {
		err = g->ops.cic.init(g, cic);
		if (err != 0) {
			nvgpu_err(g, "CIC chip specific "
					"initialization failed.");
			/*
			 * cic is guaranteed non-NULL here, so the
			 * original "if (cic != NULL)" guard before the
			 * free was redundant and has been dropped.
			 */
			nvgpu_kfree(g, cic);
			return err;
		}
	} else {
		cic->err_lut = NULL;
		cic->num_hw_modules = 0;
	}

	g->cic = cic;
	cic_dbg(g, "CIC unit initialization done.");
	return 0;
}
/*
 * Tear down the common CIC state.
 *
 * Safe to call when CIC was never initialized (returns 0).
 * Clears the LUT pointer and module count before freeing, then
 * resets g->cic so a later init starts from scratch.
 */
int nvgpu_cic_deinit_common(struct gk20a *g)
{
	struct nvgpu_cic *cic = g->cic;

	if (cic == NULL) {
		cic_dbg(g, "CIC unit already deinitialized");
		return 0;
	}

	cic->err_lut = NULL;
	cic->num_hw_modules = 0;

	nvgpu_kfree(g, cic);
	g->cic = NULL;

	return 0;
}
/*
 * Validate a HW unit ID against the CIC error look-up table.
 *
 * Returns 0 when hw_unit_id indexes a valid LUT entry, -EINVAL when
 * CIC is not initialized, the LUT is empty, or the ID is out of range.
 */
int nvgpu_cic_check_hw_unit_id(struct gk20a *g, u32 hw_unit_id)
{
	struct nvgpu_cic *cic = g->cic;

	if (cic == NULL) {
		nvgpu_err(g, "CIC is not initialized");
		return -EINVAL;
	}

	if (cic->num_hw_modules == 0U) {
		cic_dbg(g, "LUT not initialized.");
		return -EINVAL;
	}

	if (hw_unit_id >= cic->num_hw_modules) {
		cic_dbg(g, "Invalid input HW unit ID.");
		return -EINVAL;
	}

	return 0;
}
/*
 * Validate an error ID for a given HW unit.
 *
 * Returns 0 when (hw_unit_id, err_id) names a valid LUT entry;
 * -EINVAL when CIC/LUT is uninitialized, the HW unit ID is invalid,
 * or err_id exceeds that unit's error count.
 */
int nvgpu_cic_check_err_id(struct gk20a *g, u32 hw_unit_id,
		u32 err_id)
{
	int ret;

	if ((g->cic == NULL) || (g->cic->err_lut == NULL)) {
		cic_dbg(g, "CIC/LUT not initialized.");
		return -EINVAL;
	}

	ret = nvgpu_cic_check_hw_unit_id(g, hw_unit_id);
	if (ret != 0) {
		return ret;
	}

	return (err_id >= g->cic->err_lut[hw_unit_id].num_errs) ?
			-EINVAL : 0;
}
/*
 * Look up the error descriptor for (hw_unit_id, err_id).
 *
 * On success, *err_desc points into the LUT (no ownership transfer).
 * Returns 0 on success or -EINVAL when the IDs do not name a valid
 * entry. The CIC/LUT NULL checks are performed inside
 * nvgpu_cic_check_err_id(), so they are not repeated here.
 */
int nvgpu_cic_get_err_desc(struct gk20a *g, u32 hw_unit_id,
		u32 err_id, struct nvgpu_err_desc **err_desc)
{
	int ret = nvgpu_cic_check_err_id(g, hw_unit_id, err_id);

	if (ret == 0) {
		*err_desc = &(g->cic->err_lut[hw_unit_id].errs[err_id]);
	}

	return ret;
}
int nvgpu_cic_get_num_hw_modules(struct gk20a *g)
{
if (g->cic == NULL) {
nvgpu_err(g, "CIC is not initialized");
return -EINVAL;
}
return g->cic->num_hw_modules;
}

View File

@@ -0,0 +1,251 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/cic.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/bug.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/trace.h>
/*
 * Block until all in-flight stalling interrupts are handled.
 *
 * timeout - wait bound passed through to NVGPU_COND_WAIT
 *           (NOTE(review): callers here pass 0; the semantics of a
 *           zero timeout depend on NVGPU_COND_WAIT — confirm it means
 *           "wait indefinitely").
 *
 * Returns the NVGPU_COND_WAIT result (0 on success).
 */
int nvgpu_cic_wait_for_stall_interrupts(struct gk20a *g, u32 timeout)
{
	/* wait until all stalling irqs are handled */
	return NVGPU_COND_WAIT(&g->mc.sw_irq_stall_last_handled_cond,
		nvgpu_atomic_read(&g->mc.sw_irq_stall_pending) == 0,
		timeout);
}
/*
 * Block until all in-flight non-stalling interrupts are handled.
 *
 * timeout - wait bound passed through to NVGPU_COND_WAIT
 *           (NOTE(review): callers here pass 0; confirm zero means
 *           "wait indefinitely" for NVGPU_COND_WAIT).
 *
 * Returns the NVGPU_COND_WAIT result (0 on success).
 */
int nvgpu_cic_wait_for_nonstall_interrupts(struct gk20a *g, u32 timeout)
{
	/* wait until all non-stalling irqs are handled */
	return NVGPU_COND_WAIT(&g->mc.sw_irq_nonstall_last_handled_cond,
		nvgpu_atomic_read(&g->mc.sw_irq_nonstall_pending) == 0,
		timeout);
}
/*
 * Wait for both stalling and non-stalling interrupt backlogs to drain.
 *
 * Failures are logged but not propagated; the second wait is attempted
 * even if the first one fails.
 */
void nvgpu_cic_wait_for_deferred_interrupts(struct gk20a *g)
{
	int rc;

	rc = nvgpu_cic_wait_for_stall_interrupts(g, 0U);
	if (rc != 0) {
		nvgpu_err(g, "wait for stall interrupts failed %d", rc);
	}

	rc = nvgpu_cic_wait_for_nonstall_interrupts(g, 0U);
	if (rc != 0) {
		nvgpu_err(g, "wait for nonstall interrupts failed %d", rc);
	}
}
/*
 * Mask all interrupts at the master controller, if the chip provides
 * an intr_mask op. The op runs under the interrupt spinlock.
 */
void nvgpu_cic_intr_mask(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	if (g->ops.mc.intr_mask == NULL) {
		return;
	}

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_mask(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
#ifdef CONFIG_NVGPU_NON_FUSA
/*
 * Log any still-pending interrupts via the chip-specific op, when
 * one is provided. No locking: the op only reads status.
 */
void nvgpu_cic_log_pending_intrs(struct gk20a *g)
{
	if (g->ops.mc.log_pending_intrs == NULL) {
		return;
	}

	g->ops.mc.log_pending_intrs(g);
}
/*
 * Enable interrupts at the master controller, if the chip provides an
 * intr_enable op. The op runs under the interrupt spinlock.
 */
void nvgpu_cic_intr_enable(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	if (g->ops.mc.intr_enable == NULL) {
		return;
	}

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_enable(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
#endif
/*
 * Enable or disable the stalling interrupt for one unit, under the
 * interrupt spinlock.
 */
void nvgpu_cic_intr_stall_unit_config(struct gk20a *g, u32 unit, bool enable)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_stall_unit_config(g, unit, enable);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Enable or disable the non-stalling interrupt for one unit, under
 * the interrupt spinlock.
 */
void nvgpu_cic_intr_nonstall_unit_config(struct gk20a *g, u32 unit, bool enable)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_nonstall_unit_config(g, unit, enable);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Pause (disable) stalling interrupt delivery, under the interrupt
 * spinlock.
 */
void nvgpu_cic_intr_stall_pause(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_stall_pause(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Resume (re-enable) stalling interrupt delivery, under the interrupt
 * spinlock.
 */
void nvgpu_cic_intr_stall_resume(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_stall_resume(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Pause (disable) non-stalling interrupt delivery, under the
 * interrupt spinlock.
 */
void nvgpu_cic_intr_nonstall_pause(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_nonstall_pause(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Resume (re-enable) non-stalling interrupt delivery, under the
 * interrupt spinlock.
 */
void nvgpu_cic_intr_nonstall_resume(struct gk20a *g)
{
	unsigned long irq_flags = 0;

	nvgpu_spinlock_irqsave(&g->mc.intr_lock, irq_flags);
	g->ops.mc.intr_nonstall_resume(g);
	nvgpu_spinunlock_irqrestore(&g->mc.intr_lock, irq_flags);
}
/*
 * Execute deferred non-stalling interrupt work.
 *
 * work_ops is a bitmask returned by isr_nonstall; currently the only
 * action is waking semaphore waiters (optionally posting events).
 *
 * The original "(cond) ? true : false" ternaries were redundant —
 * a != comparison already yields bool.
 */
static void nvgpu_cic_intr_nonstall_work(struct gk20a *g, u32 work_ops)
{
	bool semaphore_wakeup =
		((work_ops & NVGPU_CIC_NONSTALL_OPS_WAKEUP_SEMAPHORE) != 0U);
	bool post_events =
		((work_ops & NVGPU_CIC_NONSTALL_OPS_POST_EVENTS) != 0U);

	if (semaphore_wakeup) {
		g->ops.semaphore_wakeup(g, post_events);
	}
}
/*
 * Top-half ISR for non-stalling interrupts.
 *
 * Returns one of:
 *  NVGPU_CIC_INTR_UNMASK          - GPU is powered off.
 *  NVGPU_CIC_INTR_NONE            - interrupt was not ours (shared irq).
 *  NVGPU_CIC_INTR_QUIESCE_PENDING - SW quiesce in progress.
 *  NVGPU_CIC_INTR_HANDLE          - bottom half should run.
 */
u32 nvgpu_cic_intr_nonstall_isr(struct gk20a *g)
{
	u32 intr_status;

	if (nvgpu_is_powered_off(g)) {
		return NVGPU_CIC_INTR_UNMASK;
	}

	/* irq may be shared with other devices; bail out if this GPU
	 * raised nothing.
	 */
	intr_status = g->ops.mc.intr_nonstall(g);
	if (intr_status == 0U) {
		return NVGPU_CIC_INTR_NONE;
	}

	nvgpu_cic_intr_nonstall_pause(g);

	if (g->sw_quiesce_pending) {
		return NVGPU_CIC_INTR_QUIESCE_PENDING;
	}

	nvgpu_atomic_set(&g->mc.sw_irq_nonstall_pending, 1);

	return NVGPU_CIC_INTR_HANDLE;
}
/*
 * Bottom half for non-stalling interrupts: run the deferred work,
 * clear the pending flag, re-enable delivery, and wake any waiters.
 */
void nvgpu_cic_intr_nonstall_handle(struct gk20a *g)
{
	u32 ops_mask = g->ops.mc.isr_nonstall(g);
	int rc;

	if (ops_mask != 0U) {
		nvgpu_cic_intr_nonstall_work(g, ops_mask);
	}

	/* publish the handled state before interrupts are re-enabled */
	nvgpu_atomic_set(&g->mc.sw_irq_nonstall_pending, 0);
	nvgpu_cic_intr_nonstall_resume(g);

	rc = nvgpu_cond_broadcast(&g->mc.sw_irq_nonstall_last_handled_cond);
	if (rc != 0) {
		nvgpu_err(g, "nvgpu_cond_broadcast failed err=%d", rc);
	}
}
/*
 * Top-half ISR for stalling interrupts.
 *
 * Returns one of:
 *  NVGPU_CIC_INTR_UNMASK          - GPU is powered off.
 *  NVGPU_CIC_INTR_NONE            - interrupt was not ours (shared irq).
 *  NVGPU_CIC_INTR_QUIESCE_PENDING - SW quiesce in progress.
 *  NVGPU_CIC_INTR_HANDLE          - bottom half should run.
 */
u32 nvgpu_cic_intr_stall_isr(struct gk20a *g)
{
	u32 stall_intr;

	nvgpu_trace_intr_stall_start(g);

	if (nvgpu_is_powered_off(g)) {
		return NVGPU_CIC_INTR_UNMASK;
	}

	/* irq may be shared with other devices; bail out if this GPU
	 * raised nothing.
	 */
	stall_intr = g->ops.mc.intr_stall(g);
	if (stall_intr == 0U) {
		return NVGPU_CIC_INTR_NONE;
	}

	nvgpu_cic_intr_stall_pause(g);

	if (g->sw_quiesce_pending) {
		return NVGPU_CIC_INTR_QUIESCE_PENDING;
	}

	nvgpu_atomic_set(&g->mc.sw_irq_stall_pending, 1);
	/* NOTE(review): the done-trace fires only on the HANDLE path,
	 * not on early returns — confirm this is intentional. */
	nvgpu_trace_intr_stall_done(g);

	return NVGPU_CIC_INTR_HANDLE;
}
/*
 * Bottom half for stalling interrupts: run the chip ISR, clear the
 * pending flag, re-enable delivery, and wake any waiters.
 */
void nvgpu_cic_intr_stall_handle(struct gk20a *g)
{
	int rc;

	nvgpu_trace_intr_thread_stall_start(g);
	g->ops.mc.isr_stall(g);
	nvgpu_trace_intr_thread_stall_done(g);

	/* publish the handled state before interrupts are re-enabled */
	nvgpu_atomic_set(&g->mc.sw_irq_stall_pending, 0);
	nvgpu_cic_intr_stall_resume(g);

	rc = nvgpu_cond_broadcast(&g->mc.sw_irq_stall_last_handled_cond);
	if (rc != 0) {
		nvgpu_err(g, "nvgpu_cond_broadcast failed err=%d", rc);
	}
}

View File

@@ -0,0 +1,291 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef CIC_PRIV_H
#define CIC_PRIV_H
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_err_hw_module;
struct nvgpu_err_msg;
struct gpu_err_header;
/*
* @file
*
* Declare CIC's private structure to store error-policy LUT and
* other data and ops needed during error reporting.
*/
#define ERR_INJECT_TEST_PATTERN 0xA5
/*
* This struct contains members related to error-policy look-up table,
* number of units reporting errors.
*/
struct nvgpu_cic {
	/** Pointer for error look-up table; set by the chip-specific
	 *  g->ops.cic.init, or NULL when no chip init op exists. */
	struct nvgpu_err_hw_module *err_lut;
	/** Total number of GPU HW modules considered in CIC; 0 means
	 *  the LUT is unusable and validation fails. */
	u32 num_hw_modules;
};
/**
* @brief Inject ECC error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param inst [in] - Instance ID.
*
* - Sets values for error address and error count.
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_ecc_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 inst);
/**
* @brief Inject HOST error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param sub_err_type [in] - Sub error type.
*
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_host_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 sub_err_type);
/**
* @brief Inject GR error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param sub_err_type [in] - Sub error type.
*
* - Sets values for GR exception and SM machine check error information.
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_gr_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 sub_err_type);
/**
* @brief Inject CE error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param sub_err_type [in] - Sub error type.
*
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_ce_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 sub_err_type);
/**
* @brief Inject PRI error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param err_code [in] - Error code.
*
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_pri_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 err_code);
/**
* @brief Inject PMU error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param sub_err_type [in] - Sub error type.
*
* - Sets values for error info.
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_pmu_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 sub_err_type);
/**
* @brief Inject CTXSW error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param inst [in] - Instance ID.
*
* - Sets values for error info.
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_ctxsw_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 inst);
/**
* @brief Inject MMU error.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
* @param err_index [in] - Error index.
* @param sub_err_type [in] - Sub error type.
*
* - Sets values for mmu page fault info.
* - Invokes error reporting API with the required set of inputs.
*
* @return None
*/
void nvgpu_inject_mmu_swerror(struct gk20a *g, u32 hw_unit,
u32 err_index, u32 sub_err_type);
/**
* @brief Initialize error message header.
*
* @param header [in] - Error message header.
*
* This is used to initialize error message header.
*
* @return None
*/
void nvgpu_init_err_msg_header(struct gpu_err_header *header);
/**
* @brief Initialize error message.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is common
* for all HW units.
*
* @return None
*/
void nvgpu_init_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for HOST unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to HOST unit.
*
* @return None
*/
void nvgpu_init_host_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize ECC error message.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to ECC errors.
*
* @return None
*/
void nvgpu_init_ecc_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for PRI unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to PRI unit.
*
* @return None
*/
void nvgpu_init_pri_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for CE unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to CE unit.
*
* @return None
*/
void nvgpu_init_ce_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for PMU unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to PMU unit.
*
* @return None
*/
void nvgpu_init_pmu_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for GR unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to GR unit.
*
* @return None
*/
void nvgpu_init_gr_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for CTXSW.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to CTXSW.
*
* @return None
*/
void nvgpu_init_ctxsw_err_msg(struct nvgpu_err_msg *msg);
/**
* @brief Initialize error message for MMU unit.
*
* @param msg [in] - Error message.
*
* This is used to initialize error message that is specific to MMU unit.
*
* @return None
*/
void nvgpu_init_mmu_err_msg(struct nvgpu_err_msg *msg);
#endif /* CIC_PRIV_H */

View File

@@ -0,0 +1,97 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report a context-switch (FECS) error to safety services via CIC.
 *
 * g       - The GPU driver struct.
 * hw_unit - HW unit ID; must be NVGPU_ERR_MODULE_FECS.
 * err_id  - Error index within the FECS error look-up table.
 * data    - Pointer to a struct ctxsw_err_info with the HW state
 *           captured at the time of the error.
 *
 * If the error message cannot be constructed or delivered, SW quiesce
 * is initiated.
 */
void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
		void *data)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;
	u32 inst = 0;
	struct ctxsw_err_info *err_info = (struct ctxsw_err_info *)data;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if (hw_unit != NVGPU_ERR_MODULE_FECS) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for"
				" err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_ctxsw_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.ctxsw_info.header.sub_unit_id = inst;
	err_pkt.err_info.ctxsw_info.curr_ctx = err_info->curr_ctx;
	err_pkt.err_info.ctxsw_info.chid = err_info->chid;
	err_pkt.err_info.ctxsw_info.ctxsw_status0 = err_info->ctxsw_status0;
	err_pkt.err_info.ctxsw_info.ctxsw_status1 = err_info->ctxsw_status1;
	err_pkt.err_info.ctxsw_info.mailbox_value = err_info->mailbox_value;
	err_pkt.err_desc = err_desc;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.ctxsw_info));

	/* report_err was NULL-checked on entry; the redundant re-check
	 * has been removed.
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report CTXSW error: "
				"err_id=%u, mailbox_val=%u",
				err_id, err_info->mailbox_value);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a SW-triggered CTXSW error (test hook).
 *
 * The ctxsw_err_info payload is filled with a fixed test pattern
 * (ERR_INJECT_TEST_PATTERN repeated bytes) before being reported;
 * the inst parameter is currently unused by this path.
 */
void nvgpu_inject_ctxsw_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 inst)
{
	struct ctxsw_err_info err_info;

	(void)memset(&err_info, ERR_INJECT_TEST_PATTERN, sizeof(err_info));
	nvgpu_report_ctxsw_err(g, hw_unit, err_index, (void *)&err_info);
}

View File

@@ -0,0 +1,87 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report an ECC error to safety services via CIC.
 *
 * g         - The GPU driver struct.
 * hw_unit   - HW unit ID (ECC errors can come from several units, so
 *             no single-unit validation is done here; the LUT lookup
 *             rejects invalid IDs).
 * inst      - Sub-unit instance that detected the error.
 * err_id    - Error index within the unit's error look-up table.
 * err_addr  - Faulting address recorded by HW.
 * err_count - Error count recorded by HW.
 *
 * If the error message cannot be constructed or delivered, SW quiesce
 * is initiated.
 */
void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, u64 err_addr, u64 err_count)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_ecc_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.ecc_info.header.sub_unit_id = inst;
	err_pkt.err_info.ecc_info.header.address = err_addr;
	err_pkt.err_info.ecc_info.err_cnt = err_count;
	err_pkt.err_desc = err_desc;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.ecc_info));

	/* report_err was NULL-checked on entry; the redundant re-check
	 * has been removed.
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report ECC error: hw_unit=%u, inst=%u, "
				"err_id=%u, err_addr=%llu, err_count=%llu",
				hw_unit, inst, err_id, err_addr, err_count);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a SW-triggered ECC error (test hook).
 *
 * Uses ERR_INJECT_TEST_PATTERN as a recognizable placeholder for both
 * the fault address and the error count.
 */
void nvgpu_inject_ecc_swerror(struct gk20a *g, u32 hw_unit, u32 err_index,
		u32 inst)
{
	const u64 fake_addr = (u64)ERR_INJECT_TEST_PATTERN;
	const u64 fake_count = (u64)ERR_INJECT_TEST_PATTERN;

	nvgpu_report_ecc_err(g, hw_unit, inst, err_index,
			fake_addr, fake_count);
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Copy the unit-specific error payload into the outgoing message:
 * SM machine-check info for NVGPU_ERR_MODULE_SM, GR exception info
 * otherwise (caller guarantees hw_unit is SM or PGRAPH).
 *
 * NOTE(review): "nvpgu" in the name looks like a typo for "nvgpu";
 * kept as-is because the caller in this file uses this spelling.
 */
static void nvpgu_report_fill_err_info(u32 hw_unit,
		struct nvgpu_err_msg *err_pkt, struct gr_err_info *err_info)
{
	if (hw_unit == NVGPU_ERR_MODULE_SM) {
		struct gr_sm_mcerr_info *src = err_info->sm_mcerr_info;

		err_pkt->err_info.sm_info.warp_esr_pc = src->hww_warp_esr_pc;
		err_pkt->err_info.sm_info.warp_esr_status =
				src->hww_warp_esr_status;
		err_pkt->err_info.sm_info.curr_ctx = src->curr_ctx;
		err_pkt->err_info.sm_info.chid = src->chid;
		err_pkt->err_info.sm_info.tsgid = src->tsgid;
		err_pkt->err_info.sm_info.gpc = src->gpc;
		err_pkt->err_info.sm_info.tpc = src->tpc;
		err_pkt->err_info.sm_info.sm = src->sm;
	} else {
		struct gr_exception_info *src = err_info->exception_info;

		err_pkt->err_info.gr_info.curr_ctx = src->curr_ctx;
		err_pkt->err_info.gr_info.chid = src->chid;
		err_pkt->err_info.gr_info.tsgid = src->tsgid;
		err_pkt->err_info.gr_info.status = src->status;
	}
}
/*
 * Report a GR (SM or PGRAPH) error to safety services via CIC.
 *
 * g            - The GPU driver struct.
 * hw_unit      - NVGPU_ERR_MODULE_SM or NVGPU_ERR_MODULE_PGRAPH.
 * inst         - Sub-unit instance that raised the error.
 * err_id       - Error index within the GR error look-up table.
 * err_info     - Captured SM machine-check or GR exception state.
 * sub_err_type - Sub error type forwarded in the message header.
 *
 * If the error message cannot be constructed or delivered, SW quiesce
 * is initiated.
 */
void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, struct gr_err_info *err_info, u32 sub_err_type)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if ((hw_unit != NVGPU_ERR_MODULE_SM) &&
			(hw_unit != NVGPU_ERR_MODULE_PGRAPH)) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_gr_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_desc = err_desc;
	err_pkt.err_info.gr_info.header.sub_err_type = sub_err_type;
	err_pkt.err_info.gr_info.header.sub_unit_id = inst;
	nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info);
	/*
	 * NOTE(review): this uses sizeof the whole err_info union,
	 * while other units use the specific member size — confirm
	 * this is intentional.
	 */
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info));

	/* report_err was NULL-checked on entry; the redundant re-check
	 * has been removed.
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		if (hw_unit == NVGPU_ERR_MODULE_SM) {
			/* Fixed: adjacent format strings previously
			 * concatenated with no separator, printing
			 * "...exceptiongpc=...".
			 */
			nvgpu_err(g, "Failed to report SM exception: "
					"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
					err_pkt.err_info.sm_info.gpc,
					err_pkt.err_info.sm_info.tpc,
					err_pkt.err_info.sm_info.sm,
					err_pkt.err_info.sm_info.warp_esr_status);
		} else {
			/* Fixed: "PGRAPH""exception" previously printed
			 * "PGRAPHexception". hw_unit is PGRAPH here by
			 * the validation above, so else suffices.
			 */
			nvgpu_err(g, "Failed to report PGRAPH "
					"exception: inst=%u, err_id=%u, "
					"status=%u", inst, err_id,
					err_pkt.err_info.gr_info.status);
		}
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a SW-triggered GR error (test hook).
 *
 * Fills the unit-appropriate payload (SM machine-check info or GR
 * exception info) with a fixed test pattern and forwards it to
 * nvgpu_report_gr_err(). Unsupported hw_unit values are logged and
 * ignored.
 */
void nvgpu_inject_gr_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 sub_err_type)
{
	struct gr_err_info err_info;
	struct gr_exception_info gr_error_info;
	struct gr_sm_mcerr_info sm_error_info;
	u32 inst = 0U;

	/* Fixed test-pattern data for the error message payload. */
	(void)memset(&gr_error_info, ERR_INJECT_TEST_PATTERN,
			sizeof(gr_error_info));
	(void)memset(&sm_error_info, ERR_INJECT_TEST_PATTERN,
			sizeof(sm_error_info));

	if (hw_unit == NVGPU_ERR_MODULE_PGRAPH) {
		err_info.exception_info = &gr_error_info;
	} else if (hw_unit == NVGPU_ERR_MODULE_SM) {
		err_info.sm_mcerr_info = &sm_error_info;
	} else {
		nvgpu_err(g, "unsupported hw_unit(%u)", hw_unit);
		return;
	}

	nvgpu_report_gr_err(g, hw_unit, inst, err_index,
			&err_info, sub_err_type);
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report a HOST unit error to the safety services via the CIC
 * error-reporting callback.
 *
 * @g:         GPU driver context.
 * @hw_unit:   must be NVGPU_ERR_MODULE_HOST.
 * @inst:      sub-unit instance that raised the interrupt.
 * @err_id:    error identifier used to look up the error descriptor.
 * @intr_info: raw interrupt status, forwarded as sub_err_type.
 *
 * On any failure (invalid hw_unit, unknown err_id, or a failure in
 * the report callback itself) the GPU is put into SW quiesce.
 */
void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
		u32 inst, u32 err_id, u32 intr_info)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if (hw_unit != NVGPU_ERR_MODULE_HOST) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_host_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.host_info.header.sub_unit_id = inst;
	err_pkt.err_desc = err_desc;
	/* sub_err_type can be decoded using intr_info by referring
	 * to the interrupt status register definition corresponding
	 * to the error that is being reported.
	 */
	err_pkt.err_info.host_info.header.sub_err_type = intr_info;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.host_info));

	/* report_err was NULL-checked on entry; call it directly
	 * (the previous re-check here was dead code).
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report HOST error: "
				"inst=%u, err_id=%u, intr_info=%u",
				inst, err_id, intr_info);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a software-triggered HOST error: forwards to
 * nvgpu_report_host_err() with sub-unit instance 0.
 */
void nvgpu_inject_host_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 sub_err_type)
{
	const u32 inst = 0U;

	nvgpu_report_host_err(g, hw_unit, inst, err_index, sub_err_type);
}

View File

@@ -0,0 +1,131 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
struct mmu_fault_info *fault_info, u32 status, u32 sub_err_type)
{
int err = 0;
struct nvgpu_err_desc *err_desc = NULL;
struct nvgpu_err_msg err_pkt;
if (g->ops.cic.report_err == NULL) {
cic_dbg(g, "CIC does not support reporting error "
"to safety services");
return;
}
if (hw_unit != NVGPU_ERR_MODULE_HUBMMU) {
nvgpu_err(g, "invalid hw module (%u)", hw_unit);
err = -EINVAL;
goto handle_report_failure;
}
err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
if (err != 0) {
nvgpu_err(g, "Failed to get err_desc for "
"err_id (%u) for hw module (%u)",
err_id, hw_unit);
goto handle_report_failure;
}
nvgpu_init_mmu_err_msg(&err_pkt);
err_pkt.hw_unit_id = hw_unit;
err_pkt.err_id = err_desc->error_id;
err_pkt.is_critical = err_desc->is_critical;
err_pkt.err_info.mmu_info.header.sub_err_type = sub_err_type;
err_pkt.err_info.mmu_info.status = status;
/* Copy contents of mmu_fault_info */
if (fault_info != NULL) {
err_pkt.err_info.mmu_info.info.inst_ptr = fault_info->inst_ptr;
err_pkt.err_info.mmu_info.info.inst_aperture
= fault_info->inst_aperture;
err_pkt.err_info.mmu_info.info.fault_addr
= fault_info->fault_addr;
err_pkt.err_info.mmu_info.info.fault_addr_aperture
= fault_info->fault_addr_aperture;
err_pkt.err_info.mmu_info.info.timestamp_lo
= fault_info->timestamp_lo;
err_pkt.err_info.mmu_info.info.timestamp_hi
= fault_info->timestamp_hi;
err_pkt.err_info.mmu_info.info.mmu_engine_id
= fault_info->mmu_engine_id;
err_pkt.err_info.mmu_info.info.gpc_id = fault_info->gpc_id;
err_pkt.err_info.mmu_info.info.client_type
= fault_info->client_type;
err_pkt.err_info.mmu_info.info.client_id
= fault_info->client_id;
err_pkt.err_info.mmu_info.info.fault_type
= fault_info->fault_type;
err_pkt.err_info.mmu_info.info.access_type
= fault_info->access_type;
err_pkt.err_info.mmu_info.info.protected_mode
= fault_info->protected_mode;
err_pkt.err_info.mmu_info.info.replayable_fault
= fault_info->replayable_fault;
err_pkt.err_info.mmu_info.info.replay_fault_en
= fault_info->replay_fault_en;
err_pkt.err_info.mmu_info.info.valid = fault_info->valid;
err_pkt.err_info.mmu_info.info.faulted_pbdma =
fault_info->faulted_pbdma;
err_pkt.err_info.mmu_info.info.faulted_engine =
fault_info->faulted_engine;
err_pkt.err_info.mmu_info.info.faulted_subid =
fault_info->faulted_subid;
err_pkt.err_info.mmu_info.info.chid = fault_info->chid;
}
err_pkt.err_desc = err_desc;
err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
sizeof(err_pkt.err_info.mmu_info));
if (g->ops.cic.report_err != NULL) {
err = g->ops.cic.report_err(g, (void *)&err_pkt,
sizeof(err_pkt), err_desc->is_critical);
if (err != 0) {
nvgpu_err(g, "Failed to report MMU fault: hw_unit=%u, "
"err_id=%u, sub_err_type=%u, status=%u",
hw_unit, err_id, sub_err_type, status);
}
}
handle_report_failure:
if (err != 0) {
nvgpu_sw_quiesce(g);
}
}
/*
 * Inject a software-triggered HUBMMU error using a fixed test
 * pattern as the fault information and a zero fault status.
 */
void nvgpu_inject_mmu_swerror(struct gk20a *g, u32 hw_unit, u32 err_index,
		u32 sub_err_type)
{
	struct mmu_fault_info fault;
	const u32 fault_status = 0U;

	(void)memset(&fault, ERR_INJECT_TEST_PATTERN, sizeof(fault));
	nvgpu_report_mmu_err(g, hw_unit, err_index, &fault,
			fault_status, sub_err_type);
}

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/string.h>
#include "cic_priv.h"
/*
 * Reset a gpu_err_header to its defaults: message format version
 * 1.0 with all remaining fields cleared.
 */
void nvgpu_init_err_msg_header(struct gpu_err_header *header)
{
	header->version.major = (u16)1U;
	header->version.minor = (u16)0U;
	header->timestamp_ns = 0UL;
	header->address = 0UL;
	header->sub_unit_id = 0UL;
	header->sub_err_type = 0U;
}
/*
 * Zero an nvgpu_err_msg. The memset clears the whole structure;
 * the assignments below make the reset of the top-level fields
 * explicit.
 */
void nvgpu_init_err_msg(struct nvgpu_err_msg *msg)
{
	(void)memset(msg, 0, sizeof(*msg));
	msg->err_size = (u8)0U;
	msg->err_id = (u8)0U;
	msg->is_critical = false;
	msg->hw_unit_id = 0U;
}
/*
 * Initialize an error message for HOST error reporting: clear the
 * whole message, then reset the host_info payload header.
 */
void nvgpu_init_host_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.host_info.header);
}
/*
 * Initialize an error message for ECC error reporting: clear the
 * whole message, zero the ECC error counter and reset the
 * ecc_info payload header.
 */
void nvgpu_init_ecc_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	msg->err_info.ecc_info.err_cnt = 0UL;
	nvgpu_init_err_msg_header(&msg->err_info.ecc_info.header);
}
/*
 * Initialize an error message for PRI error reporting: clear the
 * whole message, then reset the pri_info payload header.
 */
void nvgpu_init_pri_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.pri_info.header);
}
/*
 * Initialize an error message for CE error reporting: clear the
 * whole message, then reset the ce_info payload header.
 */
void nvgpu_init_ce_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.ce_info.header);
}
/*
 * Initialize an error message for PMU error reporting: clear the
 * whole message, zero the PMU status word and reset the
 * pmu_err_info payload header.
 */
void nvgpu_init_pmu_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	msg->err_info.pmu_err_info.status = 0U;
	nvgpu_init_err_msg_header(&msg->err_info.pmu_err_info.header);
}
/*
 * Initialize an error message for GR error reporting: clear the
 * whole message, reset the gr_info payload header and zero the
 * GR-specific context/status fields.
 */
void nvgpu_init_gr_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.gr_info.header);
	msg->err_info.gr_info.status = 0U;
	msg->err_info.gr_info.tsgid = 0U;
	msg->err_info.gr_info.chid = 0U;
	msg->err_info.gr_info.curr_ctx = 0U;
}
/*
 * Initialize an error message for CTXSW error reporting: clear the
 * whole message, reset the ctxsw_info payload header and zero the
 * context-switch specific fields.
 */
void nvgpu_init_ctxsw_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.ctxsw_info.header);
	msg->err_info.ctxsw_info.mailbox_value = 0U;
	msg->err_info.ctxsw_info.ctxsw_status1 = 0U;
	msg->err_info.ctxsw_info.ctxsw_status0 = 0U;
	msg->err_info.ctxsw_info.chid = 0U;
	msg->err_info.ctxsw_info.tsgid = 0U;
	msg->err_info.ctxsw_info.curr_ctx = 0U;
}
/*
 * Initialize an error message for HUBMMU error reporting: clear
 * the whole message, reset the mmu_info payload header and zero
 * every decoded fault-info field plus the fault status word.
 */
void nvgpu_init_mmu_err_msg(struct nvgpu_err_msg *msg)
{
	nvgpu_init_err_msg(msg);
	nvgpu_init_err_msg_header(&msg->err_info.mmu_info.header);
	msg->err_info.mmu_info.status = 0U;
	msg->err_info.mmu_info.info.chid = 0U;
	msg->err_info.mmu_info.info.faulted_subid = 0U;
	msg->err_info.mmu_info.info.faulted_engine = 0U;
	msg->err_info.mmu_info.info.faulted_pbdma = 0U;
	msg->err_info.mmu_info.info.valid = false;
	msg->err_info.mmu_info.info.replay_fault_en = 0U;
	msg->err_info.mmu_info.info.replayable_fault = false;
	msg->err_info.mmu_info.info.protected_mode = 0U;
	msg->err_info.mmu_info.info.access_type = 0U;
	msg->err_info.mmu_info.info.fault_type = 0U;
	msg->err_info.mmu_info.info.client_id = 0U;
	msg->err_info.mmu_info.info.client_type = 0U;
	msg->err_info.mmu_info.info.gpc_id = 0U;
	msg->err_info.mmu_info.info.mmu_engine_id = 0U;
	msg->err_info.mmu_info.info.timestamp_hi = 0U;
	msg->err_info.mmu_info.info.timestamp_lo = 0U;
	msg->err_info.mmu_info.info.fault_addr_aperture = 0U;
	msg->err_info.mmu_info.info.fault_addr = 0UL;
	msg->err_info.mmu_info.info.inst_aperture = 0U;
	msg->err_info.mmu_info.info.inst_ptr = 0UL;
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report a PMU error to the safety services via the CIC
 * error-reporting callback.
 *
 * @g:            GPU driver context.
 * @hw_unit:      must be NVGPU_ERR_MODULE_PMU.
 * @err_id:       error identifier used to look up the error descriptor.
 * @sub_err_type: PMU error sub-classification forwarded in the header.
 * @status:       raw PMU status value carried in the payload.
 *
 * On any failure (invalid hw_unit, unknown err_id, or a failure in
 * the report callback itself) the GPU is put into SW quiesce.
 */
void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
		u32 sub_err_type, u32 status)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if (hw_unit != NVGPU_ERR_MODULE_PMU) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_pmu_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.pmu_err_info.status = status;
	err_pkt.err_info.pmu_err_info.header.sub_err_type = sub_err_type;
	err_pkt.err_desc = err_desc;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.pmu_err_info));

	/* report_err was NULL-checked on entry; call it directly
	 * (the previous re-check here was dead code).
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report PMU error: "
				"err_id=%u, sub_err_type=%u, status=%u",
				err_id, sub_err_type, status);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a software-triggered PMU error using a fixed test pattern
 * as the PMU status value.
 */
void nvgpu_inject_pmu_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 sub_err_type)
{
	const u32 status = (u32)ERR_INJECT_TEST_PATTERN;

	nvgpu_report_pmu_err(g, hw_unit, err_index, sub_err_type, status);
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_err_info.h>
#include <nvgpu/cic.h>
#include "cic_priv.h"
/*
 * Report a PRI (privileged register interface) error to the safety
 * services via the CIC error-reporting callback.
 *
 * @g:        GPU driver context.
 * @hw_unit:  must be NVGPU_ERR_MODULE_PRI.
 * @inst:     sub-unit instance that raised the error.
 * @err_id:   error identifier used to look up the error descriptor.
 * @err_addr: faulting PRI address, stored in the payload header.
 * @err_code: raw error code, forwarded as sub_err_type.
 *
 * On any failure (invalid hw_unit, unknown err_id, or a failure in
 * the report callback itself) the GPU is put into SW quiesce.
 */
void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, u32 err_addr, u32 err_code)
{
	int err = 0;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_msg err_pkt;

	if (g->ops.cic.report_err == NULL) {
		cic_dbg(g, "CIC does not support reporting error "
				"to safety services");
		return;
	}

	if (hw_unit != NVGPU_ERR_MODULE_PRI) {
		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
		err = -EINVAL;
		goto handle_report_failure;
	}

	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
	if (err != 0) {
		nvgpu_err(g, "Failed to get err_desc for "
				"err_id (%u) for hw module (%u)",
				err_id, hw_unit);
		goto handle_report_failure;
	}

	nvgpu_init_pri_err_msg(&err_pkt);
	err_pkt.hw_unit_id = hw_unit;
	err_pkt.err_id = err_desc->error_id;
	err_pkt.is_critical = err_desc->is_critical;
	err_pkt.err_info.pri_info.header.sub_unit_id = inst;
	err_pkt.err_info.pri_info.header.address = (u64) err_addr;
	err_pkt.err_desc = err_desc;
	/* sub_err_type can be decoded using err_code by referring
	 * to the FECS pri error codes.
	 */
	err_pkt.err_info.pri_info.header.sub_err_type = err_code;
	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
			sizeof(err_pkt.err_info.pri_info));

	/* report_err was NULL-checked on entry; call it directly
	 * (the previous re-check here was dead code).
	 */
	err = g->ops.cic.report_err(g, (void *)&err_pkt,
			sizeof(err_pkt), err_desc->is_critical);
	if (err != 0) {
		nvgpu_err(g, "Failed to report PRI error: "
				"inst=%u, err_id=%u, err_code=%u",
				inst, err_id, err_code);
	}

handle_report_failure:
	if (err != 0) {
		nvgpu_sw_quiesce(g);
	}
}
/*
 * Inject a software-triggered PRI error: forwards to
 * nvgpu_report_pri_err() with instance 0 and a zero error address.
 */
void nvgpu_inject_pri_swerror(struct gk20a *g, u32 hw_unit,
		u32 err_index, u32 err_code)
{
	const u32 inst = 0U;
	const u32 err_addr = 0U;

	nvgpu_report_pri_err(g, hw_unit, inst, err_index, err_addr, err_code);
}