diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index b4b5af375..361e366d5 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -1054,7 +1054,19 @@ grmgr: cic: owner: Tejal K safe: yes - sources: [ include/nvgpu/nvgpu_cic.h ] + sources: [ common/cic/cic.c, + common/cic/ce_cic.c, + common/cic/ctxsw_cic.c, + common/cic/msg_cic.c, + common/cic/ecc_cic.c, + common/cic/host_cic.c, + common/cic/gr_cic.c, + common/cic/pri_cic.c, + common/cic/pmu_cic.c, + common/cic/mmu_cic.c, + common/cic/cic_priv.h, + include/nvgpu/gops/cic.h, + include/nvgpu/cic.h ] ## ## HAL units. Currently they are under common but this needs to change. diff --git a/arch/nvgpu-hal-new.yaml b/arch/nvgpu-hal-new.yaml index 384680a84..979451eec 100644 --- a/arch/nvgpu-hal-new.yaml +++ b/arch/nvgpu-hal-new.yaml @@ -827,3 +827,10 @@ tpc: owner: Divya S sources: [ hal/tpc/tpc_gv11b.c, hal/tpc/tpc_gv11b.h ] + +cic: + safe: yes + owner: Tejal K + sources: [ hal/cic/cic_gv11b_fusa.c, + hal/cic/cic_lut_gv11b_fusa.c, + hal/cic/cic_gv11b.h ] diff --git a/arch/nvgpu-linux.yaml b/arch/nvgpu-linux.yaml index d7b9dc4bd..8beff0201 100644 --- a/arch/nvgpu-linux.yaml +++ b/arch/nvgpu-linux.yaml @@ -227,8 +227,8 @@ vgpu: vm: sources: [ os/linux/vm.c ] -sdl: - sources: [ os/linux/sdl/sdl_stub.c ] +cic: + sources: [ os/linux/cic/cic_stub.c ] # Group all the Linux headers for now. 
headers: diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 849be93c2..42a147e35 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -308,6 +308,16 @@ nvgpu-y += \ common/clk_arb/clk_arb_gp10b.o \ common/rc/rc.o \ common/grmgr/grmgr.o \ + common/cic/cic.o \ + common/cic/ce_cic.o \ + common/cic/ctxsw_cic.o \ + common/cic/ecc_cic.o \ + common/cic/host_cic.o \ + common/cic/gr_cic.o \ + common/cic/pri_cic.o \ + common/cic/pmu_cic.o \ + common/cic/mmu_cic.o \ + common/cic/msg_cic.o \ hal/bus/bus_gk20a.o \ hal/class/class_gm20b.o \ hal/class/class_gp10b.o \ @@ -380,7 +390,9 @@ nvgpu-y += \ hal/top/top_gp106.o \ hal/top/top_gp10b.o \ hal/tpc/tpc_gv11b.o \ - hal/priv_ring/priv_ring_gv11b.o + hal/priv_ring/priv_ring_gv11b.o \ + hal/cic/cic_gv11b_fusa.o \ + hal/cic/cic_lut_gv11b_fusa.o # Linux specific parts of nvgpu. nvgpu-y += \ @@ -418,7 +430,7 @@ nvgpu-y += \ os/linux/dt.o \ os/linux/ecc_sysfs.o \ os/linux/bsearch.o \ - os/linux/sdl/sdl_stub.o \ + os/linux/cic/cic_stub.o \ os/linux/dmabuf_priv.o \ os/linux/power_ops.o diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index 58e91ba66..f63591443 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -152,6 +152,16 @@ srcs += common/device.c \ common/rc/rc.c \ common/ce/ce.c \ common/grmgr/grmgr.c \ + common/cic/cic.c \ + common/cic/ce_cic.c \ + common/cic/ctxsw_cic.c \ + common/cic/ecc_cic.c \ + common/cic/host_cic.c \ + common/cic/gr_cic.c \ + common/cic/pri_cic.c \ + common/cic/pmu_cic.c \ + common/cic/mmu_cic.c \ + common/cic/msg_cic.c \ hal/init/hal_gv11b.c \ hal/init/hal_gv11b_litter.c \ hal/init/hal_init.c \ @@ -246,7 +256,9 @@ srcs += hal/mm/mm_gv11b_fusa.c \ hal/sync/syncpt_cmdbuf_gv11b_fusa.c \ hal/therm/therm_gv11b_fusa.c \ hal/top/top_gm20b_fusa.c \ - hal/top/top_gv11b_fusa.c + hal/top/top_gv11b_fusa.c \ + hal/cic/cic_gv11b_fusa.c \ + hal/cic/cic_lut_gv11b_fusa.c # Source files below are not 
guaranteed to be functionaly safe (FuSa) and are # only included in the normal build. diff --git a/drivers/gpu/nvgpu/common/cic/ce_cic.c b/drivers/gpu/nvgpu/common/cic/ce_cic.c new file mode 100644 index 000000000..d608ea76b --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/ce_cic.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "cic_priv.h" + +void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, + u32 inst, u32 err_id, u32 intr_info) +{ + int err = 0; + struct nvgpu_err_desc *err_desc = NULL; + struct nvgpu_err_msg err_pkt; + + if (g->ops.cic.report_err == NULL) { + cic_dbg(g, "CIC does not support reporting error " + "to safety services"); + return; + } + + if (hw_unit != NVGPU_ERR_MODULE_CE) { + nvgpu_err(g, "invalid hw module (%u)", hw_unit); + err = -EINVAL; + goto handle_report_failure; + } + + err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc); + if (err != 0) { + nvgpu_err(g, "Failed to get err_desc for " + "err_id (%u) for hw module (%u)", + err_id, hw_unit); + goto handle_report_failure; + } + + nvgpu_init_ce_err_msg(&err_pkt); + err_pkt.hw_unit_id = hw_unit; + err_pkt.err_id = err_desc->error_id; + err_pkt.is_critical = err_desc->is_critical; + err_pkt.err_info.ce_info.header.sub_unit_id = inst; + err_pkt.err_desc = err_desc; + /* sub_err_type can be decoded using intr_info by referring + * to the interrupt status register definition corresponding + * to the error that is being reported. 
+ */ + err_pkt.err_info.ce_info.header.sub_err_type = intr_info; + err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( + sizeof(err_pkt.err_info.ce_info)); + + if (g->ops.cic.report_err != NULL) { + err = g->ops.cic.report_err(g, (void *)&err_pkt, + sizeof(err_pkt), err_desc->is_critical); + if (err != 0) { + nvgpu_err(g, "Failed to report CE error: " + "inst=%u err_id=%u intr_info=%u", + inst, err_id, intr_info); + } + } +handle_report_failure: + if (err != 0) { + nvgpu_sw_quiesce(g); + } +} + +void nvgpu_inject_ce_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type) +{ + nvgpu_report_ce_err(g, hw_unit, 0U, err_index, sub_err_type); +} diff --git a/drivers/gpu/nvgpu/common/cic/cic.c b/drivers/gpu/nvgpu/common/cic/cic.c new file mode 100644 index 000000000..12d3877aa --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/cic.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "cic_priv.h" + +int nvgpu_cic_init_common(struct gk20a *g) +{ + struct nvgpu_cic *cic; + int err = 0; + + if (g->cic != NULL) { + cic_dbg(g, "CIC unit already initialized"); + return 0; + } + + cic = nvgpu_kzalloc(g, sizeof(*cic)); + if (cic == NULL) { + nvgpu_err(g, "Failed to allocate memory " + "for struct nvgpu_cic"); + return -ENOMEM; + } + + if (g->ops.cic.init != NULL) { + err = g->ops.cic.init(g, cic); + if (err != 0) { + nvgpu_err(g, "CIC chip specific " + "initialization failed."); + goto cleanup; + } + } else { + cic->err_lut = NULL; + cic->num_hw_modules = 0; + } + + g->cic = cic; + cic_dbg(g, "CIC unit initialization done."); + return 0; + +cleanup: + if (cic != NULL) { + nvgpu_kfree(g, cic); + } + return err; +} + +int nvgpu_cic_deinit_common(struct gk20a *g) +{ + struct nvgpu_cic *cic; + + cic = g->cic; + + if (cic == NULL) { + cic_dbg(g, "CIC unit already deinitialized"); + return 0; + } + + cic->err_lut = NULL; + cic->num_hw_modules = 0; + + nvgpu_kfree(g, cic); + g->cic = NULL; + + return 0; +} + +int nvgpu_cic_check_hw_unit_id(struct gk20a *g, u32 hw_unit_id) +{ + if (g->cic == NULL) { + nvgpu_err(g, "CIC is not initialized"); + return -EINVAL; + } + + if (g->cic->num_hw_modules == 0U) { + cic_dbg(g, "LUT not initialized."); + return -EINVAL; + } + + if (hw_unit_id >= g->cic->num_hw_modules) { + cic_dbg(g, "Invalid input HW unit ID."); + return -EINVAL; + } + + return 0; +} + +int nvgpu_cic_check_err_id(struct gk20a *g, u32 hw_unit_id, + u32 err_id) +{ + int err = 0; + + if ((g->cic == NULL) || (g->cic->err_lut == NULL)) { + cic_dbg(g, "CIC/LUT not initialized."); + return -EINVAL; + } + + err = 
nvgpu_cic_check_hw_unit_id(g, hw_unit_id); + if (err != 0) { + return err; + } + + if (err_id >= g->cic->err_lut[hw_unit_id].num_errs) { + err = -EINVAL; + } + + return err; +} + +int nvgpu_cic_get_err_desc(struct gk20a *g, u32 hw_unit_id, + u32 err_id, struct nvgpu_err_desc **err_desc) +{ + int err = 0; + + /* if (g->cic != NULL) and (g->cic->err_lut != NULL) check + * can be skipped here as it checked as part of + * nvgpu_cic_check_err_id() called below. + */ + + err = nvgpu_cic_check_err_id(g, hw_unit_id, err_id); + if (err != 0) { + return err; + } + + *err_desc = &(g->cic->err_lut[hw_unit_id].errs[err_id]); + + return err; +} + +int nvgpu_cic_get_num_hw_modules(struct gk20a *g) +{ + if (g->cic == NULL) { + nvgpu_err(g, "CIC is not initialized"); + return -EINVAL; + } + + return g->cic->num_hw_modules; +} diff --git a/drivers/gpu/nvgpu/common/cic/cic_priv.h b/drivers/gpu/nvgpu/common/cic/cic_priv.h new file mode 100644 index 000000000..526d7b461 --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/cic_priv.h @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef CIC_PRIV_H +#define CIC_PRIV_H + +#include + +struct gk20a; +struct nvgpu_err_hw_module; +struct nvgpu_err_msg; +struct gpu_err_header; + +/* + * @file + * + * Declare CIC's private structure to store error-policy LUT and + * other data and ops needed during error reporting. + */ + +#define ERR_INJECT_TEST_PATTERN 0xA5 + +/* + * This struct contains members related to error-policy look-up table, + * number of units reporting errors. + */ +struct nvgpu_cic { + /** Pointer for error look-up table. */ + struct nvgpu_err_hw_module *err_lut; + + /** Total number of GPU HW modules considered in CIC. */ + u32 num_hw_modules; + +}; + +/** + * @brief Inject ECC error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param inst [in] - Instance ID. + * + * - Sets values for error address and error count. + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_ecc_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 inst); + +/** + * @brief Inject HOST error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param sub_err_type [in] - Sub error type. + * + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_host_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type); + +/** + * @brief Inject GR error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param sub_err_type [in] - Sub error type. 
+ * + * - Sets values for GR exception and SM machine check error information. + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_gr_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type); + +/** + * @brief Inject CE error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param sub_err_type [in] - Sub error type. + * + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_ce_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type); + +/** + * @brief Inject PRI error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param err_code [in] - Error code. + * + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_pri_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 err_code); + +/** + * @brief Inject PMU error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param sub_err_type [in] - Sub error type. + * + * - Sets values for error info. + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_pmu_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type); + +/** + * @brief Inject CTXSW error. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param inst [in] - Instance ID. + * + * - Sets values for error info. + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_ctxsw_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 inst); + +/** + * @brief Inject MMU error. 
+ * + * @param g [in] - The GPU driver struct. + * @param hw_unit [in] - Index of HW unit. + * @param err_index [in] - Error index. + * @param sub_err_type [in] - Sub error type. + * + * - Sets values for mmu page fault info. + * - Invokes error reporting API with the required set of inputs. + * + * @return None + */ +void nvgpu_inject_mmu_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type); + +/** + * @brief Initialize error message header. + * + * @param header [in] - Error message header. + * + * This is used to initialize error message header. + * + * @return None + */ +void nvgpu_init_err_msg_header(struct gpu_err_header *header); + +/** + * @brief Initialize error message. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is common + * for all HW units. + * + * @return None + */ +void nvgpu_init_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for HOST unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to HOST unit. + * + * @return None + */ +void nvgpu_init_host_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize ECC error message. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to ECC errors. + * + * @return None + */ +void nvgpu_init_ecc_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for PRI unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to PRI unit. + * + * @return None + */ +void nvgpu_init_pri_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for CE unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to CE unit. 
+ * + * @return None + */ +void nvgpu_init_ce_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for PMU unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to PMU unit. + * + * @return None + */ +void nvgpu_init_pmu_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for GR unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to GR unit. + * + * @return None + */ +void nvgpu_init_gr_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for CTXSW. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to CTXSW. + * + * @return None + */ +void nvgpu_init_ctxsw_err_msg(struct nvgpu_err_msg *msg); + +/** + * @brief Initialize error message for MMU unit. + * + * @param msg [in] - Error message. + * + * This is used to initialize error message that is specific to MMU unit. + * + * @return None + */ +void nvgpu_init_mmu_err_msg(struct nvgpu_err_msg *msg); + +#endif /* CIC_PRIV_H */ diff --git a/drivers/gpu/nvgpu/common/cic/ctxsw_cic.c b/drivers/gpu/nvgpu/common/cic/ctxsw_cic.c new file mode 100644 index 000000000..bb6a75652 --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/ctxsw_cic.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "cic_priv.h" + +void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, + void *data) +{ + int err = 0; + struct nvgpu_err_desc *err_desc = NULL; + struct nvgpu_err_msg err_pkt; + u32 inst = 0; + struct ctxsw_err_info *err_info = (struct ctxsw_err_info *)data; + + if (g->ops.cic.report_err == NULL) { + cic_dbg(g, "CIC does not support reporting error " + "to safety services"); + return; + } + + if (hw_unit != NVGPU_ERR_MODULE_FECS) { + nvgpu_err(g, "invalid hw module (%u)", hw_unit); + err = -EINVAL; + goto handle_report_failure; + } + + err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc); + if (err != 0) { + nvgpu_err(g, "Failed to get err_desc for" + " err_id (%u) for hw module (%u)", + err_id, hw_unit); + goto handle_report_failure; + } + + nvgpu_init_ctxsw_err_msg(&err_pkt); + err_pkt.hw_unit_id = hw_unit; + err_pkt.err_id = err_desc->error_id; + err_pkt.is_critical = err_desc->is_critical; + err_pkt.err_info.ctxsw_info.header.sub_unit_id = inst; + err_pkt.err_info.ctxsw_info.curr_ctx = err_info->curr_ctx; + err_pkt.err_info.ctxsw_info.chid = err_info->chid; + err_pkt.err_info.ctxsw_info.ctxsw_status0 = err_info->ctxsw_status0; + err_pkt.err_info.ctxsw_info.ctxsw_status1 = err_info->ctxsw_status1; + err_pkt.err_info.ctxsw_info.mailbox_value = err_info->mailbox_value; + err_pkt.err_desc = err_desc; + err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( + sizeof(err_pkt.err_info.ctxsw_info)); + + if (g->ops.cic.report_err != NULL) { + err = g->ops.cic.report_err(g, (void *)&err_pkt, + sizeof(err_pkt), err_desc->is_critical); + if (err != 0) { + nvgpu_err(g, "Failed to report CTXSW error: " + "err_id=%u, mailbox_val=%u", + err_id, err_info->mailbox_value); + } + } +handle_report_failure: + if (err != 0) { + nvgpu_sw_quiesce(g); + } +} + +void nvgpu_inject_ctxsw_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 inst) +{ + struct ctxsw_err_info err_info; + + 
(void)memset(&err_info, ERR_INJECT_TEST_PATTERN, sizeof(err_info)); + + nvgpu_report_ctxsw_err(g, hw_unit, err_index, (void *)&err_info); +} diff --git a/drivers/gpu/nvgpu/common/cic/ecc_cic.c b/drivers/gpu/nvgpu/common/cic/ecc_cic.c new file mode 100644 index 000000000..728fc8fe2 --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/ecc_cic.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "cic_priv.h" + +void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, u64 err_addr, u64 err_count) +{ + int err = 0; + struct nvgpu_err_desc *err_desc = NULL; + struct nvgpu_err_msg err_pkt; + + if (g->ops.cic.report_err == NULL) { + cic_dbg(g, "CIC does not support reporting error " + "to safety services"); + return; + } + + err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc); + if (err != 0) { + nvgpu_err(g, "Failed to get err_desc for " + "err_id (%u) for hw module (%u)", + err_id, hw_unit); + goto handle_report_failure; + } + + nvgpu_init_ecc_err_msg(&err_pkt); + err_pkt.hw_unit_id = hw_unit; + err_pkt.err_id = err_desc->error_id; + err_pkt.is_critical = err_desc->is_critical; + err_pkt.err_info.ecc_info.header.sub_unit_id = inst; + err_pkt.err_info.ecc_info.header.address = err_addr; + err_pkt.err_info.ecc_info.err_cnt = err_count; + err_pkt.err_desc = err_desc; + err_pkt.err_size = nvgpu_safe_cast_u64_to_u8( + sizeof(err_pkt.err_info.ecc_info)); + + if (g->ops.cic.report_err != NULL) { + err = g->ops.cic.report_err(g, (void *)&err_pkt, + sizeof(err_pkt), err_desc->is_critical); + if (err != 0) { + nvgpu_err(g, "Failed to report ECC error: hw_unit=%u, inst=%u, " + "err_id=%u, err_addr=%llu, err_count=%llu", + hw_unit, inst, err_id, err_addr, err_count); + } + } +handle_report_failure: + if (err != 0) { + nvgpu_sw_quiesce(g); + } +} + +void nvgpu_inject_ecc_swerror(struct gk20a *g, u32 hw_unit, u32 err_index, + u32 inst) +{ + u64 err_addr, err_count; + + err_addr = (u64)ERR_INJECT_TEST_PATTERN; + err_count = (u64)ERR_INJECT_TEST_PATTERN; + + nvgpu_report_ecc_err(g, hw_unit, inst, err_index, err_addr, err_count); +} diff --git a/drivers/gpu/nvgpu/common/cic/gr_cic.c b/drivers/gpu/nvgpu/common/cic/gr_cic.c new file mode 100644 index 000000000..87269f6a9 --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/gr_cic.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2021, NVIDIA 
CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "cic_priv.h" + +static void nvpgu_report_fill_err_info(u32 hw_unit, + struct nvgpu_err_msg *err_pkt, struct gr_err_info *err_info) +{ + if (hw_unit == NVGPU_ERR_MODULE_SM) { + struct gr_sm_mcerr_info *info = err_info->sm_mcerr_info; + + err_pkt->err_info.sm_info.warp_esr_pc = + info->hww_warp_esr_pc; + err_pkt->err_info.sm_info.warp_esr_status = + info->hww_warp_esr_status; + err_pkt->err_info.sm_info.curr_ctx = + info->curr_ctx; + err_pkt->err_info.sm_info.chid = + info->chid; + err_pkt->err_info.sm_info.tsgid = + info->tsgid; + err_pkt->err_info.sm_info.gpc = + info->gpc; + err_pkt->err_info.sm_info.tpc = + info->tpc; + err_pkt->err_info.sm_info.sm = + info->sm; + } else { + struct gr_exception_info *info = err_info->exception_info; + + err_pkt->err_info.gr_info.curr_ctx = info->curr_ctx; + err_pkt->err_info.gr_info.chid = info->chid; + err_pkt->err_info.gr_info.tsgid = info->tsgid; + err_pkt->err_info.gr_info.status = info->status; + } +} + +void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, + u32 err_id, struct gr_err_info *err_info, u32 sub_err_type) +{ + int err = 0; + struct nvgpu_err_desc *err_desc = NULL; + struct nvgpu_err_msg err_pkt; + + if (g->ops.cic.report_err == NULL) { + cic_dbg(g, "CIC does not support reporting error " + "to safety services"); + return; + } + + if ((hw_unit != NVGPU_ERR_MODULE_SM) && + (hw_unit != NVGPU_ERR_MODULE_PGRAPH)) { + nvgpu_err(g, "invalid hw module (%u)", hw_unit); + err = -EINVAL; + goto handle_report_failure; + } + + err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc); + if (err != 0) { + nvgpu_err(g, "Failed to get err_desc for " + "err_id (%u) for hw module (%u)", + err_id, hw_unit); + goto handle_report_failure; + } + + nvgpu_init_gr_err_msg(&err_pkt); + err_pkt.hw_unit_id = hw_unit; + err_pkt.err_id = err_desc->error_id; + err_pkt.is_critical = err_desc->is_critical; + err_pkt.err_desc = err_desc; + 
err_pkt.err_info.gr_info.header.sub_err_type = sub_err_type; + err_pkt.err_info.gr_info.header.sub_unit_id = inst; + nvpgu_report_fill_err_info(hw_unit, &err_pkt, err_info); + err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(sizeof(err_pkt.err_info)); + + if (g->ops.cic.report_err != NULL) { + err = g->ops.cic.report_err(g, (void *)&err_pkt, + sizeof(err_pkt), err_desc->is_critical); + if (err != 0) { + if (hw_unit == NVGPU_ERR_MODULE_SM) { + nvgpu_err(g, "Failed to report SM exception" + "gpc=%u, tpc=%u, sm=%u, esr_status=%x", + err_pkt.err_info.sm_info.gpc, + err_pkt.err_info.sm_info.tpc, + err_pkt.err_info.sm_info.sm, + err_pkt.err_info.sm_info.warp_esr_status); + } + if (hw_unit == NVGPU_ERR_MODULE_PGRAPH) { + nvgpu_err(g, "Failed to report PGRAPH" + "exception: inst=%u, err_id=%u, " + "status=%u", inst, err_id, + err_pkt.err_info.gr_info.status); + } + } + } +handle_report_failure: + if (err != 0) { + nvgpu_sw_quiesce(g); + } +} + +void nvgpu_inject_gr_swerror(struct gk20a *g, u32 hw_unit, + u32 err_index, u32 sub_err_type) +{ + struct gr_err_info err_info; + struct gr_exception_info gr_error_info; + struct gr_sm_mcerr_info sm_error_info; + int err = 0; + u32 inst = 0U; + + /* + * Fill fixed test pattern data for the error message + * payload. 
+ */ + (void)memset(&gr_error_info, ERR_INJECT_TEST_PATTERN, sizeof(gr_error_info)); + (void)memset(&sm_error_info, ERR_INJECT_TEST_PATTERN, sizeof(sm_error_info)); + + switch (hw_unit) { + case NVGPU_ERR_MODULE_PGRAPH: + { + err_info.exception_info = &gr_error_info; + } + break; + + case NVGPU_ERR_MODULE_SM: + { + err_info.sm_mcerr_info = &sm_error_info; + } + break; + + default: + { + nvgpu_err(g, "unsupported hw_unit(%u)", hw_unit); + err = -EINVAL; + } + break; + } + if (err != 0) { + return; + } + + nvgpu_report_gr_err(g, hw_unit, inst, err_index, + &err_info, sub_err_type); +} diff --git a/drivers/gpu/nvgpu/common/cic/host_cic.c b/drivers/gpu/nvgpu/common/cic/host_cic.c new file mode 100644 index 000000000..44a64177a --- /dev/null +++ b/drivers/gpu/nvgpu/common/cic/host_cic.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cic_priv.h"
+/**
+ * @brief Report a HOST error through the CIC report_err HAL.
+ *
+ * Returns after a debug log, without reporting anything, when
+ * g->ops.cic.report_err is NULL. Otherwise the function validates hw_unit,
+ * looks up the error descriptor with nvgpu_cic_get_err_desc(), fills a HOST
+ * error packet and passes it to g->ops.cic.report_err(). An invalid hw_unit,
+ * a failed descriptor lookup or a non-zero return from report_err() all end
+ * in nvgpu_sw_quiesce().
+ *
+ * @param g         Pointer to the struct gk20a whose ops.cic HAL is used.
+ * @param hw_unit   Must be NVGPU_ERR_MODULE_HOST; any other value is
+ *                  treated as -EINVAL.
+ * @param inst      Stored in the packet header as sub_unit_id.
+ * @param err_id    Error index used for the descriptor lookup.
+ * @param intr_info Stored in the packet header as sub_err_type.
+ */
+void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
+		u32 inst, u32 err_id, u32 intr_info)
+{
+	int err = 0;
+	struct nvgpu_err_desc *err_desc = NULL;
+	struct nvgpu_err_msg err_pkt;
+
+	if (g->ops.cic.report_err == NULL) {
+		cic_dbg(g, "CIC does not support reporting error "
+			"to safety services");
+		return;
+	}
+
+	if (hw_unit != NVGPU_ERR_MODULE_HOST) {
+		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+		err = -EINVAL;
+		goto handle_report_failure;
+	}
+
+	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
+	if (err != 0) {
+		nvgpu_err(g, "Failed to get err_desc for "
+				"err_id (%u) for hw module (%u)",
+				err_id, hw_unit);
+		goto handle_report_failure;
+	}
+
+	nvgpu_init_host_err_msg(&err_pkt);
+	err_pkt.hw_unit_id = hw_unit;
+	err_pkt.err_id = err_desc->error_id;
+	err_pkt.is_critical = err_desc->is_critical;
+	err_pkt.err_info.host_info.header.sub_unit_id = inst;
+	err_pkt.err_desc = err_desc;
+	/* sub_err_type can be decoded using intr_info by referring
+	 * to the interrupt status register definition corresponding
+	 * to the error that is being reported.
+	 */
+	err_pkt.err_info.host_info.header.sub_err_type = intr_info;
+	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
+			sizeof(err_pkt.err_info.host_info));
+
+	if (g->ops.cic.report_err != NULL) {
+		err = g->ops.cic.report_err(g, (void *)&err_pkt,
+			sizeof(err_pkt), err_desc->is_critical);
+		if (err != 0) {
+			nvgpu_err(g, "Failed to report HOST error: "
+					"inst=%u, err_id=%u, intr_info=%u",
+					inst, err_id, intr_info);
+		}
+	}
+handle_report_failure:
+	if (err != 0) {
+		nvgpu_sw_quiesce(g);
+	}
+}
+/**
+ * @brief Software error-injection entry point for HOST errors.
+ *
+ * Forwards to nvgpu_report_host_err() with instance 0U, err_index as the
+ * error id and sub_err_type as the interrupt info.
+ */
+void nvgpu_inject_host_swerror(struct gk20a *g, u32 hw_unit,
+		u32 err_index, u32 sub_err_type)
+{
+	nvgpu_report_host_err(g, hw_unit, 0U, err_index, sub_err_type);
+}
diff --git a/drivers/gpu/nvgpu/common/cic/mmu_cic.c b/drivers/gpu/nvgpu/common/cic/mmu_cic.c
new file mode 100644
index 000000000..d832e630c
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/cic/mmu_cic.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cic_priv.h"
+/**
+ * @brief Report a HUBMMU error through the CIC report_err HAL.
+ *
+ * Returns after a debug log, without reporting anything, when
+ * g->ops.cic.report_err is NULL. Otherwise the function validates hw_unit,
+ * looks up the error descriptor with nvgpu_cic_get_err_desc(), fills an MMU
+ * error packet and passes it to g->ops.cic.report_err(). An invalid hw_unit,
+ * a failed descriptor lookup or a non-zero return from report_err() all end
+ * in nvgpu_sw_quiesce().
+ *
+ * @param g            Pointer to the struct gk20a whose ops.cic HAL is used.
+ * @param hw_unit      Must be NVGPU_ERR_MODULE_HUBMMU; any other value is
+ *                     treated as -EINVAL.
+ * @param err_id       Error index used for the descriptor lookup.
+ * @param fault_info   Optional fault details. When not NULL its fields are
+ *                     copied one by one into the packet; when NULL the packet
+ *                     keeps the values set by nvgpu_init_mmu_err_msg().
+ * @param status       Stored in the packet as mmu_info.status.
+ * @param sub_err_type Stored in the packet header as sub_err_type.
+ */
+void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
+		struct mmu_fault_info *fault_info, u32 status, u32 sub_err_type)
+{
+	int err = 0;
+	struct nvgpu_err_desc *err_desc = NULL;
+	struct nvgpu_err_msg err_pkt;
+
+	if (g->ops.cic.report_err == NULL) {
+		cic_dbg(g, "CIC does not support reporting error "
+			"to safety services");
+		return;
+	}
+
+	if (hw_unit != NVGPU_ERR_MODULE_HUBMMU) {
+		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+		err = -EINVAL;
+		goto handle_report_failure;
+	}
+
+	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
+	if (err != 0) {
+		nvgpu_err(g, "Failed to get err_desc for "
+				"err_id (%u) for hw module (%u)",
+				err_id, hw_unit);
+		goto handle_report_failure;
+	}
+
+	nvgpu_init_mmu_err_msg(&err_pkt);
+	err_pkt.hw_unit_id = hw_unit;
+	err_pkt.err_id = err_desc->error_id;
+	err_pkt.is_critical = err_desc->is_critical;
+	err_pkt.err_info.mmu_info.header.sub_err_type = sub_err_type;
+	err_pkt.err_info.mmu_info.status = status;
+	/* Copy contents of mmu_fault_info */
+	if (fault_info != NULL) {
+		err_pkt.err_info.mmu_info.info.inst_ptr = fault_info->inst_ptr;
+		err_pkt.err_info.mmu_info.info.inst_aperture
+			= fault_info->inst_aperture;
+		err_pkt.err_info.mmu_info.info.fault_addr
+			= fault_info->fault_addr;
+		err_pkt.err_info.mmu_info.info.fault_addr_aperture
+			= fault_info->fault_addr_aperture;
+		err_pkt.err_info.mmu_info.info.timestamp_lo
+			= fault_info->timestamp_lo;
+		err_pkt.err_info.mmu_info.info.timestamp_hi
+			= fault_info->timestamp_hi;
+		err_pkt.err_info.mmu_info.info.mmu_engine_id
+			= fault_info->mmu_engine_id;
+		err_pkt.err_info.mmu_info.info.gpc_id = fault_info->gpc_id;
+		err_pkt.err_info.mmu_info.info.client_type
+			= fault_info->client_type;
+		err_pkt.err_info.mmu_info.info.client_id
+			= fault_info->client_id;
+		err_pkt.err_info.mmu_info.info.fault_type
+			= fault_info->fault_type;
+		err_pkt.err_info.mmu_info.info.access_type
+			= fault_info->access_type;
+		err_pkt.err_info.mmu_info.info.protected_mode
+			= fault_info->protected_mode;
+		err_pkt.err_info.mmu_info.info.replayable_fault
+			= fault_info->replayable_fault;
+		err_pkt.err_info.mmu_info.info.replay_fault_en
+			= fault_info->replay_fault_en;
+		err_pkt.err_info.mmu_info.info.valid = fault_info->valid;
+		err_pkt.err_info.mmu_info.info.faulted_pbdma =
+			fault_info->faulted_pbdma;
+		err_pkt.err_info.mmu_info.info.faulted_engine =
+			fault_info->faulted_engine;
+		err_pkt.err_info.mmu_info.info.faulted_subid =
+			fault_info->faulted_subid;
+		err_pkt.err_info.mmu_info.info.chid = fault_info->chid;
+	}
+	err_pkt.err_desc = err_desc;
+	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
+			sizeof(err_pkt.err_info.mmu_info));
+
+	if (g->ops.cic.report_err != NULL) {
+		err = g->ops.cic.report_err(g, (void *)&err_pkt,
+			sizeof(err_pkt), err_desc->is_critical);
+		if (err != 0) {
+			nvgpu_err(g, "Failed to report MMU fault: hw_unit=%u, "
+				"err_id=%u, sub_err_type=%u, status=%u",
+				hw_unit, err_id, sub_err_type, status);
+		}
+	}
+handle_report_failure:
+	if (err != 0) {
+		nvgpu_sw_quiesce(g);
+	}
+}
+/**
+ * @brief Software error-injection entry point for MMU errors.
+ *
+ * Fills a local struct mmu_fault_info with ERR_INJECT_TEST_PATTERN bytes
+ * (memset) and forwards it to nvgpu_report_mmu_err() with status 0U.
+ */
+void nvgpu_inject_mmu_swerror(struct gk20a *g, u32 hw_unit, u32 err_index,
+		u32 sub_err_type)
+{
+	u32 status = 0U;
+	struct mmu_fault_info fault_info;
+
+	(void) memset(&fault_info, ERR_INJECT_TEST_PATTERN, sizeof(fault_info));
+	nvgpu_report_mmu_err(g, hw_unit, err_index,
+			&fault_info, status, sub_err_type);
+}
diff --git a/drivers/gpu/nvgpu/common/cic/msg_cic.c b/drivers/gpu/nvgpu/common/cic/msg_cic.c
new file mode 100644
index 000000000..ffb6c153d
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/cic/msg_cic.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include
+#include
+
+#include "cic_priv.h"
+/**
+ * @brief Reset an error-message header.
+ *
+ * Sets the version to 1.0 and clears sub_err_type, sub_unit_id, address and
+ * timestamp_ns.
+ */
+void nvgpu_init_err_msg_header(struct gpu_err_header *header)
+{
+	header->version.major = (u16)1U;
+	header->version.minor = (u16)0U;
+	header->sub_err_type = 0U;
+	header->sub_unit_id = 0UL;
+	header->address = 0UL;
+	header->timestamp_ns = 0UL;
+}
+/**
+ * @brief Zero a whole struct nvgpu_err_msg.
+ *
+ * The memset clears every byte; the assignments that follow restate the
+ * zero/false defaults of the top-level fields explicitly.
+ */
+void nvgpu_init_err_msg(struct nvgpu_err_msg *msg)
+{
+	(void) memset(msg, 0, sizeof(struct nvgpu_err_msg));
+	msg->hw_unit_id = 0U;
+	msg->is_critical = false;
+	msg->err_id = (u8)0U;
+	msg->err_size = (u8)0U;
+}
+/**
+ * @brief Prepare a message for a HOST error: zero it, then initialize the
+ * header of err_info.host_info.
+ */
+void nvgpu_init_host_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.host_info.header);
+}
+/**
+ * @brief Prepare a message for an ECC error: zero it, initialize the header
+ * of err_info.ecc_info and clear err_cnt.
+ */
+void nvgpu_init_ecc_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.ecc_info.header);
+	msg->err_info.ecc_info.err_cnt = 0UL;
+}
+/**
+ * @brief Prepare a message for a PRI error: zero it, then initialize the
+ * header of err_info.pri_info.
+ */
+void nvgpu_init_pri_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.pri_info.header);
+}
+/**
+ * @brief Prepare a message for a CE error: zero it, then initialize the
+ * header of err_info.ce_info.
+ */
+void nvgpu_init_ce_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.ce_info.header);
+}
+/**
+ * @brief Prepare a message for a PMU error: zero it, initialize the header
+ * of err_info.pmu_err_info and clear status.
+ */
+void nvgpu_init_pmu_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.pmu_err_info.header);
+	msg->err_info.pmu_err_info.status = 0U;
+}
+/**
+ * @brief Prepare a message for a GR error: zero it, initialize the header
+ * of err_info.gr_info and clear curr_ctx, chid, tsgid and status.
+ */
+void nvgpu_init_gr_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.gr_info.header);
+	msg->err_info.gr_info.curr_ctx = 0U;
+	msg->err_info.gr_info.chid = 0U;
+	msg->err_info.gr_info.tsgid = 0U;
+	msg->err_info.gr_info.status = 0U;
+}
+/**
+ * @brief Prepare a message for a CTXSW error: zero it, initialize the header
+ * of err_info.ctxsw_info and clear the context, status and mailbox fields.
+ */
+void nvgpu_init_ctxsw_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.ctxsw_info.header);
+	msg->err_info.ctxsw_info.curr_ctx = 0U;
+	msg->err_info.ctxsw_info.tsgid = 0U;
+	msg->err_info.ctxsw_info.chid = 0U;
+	msg->err_info.ctxsw_info.ctxsw_status0 = 0U;
+	msg->err_info.ctxsw_info.ctxsw_status1 = 0U;
+	msg->err_info.ctxsw_info.mailbox_value = 0U;
+}
+/**
+ * @brief Prepare a message for an MMU error: zero it, initialize the header
+ * of err_info.mmu_info and clear every fault-info field and status.
+ */
+void nvgpu_init_mmu_err_msg(struct nvgpu_err_msg *msg)
+{
+	nvgpu_init_err_msg(msg);
+	nvgpu_init_err_msg_header(&msg->err_info.mmu_info.header);
+	msg->err_info.mmu_info.info.inst_ptr = 0UL;
+	msg->err_info.mmu_info.info.inst_aperture = 0U;
+	msg->err_info.mmu_info.info.fault_addr = 0UL;
+	msg->err_info.mmu_info.info.fault_addr_aperture = 0U;
+	msg->err_info.mmu_info.info.timestamp_lo = 0U;
+	msg->err_info.mmu_info.info.timestamp_hi = 0U;
+	msg->err_info.mmu_info.info.mmu_engine_id = 0U;
+	msg->err_info.mmu_info.info.gpc_id = 0U;
+	msg->err_info.mmu_info.info.client_type = 0U;
+	msg->err_info.mmu_info.info.client_id = 0U;
+	msg->err_info.mmu_info.info.fault_type = 0U;
+	msg->err_info.mmu_info.info.access_type = 0U;
+	msg->err_info.mmu_info.info.protected_mode = 0U;
+	msg->err_info.mmu_info.info.replayable_fault = false;
+	msg->err_info.mmu_info.info.replay_fault_en = 0U;
+	msg->err_info.mmu_info.info.valid = false;
+	msg->err_info.mmu_info.info.faulted_pbdma = 0U;
+	msg->err_info.mmu_info.info.faulted_engine = 0U;
+	msg->err_info.mmu_info.info.faulted_subid = 0U;
+	msg->err_info.mmu_info.info.chid = 0U;
+	msg->err_info.mmu_info.status = 0U;
+}
diff --git a/drivers/gpu/nvgpu/common/cic/pmu_cic.c b/drivers/gpu/nvgpu/common/cic/pmu_cic.c
new file mode 100644
index 000000000..167c84d1d
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/cic/pmu_cic.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cic_priv.h"
+/**
+ * @brief Report a PMU error through the CIC report_err HAL.
+ *
+ * Returns after a debug log, without reporting anything, when
+ * g->ops.cic.report_err is NULL. Otherwise the function validates hw_unit,
+ * looks up the error descriptor with nvgpu_cic_get_err_desc(), fills a PMU
+ * error packet and passes it to g->ops.cic.report_err(). An invalid hw_unit,
+ * a failed descriptor lookup or a non-zero return from report_err() all end
+ * in nvgpu_sw_quiesce().
+ *
+ * @param g            Pointer to the struct gk20a whose ops.cic HAL is used.
+ * @param hw_unit      Must be NVGPU_ERR_MODULE_PMU; any other value is
+ *                     treated as -EINVAL.
+ * @param err_id       Error index used for the descriptor lookup.
+ * @param sub_err_type Stored in the packet header as sub_err_type.
+ * @param status       Stored in the packet as pmu_err_info.status.
+ */
+void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
+	u32 sub_err_type, u32 status)
+{
+	int err = 0;
+	struct nvgpu_err_desc *err_desc = NULL;
+	struct nvgpu_err_msg err_pkt;
+
+	if (g->ops.cic.report_err == NULL) {
+		cic_dbg(g, "CIC does not support reporting error "
+			"to safety services");
+		return;
+	}
+
+	if (hw_unit != NVGPU_ERR_MODULE_PMU) {
+		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+		err = -EINVAL;
+		goto handle_report_failure;
+	}
+
+	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
+	if (err != 0) {
+		nvgpu_err(g, "Failed to get err_desc for "
+				"err_id (%u) for hw module (%u)",
+				err_id, hw_unit);
+		goto handle_report_failure;
+	}
+
+	nvgpu_init_pmu_err_msg(&err_pkt);
+	err_pkt.hw_unit_id = hw_unit;
+	err_pkt.err_id = err_desc->error_id;
+	err_pkt.is_critical = err_desc->is_critical;
+	err_pkt.err_info.pmu_err_info.status = status;
+	err_pkt.err_info.pmu_err_info.header.sub_err_type = sub_err_type;
+	err_pkt.err_desc = err_desc;
+	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
+			sizeof(err_pkt.err_info.pmu_err_info));
+
+	if (g->ops.cic.report_err != NULL) {
+		err = g->ops.cic.report_err(g, (void *)&err_pkt,
+			sizeof(err_pkt), err_desc->is_critical);
+		if (err != 0) {
+			nvgpu_err(g, "Failed to report PMU error: "
+					"err_id=%u, sub_err_type=%u, status=%u",
+					err_id, sub_err_type, status);
+		}
+	}
+handle_report_failure:
+	if (err != 0) {
+		nvgpu_sw_quiesce(g);
+	}
+}
+/**
+ * @brief Software error-injection entry point for PMU errors.
+ *
+ * Forwards to nvgpu_report_pmu_err() with (u32)ERR_INJECT_TEST_PATTERN as
+ * the status value.
+ */
+void nvgpu_inject_pmu_swerror(struct gk20a *g, u32 hw_unit,
+		u32 err_index, u32 sub_err_type)
+{
+	u32 err_info;
+
+	err_info = (u32)ERR_INJECT_TEST_PATTERN;
+
+	nvgpu_report_pmu_err(g, hw_unit, err_index, sub_err_type, err_info);
+}
diff --git a/drivers/gpu/nvgpu/common/cic/pri_cic.c b/drivers/gpu/nvgpu/common/cic/pri_cic.c
new file mode 100644
index 000000000..e5efd9c49
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/cic/pri_cic.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c)
2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cic_priv.h"
+/**
+ * @brief Report a PRI error through the CIC report_err HAL.
+ *
+ * Returns after a debug log, without reporting anything, when
+ * g->ops.cic.report_err is NULL. Otherwise the function validates hw_unit,
+ * looks up the error descriptor with nvgpu_cic_get_err_desc(), fills a PRI
+ * error packet and passes it to g->ops.cic.report_err(). An invalid hw_unit,
+ * a failed descriptor lookup or a non-zero return from report_err() all end
+ * in nvgpu_sw_quiesce().
+ *
+ * @param g        Pointer to the struct gk20a whose ops.cic HAL is used.
+ * @param hw_unit  Must be NVGPU_ERR_MODULE_PRI; any other value is treated
+ *                 as -EINVAL.
+ * @param inst     Stored in the packet header as sub_unit_id.
+ * @param err_id   Error index used for the descriptor lookup.
+ * @param err_addr Widened to u64 and stored in the packet header as address.
+ * @param err_code Stored in the packet header as sub_err_type.
+ */
+void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
+		u32 err_id, u32 err_addr, u32 err_code)
+{
+	int err = 0;
+	struct nvgpu_err_desc *err_desc = NULL;
+	struct nvgpu_err_msg err_pkt;
+
+	if (g->ops.cic.report_err == NULL) {
+		cic_dbg(g, "CIC does not support reporting error "
+			"to safety services");
+		return;
+	}
+
+	if (hw_unit != NVGPU_ERR_MODULE_PRI) {
+		nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+		err = -EINVAL;
+		goto handle_report_failure;
+	}
+
+	err = nvgpu_cic_get_err_desc(g, hw_unit, err_id, &err_desc);
+	if (err != 0) {
+		nvgpu_err(g, "Failed to get err_desc for "
+				"err_id (%u) for hw module (%u)",
+				err_id, hw_unit);
+		goto handle_report_failure;
+	}
+
+	nvgpu_init_pri_err_msg(&err_pkt);
+	err_pkt.hw_unit_id = hw_unit;
+	err_pkt.err_id = err_desc->error_id;
+	err_pkt.is_critical = err_desc->is_critical;
+	err_pkt.err_info.pri_info.header.sub_unit_id = inst;
+	err_pkt.err_info.pri_info.header.address = (u64) err_addr;
+	err_pkt.err_desc = err_desc;
+	/* sub_err_type can be decoded using err_code by referring
+	 * to the FECS pri error codes.
+	 */
+	err_pkt.err_info.pri_info.header.sub_err_type = err_code;
+	err_pkt.err_size = nvgpu_safe_cast_u64_to_u8(
+			sizeof(err_pkt.err_info.pri_info));
+
+	if (g->ops.cic.report_err != NULL) {
+		err = g->ops.cic.report_err(g, (void *)&err_pkt,
+			sizeof(err_pkt), err_desc->is_critical);
+		if (err != 0) {
+			nvgpu_err(g, "Failed to report PRI error: "
+					"inst=%u, err_id=%u, err_code=%u",
+					inst, err_id, err_code);
+		}
+	}
+handle_report_failure:
+	if (err != 0) {
+		nvgpu_sw_quiesce(g);
+	}
+}
+/**
+ * @brief Software error-injection entry point for PRI errors.
+ *
+ * Forwards to nvgpu_report_pri_err() with instance 0U and error address 0U.
+ */
+void nvgpu_inject_pri_swerror(struct gk20a *g, u32 hw_unit,
+		u32 err_index, u32 err_code)
+{
+	nvgpu_report_pri_err(g, hw_unit, 0U, err_index, 0U, err_code);
+}
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index edc8af946..943c1a832 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -46,6 +46,7 @@
 #ifdef CONFIG_NVGPU_NON_FUSA
 #include
 #endif
+#include
 
 #ifdef CONFIG_NVGPU_LS_PMU
 #include
@@ -357,6 +358,12 @@ int nvgpu_prepare_poweroff(struct gk20a *g)
 #endif
 
 	gk20a_mask_interrupts(g);
 
+	/* Disable CIC after the interrupts are masked;
+	 * This will ensure that CIC will not get probed
+	 * after its deinit.
+	 */
+	nvgpu_cic_deinit_common(g);
+
 	return ret;
 }
@@ -716,6 +723,14 @@ int nvgpu_early_poweron(struct gk20a *g)
 		goto done;
 	}
 
+	/* Initialize CIC early on before the interrupts are
+	 * enabled.
+	 */
+	err = nvgpu_cic_init_common(g);
+	if (err != 0) {
+		nvgpu_err(g, "CIC Initialization failed[%d]", err);
+		goto done;
+	}
 done:
 	return err;
 }
diff --git a/drivers/gpu/nvgpu/hal/cic/cic_gv11b.h b/drivers/gpu/nvgpu/hal/cic/cic_gv11b.h
new file mode 100644
index 000000000..131faf8f0
--- /dev/null
+++ b/drivers/gpu/nvgpu/hal/cic/cic_gv11b.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CIC_GV11B_H
+#define CIC_GV11B_H
+
+#include
+
+struct gk20a;
+struct nvgpu_cic;
+
+extern struct nvgpu_err_hw_module gv11b_err_lut[];
+extern u32 size_of_gv11b_lut;
+/**
+ * @brief Attach the gv11b error look-up table to a CIC instance.
+ *
+ * Sets cic->err_lut to gv11b_err_lut and cic->num_hw_modules to
+ * size_of_gv11b_lut.
+ *
+ * @return 0 on success; -EINVAL (after logging an error) when cic is NULL.
+ */
+int gv11b_cic_init(struct gk20a *g, struct nvgpu_cic *cic);
+
+#endif /* CIC_GV11B_H */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_cic.h b/drivers/gpu/nvgpu/hal/cic/cic_gv11b_fusa.c
similarity index 77%
rename from drivers/gpu/nvgpu/include/nvgpu/nvgpu_cic.h
rename to drivers/gpu/nvgpu/hal/cic/cic_gv11b_fusa.c
index 886554e04..9d1608736 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_cic.h
+++ b/drivers/gpu/nvgpu/hal/cic/cic_gv11b_fusa.c
@@ -20,23 +20,19 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#ifndef NVGPU_CIC_H
-#define NVGPU_CIC_H
-
 #include
 
-/**
- * @file
- *
- * Public structs and APIs exposed by Central Interrupt Controller
- * (CIC) unit.
- */
+#include "common/cic/cic_priv.h"
+#include "cic_gv11b.h"
 
-/*
- * Requires a string literal for the format - notice the string
- * concatination.
- */
-#define cic_dbg(g, fmt, args...) \
-	nvgpu_log((g), gpu_dbg_cic, "CIC | " fmt, ##args)
+int gv11b_cic_init(struct gk20a *g, struct nvgpu_cic *cic)
+{
+	if (cic == NULL) {
+		nvgpu_err(g, "Invalid CIC reference pointer.");
+		return -EINVAL;
+	}
 
-#endif /* NVGPU_CIC_H */
+	cic->err_lut = gv11b_err_lut;
+	cic->num_hw_modules = size_of_gv11b_lut;
+	return 0;
+}
diff --git a/drivers/gpu/nvgpu/hal/cic/cic_lut_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/cic/cic_lut_gv11b_fusa.c
new file mode 100644
index 000000000..20607c801
--- /dev/null
+++ b/drivers/gpu/nvgpu/hal/cic/cic_lut_gv11b_fusa.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include "common/cic/cic_priv.h"
+#include "cic_gv11b.h"
+
+/*
+ * A flag to enable/disable hw error injection.
+ */
+#ifdef CONFIG_NVGPU_INJECT_HWERR
+#define INJECT_TYPE (INJECT_HW)
+#else
+#define INJECT_TYPE (INJECT_SW)
+#endif
+
+/* This look-up table initializes the list of hw units and their errors.
+ * It also specifies the error injection mechanism supported, for each error.
+ * In case of hw error injection support, this initialization will be
+ * overridden by the values provided from the hal layers of corresponding
+ * hw units.
+ *
+ * NOTE(review): every .num_errs below is a hand-maintained literal. Each one
+ * currently equals the number of entries in the matching .errs list and
+ * presumably has to; confirm against the code that indexes .errs before
+ * adding or removing entries.
+ */
+struct nvgpu_err_hw_module gv11b_err_lut[] = {
+	{
+		.name = "host",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_HOST,
+		.num_instances = 1U,
+		.num_errs = 17U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_CRITERR("pfifo_bind_error",
+					GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pfifo_sched_error",
+					GPU_HOST_PFIFO_SCHED_ERROR, INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pfifo_chsw_error",
+					GPU_HOST_PFIFO_CHSW_ERROR, INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pfifo_memop_error",
+					GPU_HOST_PFIFO_MEMOP_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pfifo_lb_error",
+					GPU_HOST_PFIFO_LB_ERROR, INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbus_squash_error",
+					GPU_HOST_PBUS_SQUASH_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbus_fecs_error",
+					GPU_HOST_PBUS_FECS_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbus_timeout_error",
+					GPU_HOST_PBUS_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_timeout_error",
+					GPU_HOST_PBDMA_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_extra_error",
+					GPU_HOST_PBDMA_EXTRA_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_gpfifo_pb_error",
+					GPU_HOST_PBDMA_GPFIFO_PB_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_method_error",
+					GPU_HOST_PBDMA_METHOD_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_signature_error",
+					GPU_HOST_PBDMA_SIGNATURE_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_hce_error",
+					GPU_HOST_PBDMA_HCE_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pbdma_preempt_error",
+					GPU_HOST_PBDMA_PREEMPT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("pfifo_ctxsw_timeout",
+					GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pfifo_fb_flush_timeout",
+					GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_host_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "sm",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
+		.num_instances = 8U,
+		.num_errs = 21U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("l1_tag_ecc_corrected",
+					GPU_SM_L1_TAG_ECC_CORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1_tag_ecc_uncorrected",
+					GPU_SM_L1_TAG_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("cbu_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("cbu_ecc_uncorrected",
+					GPU_SM_CBU_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("lrf_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("lrf_ecc_uncorrected",
+					GPU_SM_LRF_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("l1_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1_data_ecc_uncorrected",
+					GPU_SM_L1_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("icache_l0_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("icache_l0_data_ecc_uncorrected",
+					GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("icache_l1_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("icache_l1_data_ecc_uncorrected",
+					GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("icache_l0_predecode_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
+					GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
+					GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
+					GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("machine_check_error",
+					GPU_SM_MACHINE_CHECK_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("icache_l1_predecode_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
+					GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "fecs",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
+		.num_instances = 1U,
+		.num_errs = 8U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("falcon_imem_ecc_corrected",
+					GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_imem_ecc_uncorrected",
+					GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("falcon_dmem_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+					GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ctxsw_watchdog_timeout",
+					GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
+					INJECT_SW,
+					NULL, nvgpu_inject_ctxsw_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ctxsw_crc_mismatch",
+					GPU_FECS_CTXSW_CRC_MISMATCH,
+					INJECT_SW,
+					NULL, nvgpu_inject_ctxsw_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("fault_during_ctxsw",
+					GPU_FECS_FAULT_DURING_CTXSW,
+					INJECT_SW,
+					NULL, nvgpu_inject_ctxsw_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ctxsw_init_error",
+					GPU_FECS_CTXSW_INIT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_ctxsw_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "gpccs",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS,
+		.num_instances = 1U,
+		.num_errs = 4U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("falcon_imem_ecc_corrected",
+					GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_imem_ecc_uncorrected",
+					GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("falcon_dmem_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+					GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "mmu",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_MMU,
+		.num_instances = 1U,
+		.num_errs = 4U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("l1tlb_sa_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected",
+					GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("l1tlb_fa_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected",
+					GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "gcc",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_GCC,
+		.num_instances = 1U,
+		.num_errs = 2U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("l15_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("l15_ecc_uncorrected",
+					GPU_GCC_L15_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "pmu",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
+		.num_instances = 1U,
+		.num_errs = 5U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("falcon_imem_ecc_corrected",
+					GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_imem_ecc_uncorrected",
+					GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("falcon_dmem_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+					GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("bar0_error_timeout",
+					GPU_PMU_BAR0_ERROR_TIMEOUT, INJECT_SW,
+					NULL, nvgpu_inject_pmu_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "pgraph",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_PGRAPH,
+		.num_instances = 1U,
+		.num_errs = 12U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_CRITERR("fe_exception",
+					GPU_PGRAPH_FE_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("memfmt_exception",
+					GPU_PGRAPH_MEMFMT_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pd_exception",
+					GPU_PGRAPH_PD_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("scc_exception",
+					GPU_PGRAPH_SCC_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ds_exception",
+					GPU_PGRAPH_DS_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ssync_exception",
+					GPU_PGRAPH_SSYNC_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("mme_exception",
+					GPU_PGRAPH_MME_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("sked_exception",
+					GPU_PGRAPH_SKED_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("be_exception",
+					GPU_PGRAPH_BE_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("mpc_exception",
+					GPU_PGRAPH_MPC_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("illegal_error",
+					GPU_PGRAPH_ILLEGAL_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("gpc_gfx_exception",
+					GPU_PGRAPH_GPC_GFX_EXCEPTION,
+					INJECT_SW,
+					NULL, nvgpu_inject_gr_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "ltc",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_LTC,
+		.num_instances = 1U,
+		.num_errs = 8U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("cache_dstg_ecc_corrected",
+					GPU_LTC_CACHE_DSTG_ECC_CORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("cache_dstg_ecc_uncorrected",
+					GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("cache_tstg_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("cache_tstg_ecc_uncorrected",
+					GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("cache_rstg_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("cache_rstg_ecc_uncorrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("cache_dstg_be_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("cache_dstg_be_ecc_uncorrected",
+					GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "hubmmu",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_HUBMMU,
+		.num_instances = 1U,
+		.num_errs = 9U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_NONCRITERR("hubmmu_l2tlb_sa_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected",
+					GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("hubmmu_tlb_sa_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected",
+					GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("hubmmu_pte_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected",
+					GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
+					INJECT_TYPE,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("hubmmu_pde0_data_ecc_corrected",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected",
+					GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
+					INJECT_SW,
+					NULL, nvgpu_inject_ecc_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("hubmmu_page_fault_error",
+					GPU_HUBMMU_PAGE_FAULT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_mmu_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "pri",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_PRI,
+		.num_instances = 1U,
+		.num_errs = 2U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_CRITERR("pri_timeout_error",
+					GPU_PRI_TIMEOUT_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_pri_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("pri_access_violation",
+					GPU_PRI_ACCESS_VIOLATION,
+					INJECT_SW,
+					NULL, nvgpu_inject_pri_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+	{
+		.name = "ce",
+		.hw_unit = (u32)NVGPU_ERR_MODULE_CE,
+		.num_instances = 1U,
+		.num_errs = 5U,
+		.errs = (struct nvgpu_err_desc[]) {
+			GPU_CRITERR("ce_launch_error",
+					GPU_CE_LAUNCH_ERROR,
+					INJECT_SW,
+					NULL, nvgpu_inject_ce_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ce_block_pipe",
+					GPU_CE_BLOCK_PIPE,
+					INJECT_SW,
+					NULL, nvgpu_inject_ce_swerror,
+					NULL, NULL, 0, 0),
+			GPU_NONCRITERR("ce_nonblock_pipe",
+					0, INJECT_NONE,
+					NULL, NULL,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ce_invalid_config",
+					GPU_CE_INVALID_CONFIG,
+					INJECT_SW,
+					NULL, nvgpu_inject_ce_swerror,
+					NULL, NULL, 0, 0),
+			GPU_CRITERR("ce_method_buffer_fault",
+					GPU_CE_METHOD_BUFFER_FAULT,
+					INJECT_SW,
+					NULL, nvgpu_inject_ce_swerror,
+					NULL, NULL, 0, 0),
+		},
+	},
+};
+/* Number of struct nvgpu_err_hw_module entries in gv11b_err_lut. */
+u32 size_of_gv11b_lut = sizeof(gv11b_err_lut) /
+					sizeof(struct nvgpu_err_hw_module);
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
index f55e66ee0..20818ac53 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
@@ -1060,6 +1060,11 @@ static const struct gops_grmgr gm20b_ops_grmgr = {
 	.init_gr_manager = nvgpu_init_gr_manager,
 };
 
+static const struct gops_cic gm20b_ops_cic = {
+	.init = NULL,
+	.report_err = NULL,
+};
+
 int gm20b_init_hal(struct gk20a *g)
 {
 	struct gpu_ops *gops = &g->ops;
@@ -1165,6 +1170,7 @@ int gm20b_init_hal(struct gk20a *g)
 	gops->tpc = gm20b_ops_tpc;
 #endif
 	gops->grmgr = gm20b_ops_grmgr;
+	gops->cic = gm20b_ops_cic;
gops->chip_init_gpu_characteristics = nvgpu_init_gpu_characteristics; gops->get_litter_value = gm20b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index 05034f4ec..f77d2797f 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -1155,6 +1155,11 @@ static const struct gops_grmgr gp10b_ops_grmgr = { .init_gr_manager = nvgpu_init_gr_manager, }; +static const struct gops_cic gp10b_ops_cic = { + .init = NULL, + .report_err = NULL, +}; + int gp10b_init_hal(struct gk20a *g) { struct gpu_ops *gops = &g->ops; @@ -1250,6 +1255,7 @@ int gp10b_init_hal(struct gk20a *g) gops->tpc = gp10b_ops_tpc; #endif gops->grmgr = gp10b_ops_grmgr; + gops->cic = gp10b_ops_cic; gops->chip_init_gpu_characteristics = gp10b_init_gpu_characteristics; gops->get_litter_value = gp10b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 3ee57cafa..42b0e1e7f 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -188,6 +188,8 @@ #include "hal/fifo/channel_gm20b.h" #include "hal/fifo/channel_gv11b.h" +#include "hal/cic/cic_gv11b.h" + #ifdef CONFIG_NVGPU_TPC_POWERGATE #include "hal/tpc/tpc_gv11b.h" #endif @@ -209,6 +211,7 @@ #include #include #include +#include #include @@ -1421,6 +1424,11 @@ static const struct gops_grmgr gv11b_ops_grmgr = { .init_gr_manager = nvgpu_init_gr_manager, }; +static const struct gops_cic gv11b_ops_cic = { + .init = gv11b_cic_init, + .report_err = nvgpu_cic_report_err_safety_services, +}; + int gv11b_init_hal(struct gk20a *g) { struct gpu_ops *gops = &g->ops; @@ -1516,6 +1524,7 @@ int gv11b_init_hal(struct gk20a *g) gops->tpc = gv11b_ops_tpc; #endif gops->grmgr = gv11b_ops_grmgr; + gops->cic = gv11b_ops_cic; gops->chip_init_gpu_characteristics = 
gv11b_init_gpu_characteristics; gops->get_litter_value = gv11b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index 658fec0c1..b2574df5b 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -1612,6 +1612,11 @@ static const struct gops_grmgr tu104_ops_grmgr = { }; #endif +static const struct gops_cic tu104_ops_cic = { + .init = NULL, + .report_err = NULL, +}; + int tu104_init_hal(struct gk20a *g) { struct gpu_ops *gops = &g->ops; @@ -1720,6 +1725,7 @@ int tu104_init_hal(struct gk20a *g) gops->gsp = tu104_ops_gsp; gops->top = tu104_ops_top; gops->grmgr = tu104_ops_grmgr; + gops->cic = tu104_ops_cic; gops->chip_init_gpu_characteristics = tu104_init_gpu_characteristics; gops->get_litter_value = tu104_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; diff --git a/drivers/gpu/nvgpu/include/nvgpu/cic.h b/drivers/gpu/nvgpu/include/nvgpu/cic.h new file mode 100644 index 000000000..5208df9b9 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/cic.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_CIC_H +#define NVGPU_CIC_H + +#include + +struct nvgpu_err_desc; +/** + * @file + * + * Public structs and APIs exposed by Central Interrupt Controller + * (CIC) unit. + */ + +/* + * Requires a string literal for the format - notice the string + * concatenation. + */ +#define cic_dbg(g, fmt, args...) \ + nvgpu_log((g), gpu_dbg_cic, "CIC | " fmt, ##args) + +/** + * @brief Initialize the CIC unit's data structures + * + * @param g [in] - The GPU driver struct. + * + * - Check if CIC unit is already initialized by checking its + * reference in struct gk20a. + * - If not initialized, allocate memory for CIC's private data + * structure. + * - Initialize the members of this private structure. + * - Store a reference pointer to the CIC struct in struct gk20a. + * + * @return 0 if Initialization had already happened or was + * successful in this call. + * < 0 if any steps in initialization fail. + * + * @retval -ENOMEM if sufficient memory is not available for CIC + * struct. + * + */ +int nvgpu_cic_init_common(struct gk20a *g); + +/** + * @brief De-initialize the CIC unit's data structures + * + * @param g [in] - The GPU driver struct. + * + * - Check if CIC unit is already deinitialized by checking its + * reference in struct gk20a. + * - If not deinitialized, set the LUT pointer to NULL and set the + * num_hw_modules to 0. + * - Free the memory allocated for CIC's private data structure. + * - Invalidate reference pointer to the CIC struct in struct gk20a.
+ * + * @return 0 if Deinitialization had already happened or was + * successful in this call. + * + * @retval None. + */ +int nvgpu_cic_deinit_common(struct gk20a *g); + +/** + * @brief Check if the input HW unit ID is a valid CIC HW unit. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit_id [in] - HW unit ID to be verified + * + * - Check if the CIC unit is initialized so that the LUT is + * available to verify the hw_unit_id. + * - LUT is an array of nvgpu_err_hw_module struct which contains the + * hw_unit_id for a specific unit. + * - The hw_unit_id starts from 0 and ends at + * (g->cic->num_hw_modules - 1) and hence effectively can serve as + * index into the LUT array. + * + * @return 0 if input hw_unit_id is valid, + * < 0 if input hw_unit_id is invalid + * @retval -EINVAL if CIC is not initialized or + * if input hw_unit_id is invalid. + */ +int nvgpu_cic_check_hw_unit_id(struct gk20a *g, u32 hw_unit_id); + +/** + * @brief Check if the input error ID is valid in CIC domain. + * + * @param g [in] - The GPU driver struct. + * @param hw_unit_id [in] - HW unit ID corresponding to err_id + * @param err_id [in] - Error ID to be verified + * + * - Check if the CIC unit is initialized so that the LUT is + * available to verify the hw_unit_id. + * - LUT is an array of nvgpu_err_hw_module struct which contains the + * hw_unit_id for a specific unit and also the number of errors + * reported by the unit. + * - The hw_unit_id starts from 0 and ends at + * (g->cic->num_hw_modules - 1) and hence effectively can serve as + * index into the LUT array. + * - Before using the input hw_unit_id to index into LUT, verify that + * the hw_unit_id is valid. + * - Index using hw_unit_id and derive the num_errs from LUT for the + * given HW unit + * - Check if the input err_id lies between 0 and (num_errs-1).
+ * + * @return 0 if input err_id is valid, < 0 if input err_id is invalid + * @retval -EINVAL if CIC is not initialized or + * if input hw_unit_id or err_id is invalid. + */ +int nvgpu_cic_check_err_id(struct gk20a *g, u32 hw_unit_id, + u32 err_id); + +/** + * @brief Get the LUT data for input HW unit ID and error ID + * + * @param g [in] - The GPU driver struct. + * @param hw_unit_id [in] - HW unit ID corresponding to err_id + * @param err_id [in] - Error ID whose LUT data is required. + * @param err_desc [out] - Pointer to store LUT data into. + * + * - LUT is an array of nvgpu_err_hw_module struct which contains + * all the static data for each HW unit reporting error to CIC. + * - nvgpu_err_hw_module struct in turn holds an array of struct + * nvgpu_err_desc which stores static data per error ID. + * - Use the nvgpu_cic_check_err_id() API to + * - Check if the CIC unit is initialized so that the LUT is + * available to read the static data for input err_id. + * - Check if input HW unit ID and error ID are valid. + * - The hw_unit_id starts from 0 and ends at + * (g->cic->num_hw_modules - 1) and hence effectively can serve as + * index into the LUT array. + * - The err_id starts from 0 and ends at + * (lut[hw_unit_id].num_errs - 1), and hence effectively can serve + * as index into array of errs[]. + * - Index using hw_unit_id and err_id and store the LUT data into \a err_desc. + * + * @return 0 if err_desc was successfully filled with LUT data, + * < 0 otherwise. + * @retval -EINVAL if CIC is not initialized or + * if input hw_unit_id or err_id is invalid. + */ +int nvgpu_cic_get_err_desc(struct gk20a *g, u32 hw_unit_id, + u32 err_id, struct nvgpu_err_desc **err_desc); + +/** + * @brief GPU HW errors are reported to Safety_Services via SDL unit. + * This function provides an interface between error reporting functions + * used by sub-units in nvgpu-rm and SDL unit. + * + * @param g [in] - The GPU driver struct. + * @param err_info [in] - Error message.
+ * @param err_size [in] - Size of the error message. + * @param is_critical [in] - Criticality of the error being reported. + * + * On QNX: + * - Checks whether SDL is initialized. + * - Enqueues \a err_info into error message queue. + * - Signals the workqueue condition variable. + * - If the reported error is critical, invokes #nvgpu_sw_quiesce() api. + * + * on Linux: + * - NOP currently as safety services are absent in Linux + * + * @return 0 in case of success, <0 in case of failure. + * @retval -EAGAIN if SDL not initialized. + * @retval -ENOMEM if sufficient memory is not available. + */ +int nvgpu_cic_report_err_safety_services(struct gk20a *g, + void *err_info, size_t err_size, bool is_critical); + +/** + * @brief Get the number of HW modules supported by CIC. + * + * @param g [in] - The GPU driver struct. + * + * - Check if the CIC unit is initialized so that num_hw_modules is + * initialized. + * - Return the num_hw_modules variable stored in CIC's private + * struct. + * + * @return 0 or >0 value of num_hw_modules if successful; + * < 0 otherwise. + * @retval -EINVAL if CIC is not initialized. + */ +int nvgpu_cic_get_num_hw_modules(struct gk20a *g); +#endif /* NVGPU_CIC_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 2f46cc651..2906f81a4 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -107,6 +107,7 @@ struct nvgpu_gpfifo_entry; struct vm_gk20a_mapping_batch; struct pmu_pg_stats_data; struct clk_domains_mon_status_params; +struct nvgpu_cic; enum nvgpu_flush_op; enum gk20a_mem_rw_flag; @@ -791,6 +792,8 @@ struct gk20a { /** Multi Instance GPU information. 
*/ struct nvgpu_mig mig; + /** Pointer to struct storing CIC unit's data */ + struct nvgpu_cic *cic; }; /** diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/cic.h b/drivers/gpu/nvgpu/include/nvgpu/gops/cic.h new file mode 100644 index 000000000..19f8b6e1d --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/gops/cic.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifndef NVGPU_GOPS_CIC_H +#define NVGPU_GOPS_CIC_H + +#include + +/** + * @file + * + * Central Interrupt Controller unit HAL interface + * + */ +struct gk20a; +struct nvgpu_cic; + +/** + * CIC unit HAL operations + * + * @see gpu_ops + */ +struct gops_cic { + /** + * @brief Chip specific CIC unit initialization. + * + * @param g [in] Pointer to GPU driver struct. + * @param cic [in] Pointer to CIC private struct. + * + * @return 0 in case of success, < 0 in case of failure. 
+ */ + int (*init)(struct gk20a *g, struct nvgpu_cic *cic); + + /** + * @brief Report error to safety services. + * + * @param g [in] Pointer to GPU driver struct. + * @param err_pkt [in] Pointer to struct holding err details. + * @param err_size [in] Size of err_pkt. + * @param is_critical [in] Flag indicating criticality of error. + * + * @return 0 in case of success, < 0 in case of failure. + */ + int (*report_err)(struct gk20a *g, + void *err_pkt, size_t err_size, + bool is_critical); +}; + +#endif/*NVGPU_GOPS_CIC_H*/ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gpu_ops.h b/drivers/gpu/nvgpu/include/nvgpu/gpu_ops.h index 79d4f94f8..998ee069f 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gpu_ops.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gpu_ops.h @@ -71,6 +71,7 @@ #include #include #include +#include struct gk20a; struct nvgpu_debug_context; @@ -224,6 +225,7 @@ struct gpu_ops { struct gops_grmgr grmgr; + struct gops_cic cic; }; #endif /* NVGPU_GOPS_OPS_H */ diff --git a/drivers/gpu/nvgpu/os/linux/cic/cic_stub.c b/drivers/gpu/nvgpu/os/linux/cic/cic_stub.c new file mode 100644 index 000000000..5a9319853 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/cic/cic_stub.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include + +struct gk20a; + +int nvgpu_cic_report_err_safety_services(struct gk20a *g, + void *err_info, size_t err_size, bool is_critical) +{ + return 0; +} diff --git a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c b/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c deleted file mode 100644 index 4f1ecc0a7..000000000 --- a/drivers/gpu/nvgpu/os/linux/sdl/sdl_stub.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include - -struct gk20a; -struct mmu_fault_info; - -void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, - u32 inst, u32 err_id, u32 intr_info) -{ - return; -} - -void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, u64 err_addr, u64 err_count) -{ - return; -} - -void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, struct gr_err_info *err_info, u32 sub_err_type) -{ - return; -} - -void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id, - u32 sub_err_type, u32 status) -{ - return; -} - -void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, - u32 inst, u32 err_id, u32 intr_info) -{ - return; -} - -void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, u32 err_addr, u32 err_code) -{ - return; -} - -void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, - void *data) -{ - return; -} - -void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, - u32 err_id, struct mmu_fault_info *fault_info, - u32 status, u32 sub_err_type) -{ - return; -} diff --git a/drivers/gpu/nvgpu/os/posix/stubs.c b/drivers/gpu/nvgpu/os/posix/stubs.c index 045b006be..d6c9146e4 100644 --- a/drivers/gpu/nvgpu/os/posix/stubs.c +++ b/drivers/gpu/nvgpu/os/posix/stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,10 +27,10 @@ #include #include -#include +#include +#include struct gk20a; -struct mmu_fault_info; #ifdef CONFIG_NVGPU_DEBUGGER void nvgpu_dbg_session_post_event(struct dbg_session_gk20a *dbg_s) @@ -49,51 +49,8 @@ void nvgpu_ecc_sysfs_remove(struct gk20a *g) } #endif -void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit, - u32 inst, u32 err_id, u32 intr_info) +int nvgpu_cic_report_err_safety_services(struct gk20a *g, + void *err_info, size_t err_size, bool is_critical) { - return; -} - -void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, u64 err_addr, u64 err_count) -{ - return; -} - -void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, struct gr_err_info *err_info, u32 sub_err_type) -{ - return; -} - -void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id, - u32 sub_err_type, u32 status) -{ - return; -} - -void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit, - u32 inst, u32 err_id, u32 intr_info) -{ - return; -} - -void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst, - u32 err_id, u32 err_addr, u32 err_code) -{ - return; -} - -void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id, - void *data) -{ - return; -} - -void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit, - u32 err_id, struct mmu_fault_info *fault_info, - u32 status, u32 sub_err_type) -{ - return; + return 0; } diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export b/libs/dgpu/libnvgpu-drv-dgpu_safe.export index 7e5f4dde7..eab3406d1 100644 --- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export +++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export @@ -795,3 +795,10 @@ nvgpu_init_pramin gk20a_bus_set_bar0_window nvgpu_pramin_ops_init nvgpu_dma_alloc_vid_at +nvgpu_cic_init_common +nvgpu_cic_deinit_common +nvgpu_cic_check_hw_unit_id +nvgpu_cic_check_err_id 
+nvgpu_cic_get_err_desc +nvgpu_cic_report_err_safety_services +nvgpu_cic_get_num_hw_modules diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index 12cf3cb10..ffc13f2dc 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -248,6 +248,7 @@ gv11b_blcg_hshub_get_gating_prod gv11b_netlist_is_firmware_defined gv11b_top_get_num_lce gv11b_bus_configure_debug_bus +gv11b_cic_init mc_gp10b_intr_stall_unit_config mc_gp10b_intr_nonstall_unit_config nvgpu_acr_bootstrap_hs_acr @@ -809,3 +810,10 @@ nvgpu_rc_tsg_and_related_engines nvgpu_rc_mmu_fault gp10b_priv_ring_isr_handle_0 gp10b_priv_ring_isr_handle_1 +nvgpu_cic_init_common +nvgpu_cic_deinit_common +nvgpu_cic_check_hw_unit_id +nvgpu_cic_check_err_id +nvgpu_cic_get_err_desc +nvgpu_cic_report_err_safety_services +nvgpu_cic_get_num_hw_modules diff --git a/userspace/units/acr/nvgpu-acr.c b/userspace/units/acr/nvgpu-acr.c index 7045ae02a..e25138f17 100644 --- a/userspace/units/acr/nvgpu-acr.c +++ b/userspace/units/acr/nvgpu-acr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -215,6 +216,11 @@ static int init_acr_falcon_test_env(struct unit_module *m, struct gk20a *g) return -ENODEV; } + err = nvgpu_cic_init_common(g); + if (err != 0) { + unit_return_fail(m, "CIC init failed\n"); + } + /* * Register space: FB_MMU */ diff --git a/userspace/units/bus/nvgpu-bus.c b/userspace/units/bus/nvgpu-bus.c index de407f8f3..c8743a031 100644 --- a/userspace/units/bus/nvgpu-bus.c +++ b/userspace/units/bus/nvgpu-bus.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include @@ -128,6 +130,8 @@ int test_bus_setup(struct unit_module *m, struct gk20a *g, void *args) g->ops.mc.intr_nonstall_unit_config = mc_gp10b_intr_nonstall_unit_config; g->ops.ptimer.isr = gk20a_ptimer_isr; + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; /* Map register space NV_PRIV_MASTER */ if (nvgpu_posix_io_add_reg_space(g, NV_PBUS_START, NV_PBUS_SIZE) != 0) { @@ -154,6 +158,12 @@ int test_bus_setup(struct unit_module *m, struct gk20a *g, void *args) (void)nvgpu_posix_register_io(g, &test_reg_callbacks); + if (nvgpu_cic_init_common(g) != 0) { + unit_err(m, "%s: Failed to initialize CIC\n", + __func__); + return UNIT_FAIL; + } + return UNIT_SUCCESS; } diff --git a/userspace/units/ce/nvgpu-ce.c b/userspace/units/ce/nvgpu-ce.c index 7cb39403e..af17e095e 100644 --- a/userspace/units/ce/nvgpu-ce.c +++ b/userspace/units/ce/nvgpu-ce.c @@ -27,8 +27,10 @@ #include #include #include +#include #include #include +#include #include #include "nvgpu-ce.h" @@ -126,6 +128,15 @@ int test_ce_setup_env(struct unit_module *m, g->blcg_enabled = false; nvgpu_spinlock_init(&g->mc.intr_lock); + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; + + if (nvgpu_cic_init_common(g) != 0) { + unit_err(m, "%s: failed to initialize CIC\n", + __func__); + return UNIT_FAIL; + } + return UNIT_SUCCESS; } diff --git a/userspace/units/fb/fb_gv11b_fusa.c b/userspace/units/fb/fb_gv11b_fusa.c index cfd59a7fb..af90b03f8 100644 --- a/userspace/units/fb/fb_gv11b_fusa.c +++ b/userspace/units/fb/fb_gv11b_fusa.c @@ -27,6 +27,7 @@ #include #include 
+#include #include #include "hal/mc/mc_gp10b.h" #include "hal/fb/fb_gm20b.h" @@ -34,6 +35,7 @@ #include "hal/fb/ecc/fb_ecc_gv11b.h" #include "hal/fb/intr/fb_intr_gv11b.h" #include "hal/fb/intr/fb_intr_ecc_gv11b.h" +#include "hal/cic/cic_gv11b.h" #include #include "fb_fusa.h" @@ -62,6 +64,8 @@ int fb_gv11b_init_test(struct unit_module *m, struct gk20a *g, void *args) g->ops.mc.intr_nonstall_unit_config = mc_gp10b_intr_nonstall_unit_config; g->ops.fb.intr.enable = gv11b_fb_intr_enable; + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; /* * Define some arbitrary addresses for test purposes. @@ -74,6 +78,10 @@ int fb_gv11b_init_test(struct unit_module *m, struct gk20a *g, void *args) g->mm.mmu_rd_mem.cpu_va = (void *) 0x30000000; g->mm.mmu_rd_mem.aperture = APERTURE_SYSMEM; + if (nvgpu_cic_init_common(g) != 0) { + unit_return_fail(m, "CIC init failed\n"); + } + g->ops.ecc.ecc_init_support(g); nvgpu_writel(g, fb_niso_intr_en_set_r(0), 0); diff --git a/userspace/units/fifo/nvgpu-fifo-common.c b/userspace/units/fifo/nvgpu-fifo-common.c index 81c604ec1..c5e1a2f57 100644 --- a/userspace/units/fifo/nvgpu-fifo-common.c +++ b/userspace/units/fifo/nvgpu-fifo-common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -187,6 +188,12 @@ int test_fifo_init_support(struct unit_module *m, struct gk20a *g, void *args) /* Do not allocate from vidmem */ nvgpu_set_enabled(g, NVGPU_MM_UNIFIED_MEMORY, true); + err = nvgpu_cic_init_common(g); + if (err != 0) { + unit_err(m, "CIC init failed!\n"); + return UNIT_FAIL; + } + return UNIT_SUCCESS; fail: diff --git a/userspace/units/gr/nvgpu-gr.c b/userspace/units/gr/nvgpu-gr.c index efbef32e1..1144dfaad 100644 --- a/userspace/units/gr/nvgpu-gr.c +++ b/userspace/units/gr/nvgpu-gr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -33,10 +33,12 @@ #include #include #include +#include #include "common/gr/gr_falcon_priv.h" #include "hal/init/hal_gv11b.h" +#include "hal/cic/cic_gv11b.h" #include "nvgpu-gr.h" #include "nvgpu-gr-gv11b.h" @@ -162,6 +164,14 @@ int test_gr_init_setup_ready(struct unit_module *m, nvgpu_device_init(g); nvgpu_fifo_setup_sw(g); + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; + + err = nvgpu_cic_init_common(g); + if (err != 0) { + unit_return_fail(m, "CIC init failed\n"); + } + /* Allocate and Initialize GR */ err = test_gr_init_setup(m, g, args); if (err != 0) { diff --git a/userspace/units/ltc/nvgpu-ltc.c b/userspace/units/ltc/nvgpu-ltc.c index 695b5ee45..99335fe16 100644 --- a/userspace/units/ltc/nvgpu-ltc.c +++ b/userspace/units/ltc/nvgpu-ltc.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -136,6 +137,11 @@ int test_ltc_init_support(struct unit_module *m, 
unit_return_fail(m, "nvgpu_init_hal failed\n"); } + err = nvgpu_cic_init_common(g); + if (err != 0) { + unit_return_fail(m, "CIC init failed\n"); + } + /* * Init dependent ECC unit */ diff --git a/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c b/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c index 77f6029d1..541648e0f 100644 --- a/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c +++ b/userspace/units/mm/hal/mmu_fault/gv11b_fusa/mmu-fault-gv11b-fusa.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,7 @@ #include "hal/mm/gmmu/gmmu_gv11b.h" #include "hal/mm/mm_gp10b.h" #include "hal/mm/mm_gv11b.h" +#include "hal/cic/cic_gv11b.h" #include "hal/mm/mmu_fault/mmu_fault_gv11b.h" #include "mmu-fault-gv11b-fusa.h" @@ -220,6 +222,13 @@ int test_env_init_mm_mmu_fault_gv11b_fusa(struct unit_module *m, unit_return_fail(m, "nvgpu_init_mm_support failed\n"); } + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; + + if (nvgpu_cic_init_common(g) != 0) { + unit_return_fail(m, "Failed to initialize CIC\n"); + } + return UNIT_SUCCESS; } diff --git a/userspace/units/priv_ring/nvgpu-priv_ring.c b/userspace/units/priv_ring/nvgpu-priv_ring.c index 60512340b..d560bfd34 100644 --- a/userspace/units/priv_ring/nvgpu-priv_ring.c +++ b/userspace/units/priv_ring/nvgpu-priv_ring.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,12 +23,14 @@ #include #include #include +#include #include #include #include #include #include +#include "hal/cic/cic_gv11b.h" #include #include @@ -123,6 +125,8 @@ int test_priv_ring_setup(struct unit_module *m, struct gk20a *g, void *args) g->ops.get_litter_value = gv11b_get_litter_value; g->ops.mc.intr_stall_unit_config = mc_gp10b_intr_stall_unit_config; + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; /* Map register space NV_PRIV_MASTER */ if (nvgpu_posix_io_add_reg_space(g, NV_PRIV_MASTER_START, @@ -158,6 +162,12 @@ int test_priv_ring_setup(struct unit_module *m, struct gk20a *g, void *args) (void)nvgpu_posix_register_io(g, &test_reg_callbacks); + if (nvgpu_cic_init_common(g) != 0) { + unit_err(m, "%s: Failed to initialize CIC\n", + __func__); + return UNIT_FAIL; + } + return UNIT_SUCCESS; } diff --git a/userspace/units/ptimer/nvgpu-ptimer.c b/userspace/units/ptimer/nvgpu-ptimer.c index 0c16402aa..cbe892025 100644 --- a/userspace/units/ptimer/nvgpu-ptimer.c +++ b/userspace/units/ptimer/nvgpu-ptimer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,7 +26,9 @@ #include #include +#include #include +#include #include #include "nvgpu-ptimer.h" @@ -88,6 +90,9 @@ int test_setup_env(struct unit_module *m, g->ops.ptimer.read_ptimer = gk20a_read_ptimer; g->ops.ptimer.isr = gk20a_ptimer_isr; + g->ops.cic.init = gv11b_cic_init; + g->ops.cic.report_err = nvgpu_cic_report_err_safety_services; + /* Create ptimer register space */ if (nvgpu_posix_io_add_reg_space(g, PTIMER_REG_SPACE_START, PTIMER_REG_SPACE_SIZE) != 0) { @@ -97,6 +102,12 @@ int test_setup_env(struct unit_module *m, } (void)nvgpu_posix_register_io(g, &test_reg_callbacks); + if (nvgpu_cic_init_common(g) != 0) { + unit_err(m, "%s: failed to initialize CIC\n", + __func__); + return UNIT_FAIL; + } + return UNIT_SUCCESS; }