Revert "gpu:nvgpu: Expose physical gpc,tpc layout for ecc sysfs nodes."

This reverts commit 2cc098eae7.

Reason for revert: intermittent boot failures on drv-orin-f1 and 
frspr-f1 on both AV+L and AV+Q.

Bug 3998230

Change-Id: I230ba7ba469fde3f470dab7538cc757c99360d99
Signed-off-by: srajum <srajum@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2863208
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
Author:       V M S Seeta Rama Raju Mudundi
Date:         2023-02-24 06:22:54 -08:00
Committed by: mobile promotions
Parent:       a4eca46b4b
Commit:       ab46ee3335

5 changed files with 69 additions and 168 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -549,7 +549,7 @@ struct nvgpu_gr_config *nvgpu_gr_config_init(struct gk20a *g)
 {
 	struct nvgpu_gr_config *config;
 	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	u32 gpc_index, tpc_index, tpc_phys_pos, local_gpc_tpc_mask_phys, i;
+	u32 gpc_index;
 	u32 gpc_phys_id;
 	int err;
@@ -660,52 +660,9 @@ struct nvgpu_gr_config *nvgpu_gr_config_init(struct gk20a *g)
 		gr_config_init_gpc_skip_mask(config, gpc_index);
 	}
 
-	/*
-	 * This structure holds the physical id for a TPC within a
-	 * GPC. The GPC is indexed using physical id and the TPC is indexed using
-	 * logical id.
-	 */
-	config->gpc_tpc_physical_id_map = nvgpu_kzalloc(g,
-			nvgpu_safe_mult_u64((size_t)config->gpc_count,
-			sizeof(u32 *)));
-	if (config->gpc_tpc_physical_id_map == NULL) {
-		nvgpu_err(g, "alloc gpc_tpc_physical_id_map failed");
-		goto clean_up_gpc_rop_config;
-	}
-
-	//Get Physical layout of tpc per physical gpc
-	for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
-		gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g,
-				cur_gr_instance, (u32)gpc_index);
-		config->gpc_tpc_physical_id_map[gpc_phys_id] =
-			nvgpu_kzalloc(g, config->max_tpc_per_gpc_count);
-		if (config->gpc_tpc_physical_id_map[gpc_phys_id] == NULL) {
-			nvgpu_err(g, "alloc tpc_physical_id_map(%u) failed",
-				gpc_phys_id);
-			goto clean_up_gpc_tpc_physical_id_map_alloc_fail;
-		}
-		tpc_phys_pos = 0U;
-		local_gpc_tpc_mask_phys = config->gpc_tpc_mask_physical[gpc_phys_id];
-		tpc_index = 0U;
-		while (tpc_index < config->gpc_tpc_count[gpc_index]) {
-			while (local_gpc_tpc_mask_phys != 0x0U) {
-				if ((local_gpc_tpc_mask_phys & 0x1U) != 0x0U) {
-					config->gpc_tpc_physical_id_map[gpc_phys_id][tpc_index++] =
-						tpc_phys_pos;
-				}
-				local_gpc_tpc_mask_phys >>= 1;
-				tpc_phys_pos++;
-			}
-		}
-	}
-
 	gr_config_log_info(g, config);
 
 	return config;
 
-clean_up_gpc_tpc_physical_id_map_alloc_fail:
-	for (i = 0; i < gpc_index; i++) {
-		nvgpu_kfree(g, config->gpc_tpc_physical_id_map[i]);
-	}
-	nvgpu_kfree(g, config->gpc_tpc_physical_id_map);
 clean_up_gpc_rop_config:
 	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ROP_IN_GPC)) {
 		gr_config_free_gpc_rop_config(g, config);
@@ -960,7 +917,6 @@ u32 nvgpu_gr_config_get_gpc_zcb_count(struct nvgpu_gr_config *config,
 void nvgpu_gr_config_deinit(struct gk20a *g, struct nvgpu_gr_config *config)
 {
-	u32 i;
 
 	if (config == NULL) {
 		return;
 	}
@@ -983,10 +939,6 @@ void nvgpu_gr_config_deinit(struct gk20a *g, struct nvgpu_gr_config *config)
 		config->sm_to_cluster_redex_config = NULL;
 	}
 #endif
-	for (i = 0; i < config->gpc_count; i++) {
-		nvgpu_kfree(g, config->gpc_tpc_physical_id_map[i]);
-	}
-	nvgpu_kfree(g, config->gpc_tpc_physical_id_map);
 	nvgpu_kfree(g, config);
 }
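
For orientation, the heart of what this hunk removes is the per-GPC table that translates a logical TPC index into its physical position by walking the per-GPC physical TPC mask. A minimal standalone sketch of that mapping, with illustrative names rather than the nvgpu implementation:

/*
 * Standalone sketch (not nvgpu code): given the physical TPC mask of one
 * GPC, build the logical-index -> physical-position table that the
 * reverted change kept in gpc_tpc_physical_id_map. Bit i set in the mask
 * means physical TPC position i is present; logical indices are assigned
 * in ascending physical order.
 */
#include <stdint.h>
#include <stdio.h>

static void build_tpc_phys_map(uint32_t tpc_mask_phys, uint32_t tpc_count,
			       uint32_t *map)
{
	uint32_t phys_pos = 0U;
	uint32_t logical = 0U;

	while (logical < tpc_count && tpc_mask_phys != 0U) {
		if ((tpc_mask_phys & 1U) != 0U) {
			map[logical++] = phys_pos;
		}
		tpc_mask_phys >>= 1;
		phys_pos++;
	}
}

int main(void)
{
	/* Example mask 0b1101: physical TPCs 0, 2 and 3 present. */
	uint32_t map[3] = {0};
	uint32_t i;

	build_tpc_phys_map(0xDU, 3U, map);
	for (i = 0U; i < 3U; i++) {
		printf("logical TPC %u -> physical TPC %u\n",
		       (unsigned int)i, (unsigned int)map[i]);
	}
	return 0;
}

With the table gone, the restored code in the ECC and interrupt paths below indexes its counters by the logical gpc/tpc loop indices again.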

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -143,12 +143,6 @@ struct nvgpu_gr_config {
 	 * Array is indexed by GPC physical-id.
 	 */
 	u32 *gpc_tpc_mask_physical;
-	/**
-	 * 2D array to map TPC physical id to logical id.
-	 * Array is indexed by GPC physical id and TPC is indexed using
-	 * logical id.
-	 */
-	u32 **gpc_tpc_physical_id_map;
 	/**
 	 * 2-D array to hold mask of TPCs attached to a PES unit
 	 * in a GPC.

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -23,12 +23,10 @@
 #include <nvgpu/gr/gr_ecc.h>
 #include <nvgpu/gr/gr_utils.h>
 #include <nvgpu/gr/config.h>
-#include <nvgpu/gr/gr_instances.h>
 #include <nvgpu/string.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/kmem.h>
 #include <nvgpu/ecc.h>
-#include "common/gr/gr_config_priv.h"
 
 int nvgpu_ecc_counter_init_per_gr(struct gk20a *g,
 	struct nvgpu_ecc_stat **stat, const char *name)
@@ -72,9 +70,8 @@ int nvgpu_ecc_counter_init_per_tpc(struct gk20a *g,
 {
 	struct nvgpu_ecc_stat **stats;
 	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
 	u32 gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
-	u32 gpc, tpc, gpc_phys_id, tpc_phys_id;
+	u32 gpc, tpc;
 	char gpc_str[10] = {0}, tpc_str[10] = {0};
 	int err = 0;
@@ -85,48 +82,46 @@
 	}
 
 	for (gpc = 0; gpc < gpc_count; gpc++) {
-		gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-		stats[gpc_phys_id] = nvgpu_kzalloc(g,
-			nvgpu_safe_mult_u64(sizeof(*stats[gpc_phys_id]),
-			nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config)));
-		if (stats[gpc_phys_id] == NULL) {
-			nvgpu_err(g, "Mem alloc failed for %s\n", name);
+		stats[gpc] = nvgpu_kzalloc(g,
+			nvgpu_safe_mult_u64(sizeof(*stats[gpc]),
+				nvgpu_gr_config_get_gpc_tpc_count(gr_config,
+					gpc)));
+		if (stats[gpc] == NULL) {
 			err = -ENOMEM;
 			goto fail;
 		}
 	}
 
 	for (gpc = 0; gpc < gpc_count; gpc++) {
-		gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-		// For getting tpc count, gpc id is logical because we read it using gpc_stride.
-		for (tpc = 0; tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc); tpc++) {
-			tpc_phys_id = gr_config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
+		for (tpc = 0;
+		     tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc);
+		     tpc++) {
 			/**
 			 * Store stats name as below:
 			 * gpc<gpc_value>_tpc<tpc_value>_<name_string>
 			 */
-			(void)strcpy(stats[gpc_phys_id][tpc_phys_id].name, "gpc");
-			(void)nvgpu_strnadd_u32(gpc_str, gpc_phys_id,
+			(void)strcpy(stats[gpc][tpc].name, "gpc");
+			(void)nvgpu_strnadd_u32(gpc_str, gpc,
						sizeof(gpc_str), 10U);
-			(void)strncat(stats[gpc_phys_id][tpc_phys_id].name, gpc_str,
+			(void)strncat(stats[gpc][tpc].name, gpc_str,
						NVGPU_ECC_STAT_NAME_MAX_SIZE -
-						strlen(stats[gpc_phys_id][tpc_phys_id].name));
-			(void)strncat(stats[gpc_phys_id][tpc_phys_id].name, "_tpc",
+						strlen(stats[gpc][tpc].name));
+			(void)strncat(stats[gpc][tpc].name, "_tpc",
						NVGPU_ECC_STAT_NAME_MAX_SIZE -
-						strlen(stats[gpc_phys_id][tpc_phys_id].name));
-			(void)nvgpu_strnadd_u32(tpc_str, tpc_phys_id,
+						strlen(stats[gpc][tpc].name));
+			(void)nvgpu_strnadd_u32(tpc_str, tpc,
						sizeof(tpc_str), 10U);
-			(void)strncat(stats[gpc_phys_id][tpc_phys_id].name, tpc_str,
+			(void)strncat(stats[gpc][tpc].name, tpc_str,
						NVGPU_ECC_STAT_NAME_MAX_SIZE -
-						strlen(stats[gpc_phys_id][tpc_phys_id].name));
-			(void)strncat(stats[gpc_phys_id][tpc_phys_id].name, "_",
+						strlen(stats[gpc][tpc].name));
+			(void)strncat(stats[gpc][tpc].name, "_",
						NVGPU_ECC_STAT_NAME_MAX_SIZE -
-						strlen(stats[gpc_phys_id][tpc_phys_id].name));
-			(void)strncat(stats[gpc_phys_id][tpc_phys_id].name, name,
+						strlen(stats[gpc][tpc].name));
+			(void)strncat(stats[gpc][tpc].name, name,
						NVGPU_ECC_STAT_NAME_MAX_SIZE -
-						strlen(stats[gpc_phys_id][tpc_phys_id].name));
-			nvgpu_ecc_stat_add(g, &stats[gpc_phys_id][tpc_phys_id]);
+						strlen(stats[gpc][tpc].name));
+			nvgpu_ecc_stat_add(g, &stats[gpc][tpc]);
 		}
 	}
@@ -149,9 +144,8 @@ int nvgpu_ecc_counter_init_per_gpc(struct gk20a *g,
 {
 	struct nvgpu_ecc_stat *stats;
 	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
 	u32 gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
-	u32 gpc, gpc_phys_id;
+	u32 gpc;
	char gpc_str[10] = {0};
 
 	stats = nvgpu_kzalloc(g, nvgpu_safe_mult_u64(sizeof(*stats),
@@ -161,24 +155,23 @@ int nvgpu_ecc_counter_init_per_gpc(struct gk20a *g,
 	}
 
 	for (gpc = 0; gpc < gpc_count; gpc++) {
-		gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
 		/**
 		 * Store stats name as below:
 		 * gpc<gpc_value>_<name_string>
 		 */
-		(void)strcpy(stats[gpc_phys_id].name, "gpc");
-		(void)nvgpu_strnadd_u32(gpc_str, gpc_phys_id, sizeof(gpc_str), 10U);
-		(void)strncat(stats[gpc_phys_id].name, gpc_str,
+		(void)strcpy(stats[gpc].name, "gpc");
+		(void)nvgpu_strnadd_u32(gpc_str, gpc, sizeof(gpc_str), 10U);
+		(void)strncat(stats[gpc].name, gpc_str,
				NVGPU_ECC_STAT_NAME_MAX_SIZE -
-				strlen(stats[gpc_phys_id].name));
-		(void)strncat(stats[gpc_phys_id].name, "_",
+				strlen(stats[gpc].name));
+		(void)strncat(stats[gpc].name, "_",
				NVGPU_ECC_STAT_NAME_MAX_SIZE -
-				strlen(stats[gpc_phys_id].name));
-		(void)strncat(stats[gpc_phys_id].name, name,
+				strlen(stats[gpc].name));
+		(void)strncat(stats[gpc].name, name,
				NVGPU_ECC_STAT_NAME_MAX_SIZE -
-				strlen(stats[gpc_phys_id].name));
-		nvgpu_ecc_stat_add(g, &stats[gpc_phys_id]);
+				strlen(stats[gpc].name));
+		nvgpu_ecc_stat_add(g, &stats[gpc]);
 	}
 
 	*stat = stats;
@@ -210,28 +203,24 @@ void nvgpu_ecc_counter_deinit_per_tpc(struct gk20a *g,
 	struct nvgpu_ecc_stat **stats = NULL;
 	u32 gpc_count;
 	u32 gpc, tpc;
-	u32 gpc_phys_id, tpc_phys_id;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
 
 	if (*stats_p != NULL) {
 		gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
 		stats = *stats_p;
 
 		for (gpc = 0; gpc < gpc_count; gpc++) {
-			gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-			if (stats[gpc_phys_id] == NULL) {
+			if (stats[gpc] == NULL) {
 				continue;
 			}
 			for (tpc = 0;
 			     tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc);
 			     tpc++) {
-				tpc_phys_id = gr_config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
-				nvgpu_ecc_stat_del(g, &stats[gpc_phys_id][tpc_phys_id]);
+				nvgpu_ecc_stat_del(g, &stats[gpc][tpc]);
 			}
-			nvgpu_kfree(g, stats[gpc_phys_id]);
-			stats[gpc_phys_id] = NULL;
+			nvgpu_kfree(g, stats[gpc]);
+			stats[gpc] = NULL;
 		}
 
 		nvgpu_kfree(g, stats);
@@ -244,17 +233,15 @@ void nvgpu_ecc_counter_deinit_per_gpc(struct gk20a *g,
 {
 	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
 	struct nvgpu_ecc_stat *stats = NULL;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
 	u32 gpc_count;
-	u32 gpc, gpc_phys_id;
+	u32 gpc;
 
 	if (*stats_p != NULL) {
 		gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
 		stats = *stats_p;
 
 		for (gpc = 0; gpc < gpc_count; gpc++) {
-			gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-			nvgpu_ecc_stat_del(g, &stats[gpc_phys_id]);
+			nvgpu_ecc_stat_del(g, &stats[gpc]);
 		}
 
 		nvgpu_kfree(g, stats);
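
The naming loops above only change which indices go into the sysfs stat names; the format itself stays gpc<gpc_value>_tpc<tpc_value>_<name_string>. A hedged standalone sketch of that composition (snprintf stands in for the strcpy/strncat/nvgpu_strnadd_u32 sequence used in the driver; the buffer size and the counter string are illustrative, not the driver's):

/*
 * Standalone sketch (not nvgpu code): compose an ECC stat name of the
 * form "gpc<gpc_value>_tpc<tpc_value>_<name_string>".
 */
#include <stdint.h>
#include <stdio.h>

#define STAT_NAME_MAX 64U	/* illustrative, not NVGPU_ECC_STAT_NAME_MAX_SIZE */

static void ecc_stat_name(char *buf, uint32_t gpc, uint32_t tpc,
			  const char *counter)
{
	(void)snprintf(buf, STAT_NAME_MAX, "gpc%u_tpc%u_%s",
		       (unsigned int)gpc, (unsigned int)tpc, counter);
}

int main(void)
{
	char name[STAT_NAME_MAX];

	/* After the revert, 1 and 3 are logical gpc/tpc ids, not physical ones. */
	ecc_stat_name(name, 1U, 3U, "sm_lrf_ecc_double_err_count");
	printf("%s\n", name);
	return 0;
}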

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -34,7 +34,6 @@
 #include <nvgpu/gr/gr_intr.h>
 #include "common/gr/gr_priv.h"
-#include "common/gr/gr_config_priv.h"
 #include "common/gr/gr_intr_priv.h"
 #include "hal/gr/intr/gr_intr_gm20b.h"
 #include "hal/gr/intr/gr_intr_gp10b.h"
@@ -811,14 +810,11 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g,
 {
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
-	u32 offset, gpc_phys_id, tpc_phys_id;
+	u32 offset;
 	u32 rams_ecc_status;
 	u32 rams_uncorrected_err_count_delta = 0U;
 	bool is_rams_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -856,11 +852,9 @@ static void ga10b_gr_intr_handle_tpc_sm_rams_ecc_exception(struct gk20a *g,
 			rams_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_total_s()));
 	}
-	gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
-	g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_rams_ecc_uncorrected_err_count[gpc][tpc].counter,
 			rams_uncorrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_rams_ecc_uncorrected_err_count_r(), offset),

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -37,7 +37,6 @@
 
 #include "common/gr/gr_priv.h"
-#include "common/gr/gr_config_priv.h"
 
 #include "gr_intr_gp10b.h"
 #include "gr_intr_gv11b.h"
@@ -1078,11 +1077,6 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
 	bool is_l1_tag_ecc_corrected_total_err_overflow = false;
 	bool is_l1_tag_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
-	u32 gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	u32 tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -1126,9 +1120,9 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
 			l1_tag_corrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter,
 			l1_tag_corrected_err_count_delta);
 	gv11b_gr_intr_report_l1_tag_corrected_err(g, &ecc_status, gpc, tpc);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -1147,9 +1141,9 @@ static void gv11b_gr_intr_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32
 			l1_tag_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter,
 			l1_tag_uncorrected_err_count_delta);
 	gv11b_gr_intr_report_l1_tag_uncorrected_err(g, &ecc_status, gpc, tpc);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
@@ -1227,11 +1221,6 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
 	bool is_lrf_ecc_corrected_total_err_overflow = false;
 	bool is_lrf_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	u32 gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	u32 tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -1294,9 +1283,9 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
 			lrf_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_lrf_ecc_double_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_lrf_ecc_double_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
 			lrf_uncorrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset),
@@ -1365,11 +1354,6 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
 	bool is_cbu_ecc_corrected_total_err_overflow = false;
 	bool is_cbu_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
-	u32 gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	u32 tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -1430,9 +1414,9 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
 		nvgpu_wrapping_add_u32(cbu_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
 			cbu_uncorrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset),
@@ -1496,11 +1480,6 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
 	bool is_l1_data_ecc_corrected_total_err_overflow = false;
 	bool is_l1_data_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
-	u32 gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	u32 tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -1562,9 +1541,9 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
 		nvgpu_wrapping_add_u32(l1_data_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
 			l1_data_uncorrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset),
@@ -1682,11 +1661,6 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
 	bool is_icache_ecc_corrected_total_err_overflow = false;
 	bool is_icache_ecc_uncorrected_total_err_overflow = false;
 	struct nvgpu_gr_sm_ecc_status ecc_status;
-	u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
-	struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
-	struct nvgpu_gr_config *config = gr->config;
-	u32 gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g, cur_gr_instance, gpc);
-	u32 tpc_phys_id = config->gpc_tpc_physical_id_map[gpc_phys_id][tpc];
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -1729,9 +1703,9 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
 		nvgpu_wrapping_add_u32(icache_corrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_icache_ecc_corrected_err_count[gpc][tpc].counter,
 			icache_corrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(), offset),
@@ -1750,9 +1724,9 @@ static void gv11b_gr_intr_handle_icache_exception(struct gk20a *g, u32 gpc, u32
 			icache_uncorrected_err_count_delta,
 			BIT32(gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()));
 	}
-	g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter =
+	g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter =
 		nvgpu_wrapping_add_u32(
-			g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc_phys_id][tpc_phys_id].counter,
+			g->ecc.gr.sm_icache_ecc_uncorrected_err_count[gpc][tpc].counter,
 			icache_uncorrected_err_count_delta);
 	nvgpu_writel(g, nvgpu_safe_add_u32(
 		gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(), offset),
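
Every handler touched above follows the same update pattern: compute the per-TPC register offset from the GPC and TPC strides, then accumulate the error delta into the running counter with a wrapping 32-bit add; after the revert the counter arrays are indexed by the logical gpc/tpc arguments again rather than by physical ids. A standalone sketch of that pattern (the stride constants are made-up examples; the driver reads the real values via nvgpu_get_litter_value()):

/*
 * Standalone sketch (not nvgpu code) of the counter-update pattern in the
 * SM ECC exception handlers above.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t wrapping_add_u32(uint32_t a, uint32_t b)
{
	/* Unsigned arithmetic wraps modulo 2^32 by definition in C. */
	return a + b;
}

static uint32_t tpc_reg_offset(uint32_t gpc_stride, uint32_t tpc_in_gpc_stride,
			       uint32_t gpc, uint32_t tpc)
{
	/* Register offset of TPC <tpc> within GPC <gpc>. */
	return gpc_stride * gpc + tpc_in_gpc_stride * tpc;
}

int main(void)
{
	uint32_t counter = 0xFFFFFFFEU;

	counter = wrapping_add_u32(counter, 5U);	/* wraps around to 3 */
	printf("offset=0x%x counter=%u\n",
	       (unsigned int)tpc_reg_offset(0x8000U, 0x800U, 1U, 2U),
	       (unsigned int)counter);
	return 0;
}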