gpu: nvgpu: define error_notifiers in common code

All the Linux specific error_notifier codes are defined in the Linux specific
header file <uapi/linux/nvgpu.h>, but they are used throughout the common
driver code.

Since they are defined in a Linux specific file, all uses of these
error_notifiers need to be restricted to Linux specific code.

Hence, define new error_notifiers in include/nvgpu/error_notifier.h and use
them in the common code.
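
As an illustrative sketch (not the verbatim header), the common codes could
look like below; only the codes touched by this change are listed, and the
numeric values are assumptions:

  /* include/nvgpu/error_notifier.h -- illustrative sketch */
  #define NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT		0
  #define NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY			1
  #define NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT		2
  #define NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY			3
  #define NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT		4
  #define NVGPU_ERR_NOTIFIER_PBDMA_ERROR			5
  #define NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD	6
  #define NVGPU_ERR_NOTIFIER_GR_EXCEPTION			7
  #define NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH	8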

Add a new API nvgpu_error_notifier_to_channel_notifier() to convert a common
error_notifier of the form NVGPU_ERR_NOTIFIER_* into the Linux specific error
notifier of the form NVGPU_CHANNEL_*.
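
A minimal sketch of the conversion helper; the real mapping has one case per
code, and the fallback behaviour shown here is an assumption:

  u32 nvgpu_error_notifier_to_channel_notifier(u32 error_notifier)
  {
  	switch (error_notifier) {
  	case NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT:
  		return NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT;
  	case NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY:
  		return NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY;
  	/* ... one case per NVGPU_ERR_NOTIFIER_* code ... */
  	}

  	pr_warn("invalid error notifier requested %u\n", error_notifier);
  	return error_notifier;
  }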

Any future addition of an error notifier requires updating both forms of the
error notifiers.

Move all error notifier related metadata from channel_gk20a (common code) to
the Linux specific structure nvgpu_channel_linux, and update all accesses to
this data to go through the new structure instead of channel_gk20a.
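
The moved fields are exactly the ones deleted from channel_gk20a in the diff
below; how they sit inside the Linux-only wrapper is sketched here, with the
back pointer to the common channel being an assumption:

  struct nvgpu_channel_linux {
  	struct channel_gk20a *ch;

  	struct dma_buf *error_notifier_ref;
  	struct nvgpu_notification *error_notifier;
  	void *error_notifier_va;
  	struct nvgpu_mutex error_notifier_mutex;
  };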

Move and rename the below APIs to a Linux specific file and declare them in
error_notifier.h (a sketch of the renamed wrapper follows this list):
  nvgpu_set_error_notifier_locked()
  nvgpu_set_error_notifier()
  nvgpu_is_error_notifier_set()
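
For example, nvgpu_set_error_notifier() can stay a thin wrapper around the
locked variant, just as the removed gk20a_set_error_notifier() was; the
os_priv hop used here to reach the Linux-only data is an assumption:

  void nvgpu_set_error_notifier(struct channel_gk20a *ch, u32 error)
  {
  	struct nvgpu_channel_linux *priv = ch->os_priv;

  	nvgpu_mutex_acquire(&priv->error_notifier_mutex);
  	/* the locked variant converts the NVGPU_ERR_NOTIFIER_* code to the
  	 * NVGPU_CHANNEL_* value before writing the notifier buffer */
  	nvgpu_set_error_notifier_locked(ch, error);
  	nvgpu_mutex_release(&priv->error_notifier_mutex);
  }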

Add the below new API and use it in fifo_vgpu.c (sketched below):
  nvgpu_set_error_notifier_if_empty()
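
A sketch of the if-empty variant, reusing the status convention visible in the
removed code below (status == 0xffff marks a notifier as already set); the
exact emptiness check is an assumption:

  void nvgpu_set_error_notifier_if_empty(struct channel_gk20a *ch, u32 error)
  {
  	struct nvgpu_channel_linux *priv = ch->os_priv;

  	nvgpu_mutex_acquire(&priv->error_notifier_mutex);
  	if (priv->error_notifier_ref) {
  		/* only write if no error notifier is pending */
  		if (priv->error_notifier->status != 0xffff)
  			nvgpu_set_error_notifier_locked(ch, error);
  	}
  	nvgpu_mutex_release(&priv->error_notifier_mutex);
  }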

Include <nvgpu/error_notifier.h> wherever the new error_notifier codes are
used.

NVGPU-426

Change-Id: Iaa5bfc150e6e9ec17d797d445c2d6407afe9f4bd
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1593361
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c

@@ -44,6 +44,7 @@
 #include <nvgpu/ltc.h>
 #include <nvgpu/barrier.h>
 #include <nvgpu/ctxsw_trace.h>
+#include <nvgpu/error_notifier.h>

 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
@@ -339,37 +340,6 @@ int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
 	return ret ? ret : g->ops.fifo.update_runlist(g, ch->runlist_id, ~0, true, true);
 }

-/**
- * gk20a_set_error_notifier_locked()
- * Should be called with ch->error_notifier_mutex held
- */
-void gk20a_set_error_notifier_locked(struct channel_gk20a *ch, __u32 error)
-{
-	if (ch->error_notifier_ref) {
-		struct timespec time_data;
-		u64 nsec;
-
-		getnstimeofday(&time_data);
-		nsec = ((u64)time_data.tv_sec) * 1000000000u +
-				(u64)time_data.tv_nsec;
-		ch->error_notifier->time_stamp.nanoseconds[0] =
-				(u32)nsec;
-		ch->error_notifier->time_stamp.nanoseconds[1] =
-				(u32)(nsec >> 32);
-		ch->error_notifier->info32 = error;
-		ch->error_notifier->status = 0xffff;
-
-		nvgpu_err(ch->g,
-			"error notifier set to %d for ch %d", error, ch->chid);
-	}
-}
-
-void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error)
-{
-	nvgpu_mutex_acquire(&ch->error_notifier_mutex);
-	gk20a_set_error_notifier_locked(ch, error);
-	nvgpu_mutex_release(&ch->error_notifier_mutex);
-}
-
 static void gk20a_wait_until_counter_is_N(
 	struct channel_gk20a *ch, nvgpu_atomic_t *counter, int wait_value,
 	struct nvgpu_cond *c, const char *caller, const char *counter_name)
@@ -1550,7 +1520,7 @@ static void gk20a_channel_timeout_handler(struct channel_gk20a *ch)
 		gk20a_gr_debug_dump(g);

 	g->ops.fifo.force_reset_ch(ch,
-		NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true);
+		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, true);
 }

 /**
@@ -2210,53 +2180,48 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	err = nvgpu_mutex_init(&c->ioctl_lock);
 	if (err)
 		return err;
-	err = nvgpu_mutex_init(&c->error_notifier_mutex);
-	if (err)
-		goto fail_1;
 	err = nvgpu_mutex_init(&c->joblist.cleanup_lock);
 	if (err)
-		goto fail_2;
+		goto fail_1;
 	err = nvgpu_mutex_init(&c->joblist.pre_alloc.read_lock);
 	if (err)
-		goto fail_3;
+		goto fail_2;
 	err = nvgpu_mutex_init(&c->sync_lock);
 	if (err)
-		goto fail_4;
+		goto fail_3;
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	err = nvgpu_mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
 	if (err)
-		goto fail_5;
+		goto fail_4;
 	err = nvgpu_mutex_init(&c->cs_client_mutex);
 	if (err)
-		goto fail_6;
+		goto fail_5;
 #endif
 	err = nvgpu_mutex_init(&c->event_id_list_lock);
 	if (err)
-		goto fail_7;
+		goto fail_6;
 	err = nvgpu_mutex_init(&c->dbg_s_lock);
 	if (err)
-		goto fail_8;
+		goto fail_7;
 	nvgpu_list_add(&c->free_chs, &g->fifo.free_chs);

 	return 0;

-fail_8:
-	nvgpu_mutex_destroy(&c->event_id_list_lock);
 fail_7:
+	nvgpu_mutex_destroy(&c->event_id_list_lock);
+fail_6:
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	nvgpu_mutex_destroy(&c->cs_client_mutex);
-fail_6:
-	nvgpu_mutex_destroy(&c->cyclestate.cyclestate_buffer_mutex);
 fail_5:
+	nvgpu_mutex_destroy(&c->cyclestate.cyclestate_buffer_mutex);
+fail_4:
 #endif
 	nvgpu_mutex_destroy(&c->sync_lock);
-fail_4:
-	nvgpu_mutex_destroy(&c->joblist.pre_alloc.read_lock);
 fail_3:
-	nvgpu_mutex_destroy(&c->joblist.cleanup_lock);
+	nvgpu_mutex_destroy(&c->joblist.pre_alloc.read_lock);
 fail_2:
-	nvgpu_mutex_destroy(&c->error_notifier_mutex);
+	nvgpu_mutex_destroy(&c->joblist.cleanup_lock);
 fail_1:
 	nvgpu_mutex_destroy(&c->ioctl_lock);

--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h

@@ -273,11 +273,6 @@ struct channel_gk20a {
 	bool timeout_debug_dump;
 	unsigned int timeslice_us;

-	struct dma_buf *error_notifier_ref;
-	struct nvgpu_notification *error_notifier;
-	void *error_notifier_va;
-	struct nvgpu_mutex error_notifier_mutex;
-
 	struct nvgpu_mutex sync_lock;
 	struct gk20a_channel_sync *sync;
@@ -335,8 +330,6 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 void gk20a_disable_channel(struct channel_gk20a *ch);
 void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt);
 void gk20a_channel_abort_clean_up(struct channel_gk20a *ch);
-void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error);
-void gk20a_set_error_notifier_locked(struct channel_gk20a *ch, __u32 error);
 void gk20a_channel_semaphore_wakeup(struct gk20a *g, bool post_events);
 int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 size,
 			 struct priv_cmd_entry *entry);

--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c

@@ -39,6 +39,7 @@
 #include <nvgpu/nvhost.h>
 #include <nvgpu/barrier.h>
 #include <nvgpu/ctxsw_trace.h>
+#include <nvgpu/error_notifier.h>

 #include "gk20a.h"
 #include "mm_gk20a.h"
@@ -557,7 +558,6 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 		nvgpu_mutex_destroy(&tsg->event_id_list_lock);

 	nvgpu_mutex_destroy(&c->ioctl_lock);
-	nvgpu_mutex_destroy(&c->error_notifier_mutex);
 	nvgpu_mutex_destroy(&c->joblist.cleanup_lock);
 	nvgpu_mutex_destroy(&c->joblist.pre_alloc.read_lock);
 	nvgpu_mutex_destroy(&c->sync_lock);
@@ -1339,14 +1339,10 @@ static bool gk20a_fifo_ch_timeout_debug_dump_state(struct gk20a *g,
 	if (!refch)
 		return verbose;

-	nvgpu_mutex_acquire(&refch->error_notifier_mutex);
-	if (refch->error_notifier_ref) {
-		u32 err = refch->error_notifier->info32;
-		if (err == NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT)
-			verbose = refch->timeout_debug_dump;
-	}
-	nvgpu_mutex_release(&refch->error_notifier_mutex);
+	if (nvgpu_is_error_notifier_set(refch,
+			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT))
+		verbose = refch->timeout_debug_dump;

 	return verbose;
 }
@@ -1400,8 +1396,8 @@ void gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 {
 	nvgpu_err(g,
 		"channel %d generated a mmu fault", refch->chid);
-	gk20a_set_error_notifier(refch,
-		NVGPU_CHANNEL_FIFO_ERROR_MMU_ERR_FLT);
+	nvgpu_set_error_notifier(refch,
+		NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
 }
void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
@@ -1939,7 +1935,7 @@ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
 		list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
 			if (gk20a_channel_get(ch_tsg)) {
-				gk20a_set_error_notifier(ch_tsg, err_code);
+				nvgpu_set_error_notifier(ch_tsg, err_code);
 				gk20a_channel_put(ch_tsg);
 			}
 		}
@@ -1947,7 +1943,7 @@ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
 		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
 		gk20a_fifo_recover_tsg(g, ch->tsgid, verbose);
 	} else {
-		gk20a_set_error_notifier(ch, err_code);
+		nvgpu_set_error_notifier(ch, err_code);
 		gk20a_fifo_recover_ch(g, ch->chid, verbose);
 	}
@@ -2108,8 +2104,8 @@ static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
 		*verbose = ch->timeout_debug_dump;
 		*ms = ch->timeout_accumulated_ms;
 		if (recover)
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+			nvgpu_set_error_notifier(ch,
+				NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 	gk20a_channel_put(ch);
 }
@@ -2170,8 +2166,8 @@ bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
 			gk20a_channel_put(ch);
 		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
 			if (gk20a_channel_get(ch)) {
-				gk20a_set_error_notifier(ch,
-					NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				nvgpu_set_error_notifier(ch,
+					NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 				*verbose |= ch->timeout_debug_dump;
 				gk20a_channel_put(ch);
 			}
@@ -2413,7 +2409,7 @@ unsigned int gk20a_fifo_handle_pbdma_intr_0(struct gk20a *g, u32 pbdma_id,
 			rc_type = RC_TYPE_PBDMA_FAULT;
 			nvgpu_err(g,
 				"semaphore acquire timeout!");
-			*error_notifier = NVGPU_CHANNEL_GR_SEMAPHORE_TIMEOUT;
+			*error_notifier = NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT;
 		}
 		*handled |= pbdma_intr_0_acquire_pending_f();
 	}
@@ -2431,7 +2427,7 @@ unsigned int gk20a_fifo_handle_pbdma_intr_0(struct gk20a *g, u32 pbdma_id,
 	if (pbdma_intr_0 & pbdma_intr_0_pbcrc_pending_f()) {
 		*error_notifier =
-			NVGPU_CHANNEL_PBDMA_PUSHBUFFER_CRC_MISMATCH;
+			NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH;
 		rc_type = RC_TYPE_PBDMA_FAULT;
 	}
@@ -2485,7 +2481,7 @@ static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
 		struct channel_gk20a *ch = &f->channel[id];

 		if (gk20a_channel_get(ch)) {
-			gk20a_set_error_notifier(ch, error_notifier);
+			nvgpu_set_error_notifier(ch, error_notifier);
 			gk20a_fifo_recover_ch(g, id, true);
 			gk20a_channel_put(ch);
 		}
@@ -2497,7 +2493,7 @@ static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
 		nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
 			if (gk20a_channel_get(ch)) {
-				gk20a_set_error_notifier(ch,
+				nvgpu_set_error_notifier(ch,
 					error_notifier);
 				gk20a_channel_put(ch);
 			}
@@ -2514,7 +2510,7 @@ u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f,
 	u32 pbdma_intr_1 = gk20a_readl(g, pbdma_intr_1_r(pbdma_id));
 	u32 handled = 0;
-	u32 error_notifier = NVGPU_CHANNEL_PBDMA_ERROR;
+	u32 error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
 	unsigned int rc_type = RC_TYPE_NO_RC;

 	if (pbdma_intr_0) {
@@ -2658,8 +2654,8 @@ void __locked_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
 		list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
 			if (!gk20a_channel_get(ch))
 				continue;
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+			nvgpu_set_error_notifier(ch,
+				NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 			gk20a_channel_put(ch);
 		}
 		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
@@ -2671,8 +2667,8 @@ void __locked_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
 			"preempt channel %d timeout", id);

 		if (gk20a_channel_get(ch)) {
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+			nvgpu_set_error_notifier(ch,
+				NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
 			gk20a_fifo_recover_ch(g, id, true);
 			gk20a_channel_put(ch);
 		}

--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c

@@ -40,6 +40,7 @@
 #include <nvgpu/barrier.h>
 #include <nvgpu/mm.h>
 #include <nvgpu/ctxsw_trace.h>
+#include <nvgpu/error_notifier.h>

 #include "gk20a.h"
 #include "gr_ctx_gk20a.h"
@@ -5113,14 +5114,14 @@ static void gk20a_gr_set_error_notifier(struct gk20a *g,
 			nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 			list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
 				if (gk20a_channel_get(ch_tsg)) {
-					gk20a_set_error_notifier(ch_tsg,
+					nvgpu_set_error_notifier(ch_tsg,
 						error_notifier);
 					gk20a_channel_put(ch_tsg);
 				}
 			}
 			nvgpu_rwsem_up_read(&tsg->ch_list_lock);
 		} else {
-			gk20a_set_error_notifier(ch, error_notifier);
+			nvgpu_set_error_notifier(ch, error_notifier);
 		}
 	}
 }
@@ -5130,7 +5131,7 @@ static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
 	gk20a_dbg_fn("");
 	gk20a_gr_set_error_notifier(g, isr_data,
-		NVGPU_CHANNEL_GR_SEMAPHORE_TIMEOUT);
+		NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT);
 	nvgpu_err(g,
 		"gr semaphore timeout");
 	return -EINVAL;
@@ -5141,7 +5142,7 @@ static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
 	gk20a_dbg_fn("");
 	gk20a_gr_set_error_notifier(g, isr_data,
-		NVGPU_CHANNEL_GR_ILLEGAL_NOTIFY);
+		NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
 	/* This is an unrecoverable error, reset is needed */
 	nvgpu_err(g,
 		"gr semaphore timeout");
@@ -5156,7 +5157,7 @@ static int gk20a_gr_handle_illegal_method(struct gk20a *g,
 		isr_data->data_lo);
 	if (ret) {
 		gk20a_gr_set_error_notifier(g, isr_data,
-			NVGPU_CHANNEL_GR_ILLEGAL_NOTIFY);
+			NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
 		nvgpu_err(g, "invalid method class 0x%08x"
 			", offset 0x%08x address 0x%08x",
 			isr_data->class_num, isr_data->offset, isr_data->addr);
@@ -5169,7 +5170,7 @@ static int gk20a_gr_handle_illegal_class(struct gk20a *g,
 	gk20a_dbg_fn("");
 	gk20a_gr_set_error_notifier(g, isr_data,
-		NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
+		NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
 	nvgpu_err(g,
 		"invalid class 0x%08x, offset 0x%08x",
 		isr_data->class_num, isr_data->offset);
@@ -5193,7 +5194,7 @@ int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
 	if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
 		gk20a_gr_set_error_notifier(g, isr_data,
-			NVGPU_CHANNEL_FECS_ERR_UNIMP_FIRMWARE_METHOD);
+			NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
 		nvgpu_err(g,
 			"firmware method error 0x%08x for offset 0x%04x",
 			gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
@@ -5215,7 +5216,7 @@ static int gk20a_gr_handle_class_error(struct gk20a *g,
 	gr_class_error =
 		gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
 	gk20a_gr_set_error_notifier(g, isr_data,
-		NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
+		NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
 	nvgpu_err(g, "class error 0x%08x, offset 0x%08x,"
 		"sub channel 0x%08x mme generated %d,"
 		" mme pc 0x%08xdata high %d priv status %d"
@@ -5244,7 +5245,7 @@ static int gk20a_gr_handle_firmware_method(struct gk20a *g,
 	gk20a_dbg_fn("");

 	gk20a_gr_set_error_notifier(g, isr_data,
-		NVGPU_CHANNEL_GR_ERROR_SW_NOTIFY);
+		NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
 	nvgpu_err(g,
 		"firmware method 0x%08x, offset 0x%08x for channel %u",
 		isr_data->class_num, isr_data->offset,
@@ -6024,7 +6025,7 @@ int gk20a_gr_isr(struct gk20a *g)
 		if (need_reset) {
 			nvgpu_err(g, "set gr exception notifier");
 			gk20a_gr_set_error_notifier(g, &isr_data,
-				NVGPU_CHANNEL_GR_EXCEPTION);
+				NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
 		}
 	}