gpu: nvgpu: Add NVGPU_CHANNEL_WDT flag

The channel watchdog (wdt) feature is now guarded by the NVGPU_CHANNEL_WDT
compile-time flag so that it can be compiled out for safety builds.
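
As a minimal sketch of the pattern (NVGPU_CHANNEL_WDT is the real flag; the
struct and function here are illustrative stand-ins, not nvgpu code):

    /* Watchdog state exists only when the build defines the flag,
     * e.g. via ccflags-y += -DNVGPU_CHANNEL_WDT. */
    struct channel_sketch {
    #ifdef NVGPU_CHANNEL_WDT
        int wdt_enabled;        /* watchdog state compiled in */
    #endif
        unsigned int chid;
    };

    static void channel_init_sketch(struct channel_sketch *ch)
    {
    #ifdef NVGPU_CHANNEL_WDT
        ch->wdt_enabled = 1;    /* default on, mirroring gk20a_open_new_channel() */
    #endif
        ch->chid = 0U;
    }

Every access to the guarded state sits under the same #ifdef, so a safety
build that omits the flag carries no watchdog code or data.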

Jira NVGPU-3012

Change-Id: I0ca54af9d7b1b8e01f4090442341eaaadca8e339
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2114480
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author:     Debarshi Dutta <ddutta@nvidia.com>
Date:       2019-05-08 14:52:51 +05:30
Committed:  mobile promotions
Commit:     1dea88c6c7 (parent bf561f38f7)

10 changed files with 76 additions and 19 deletions


@@ -27,6 +27,7 @@ endif
 ccflags-y += -DNVGPU_ENGINE
 ccflags-y += -DNVGPU_USERD
+ccflags-y += -DNVGPU_CHANNEL_WDT
 obj-$(CONFIG_GK20A) := nvgpu.o


@@ -31,6 +31,9 @@ NV_COMPONENT_CFLAGS += -DNVGPU_DEBUGGER
 # Enable USERD for safety build until we switch to user mode submits only
 NV_COMPONENT_CFLAGS += -DNVGPU_USERD
+# Enable Channel WDT for safety build until we switch to user mode submits only
+NV_COMPONENT_CFLAGS += -DNVGPU_CHANNEL_WDT
 # Enable iGPU LS PMU for safety build until devctl whitelisting is done
 NVGPU_LS_PMU := 1
 NV_COMPONENT_CFLAGS += -DNVGPU_LS_PMU


@@ -545,7 +545,10 @@ u32 nvgpu_ce_create_context(struct gk20a *g,
 		nvgpu_err(g, "ce: gk20a channel not available");
 		goto end;
 	}
+#ifdef NVGPU_CHANNEL_WDT
 	ce_ctx->ch->wdt.enabled = false;
+#endif
 	/* bind the channel to the vm */
 	err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);


@@ -724,10 +724,12 @@ struct nvgpu_channel *gk20a_open_new_channel(struct gk20a *g,
 	ch->ctxsw_timeout_debug_dump = true;
 	ch->unserviceable = false;
+#ifdef NVGPU_CHANNEL_WDT
 	/* init kernel watchdog timeout */
 	ch->wdt.enabled = true;
 	ch->wdt.limit_ms = g->ch_wdt_init_limit_ms;
 	ch->wdt.debug_dump = true;
+#endif
 	ch->obj_class = 0;
 	ch->subctx_id = 0;
@@ -1220,7 +1222,7 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
 	u32 gpfifo_size, gpfifo_entry_size;
 	u64 gpfifo_gpu_va;
 	int err = 0;
-	u64 pbdma_acquire_timeout;
+	u64 pbdma_acquire_timeout = 0ULL;
 	gpfifo_size = args->num_gpfifo_entries;
 	gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
@@ -1332,11 +1334,14 @@ int nvgpu_channel_setup_bind(struct nvgpu_channel *c,
 		}
 	}
-	if (!nvgpu_is_timeouts_enabled(c->g) || !c->wdt.enabled) {
-		pbdma_acquire_timeout = 0;
-	} else {
+#ifdef NVGPU_CHANNEL_WDT
+	if (c->wdt.enabled && nvgpu_is_timeouts_enabled(c->g)) {
 		pbdma_acquire_timeout = c->wdt.limit_ms;
 	}
+#else
+	if (nvgpu_is_timeouts_enabled(c->g)) {
+		pbdma_acquire_timeout = g->ch_wdt_init_limit_ms;
+	}
+#endif
 	err = g->ops.ramfc.setup(c, gpfifo_gpu_va,
 			c->gpfifo.entry_num, pbdma_acquire_timeout,
@@ -1510,6 +1515,8 @@ u32 nvgpu_channel_update_gpfifo_get_and_get_free_count(struct nvgpu_channel *ch)
 	return nvgpu_channel_get_gpfifo_free_count(ch);
 }
+#ifdef NVGPU_CHANNEL_WDT
 static void nvgpu_channel_wdt_init(struct nvgpu_channel *ch)
 {
 	struct gk20a *g = ch->g;
@@ -1755,6 +1762,8 @@ static void nvgpu_channel_poll_wdt(struct gk20a *g)
 	}
 }
+#endif
 static inline struct nvgpu_channel_worker *
 nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 {
@@ -1762,6 +1771,7 @@ nvgpu_channel_worker_from_worker(struct nvgpu_worker *worker)
 	((uintptr_t)worker - offsetof(struct nvgpu_channel_worker, worker));
 };
+#ifdef NVGPU_CHANNEL_WDT
 static void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
 {
@@ -1797,6 +1807,18 @@ static void nvgpu_channel_worker_poll_wakeup_post_process_item(
 		}
 	}
 }
+static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
+		struct nvgpu_worker *worker)
+{
+	struct nvgpu_channel_worker *ch_worker =
+		nvgpu_channel_worker_from_worker(worker);
+	return ch_worker->watchdog_interval;
+}
+#endif
 static void nvgpu_channel_worker_poll_wakeup_process_item(
 		struct nvgpu_list_node *work_item)
 {
@@ -1812,25 +1834,18 @@ static void nvgpu_channel_worker_poll_wakeup_process_item(
 	nvgpu_channel_put(ch);
 }
-static u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
-		struct nvgpu_worker *worker)
-{
-	struct nvgpu_channel_worker *ch_worker =
-		nvgpu_channel_worker_from_worker(worker);
-	return ch_worker->watchdog_interval;
-}
 static const struct nvgpu_worker_ops channel_worker_ops = {
+#ifdef NVGPU_CHANNEL_WDT
 	.pre_process = nvgpu_channel_worker_poll_init,
-	.wakeup_early_exit = NULL,
 	.wakeup_post_process =
 		nvgpu_channel_worker_poll_wakeup_post_process_item,
+	.wakeup_timeout =
+		nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
+#endif
+	.wakeup_early_exit = NULL,
 	.wakeup_process_item =
 		nvgpu_channel_worker_poll_wakeup_process_item,
 	.wakeup_condition = NULL,
-	.wakeup_timeout =
-		nvgpu_channel_worker_poll_wakeup_condition_get_timeout,
 };
 /**
@@ -1938,7 +1953,9 @@ int gk20a_channel_add_job(struct nvgpu_channel *c,
 	job->num_mapped_buffers = num_mapped_buffers;
 	job->mapped_buffers = mapped_buffers;
+#ifdef NVGPU_CHANNEL_WDT
 	nvgpu_channel_wdt_start(c);
+#endif
 	if (!pre_alloc_enabled) {
 		channel_gk20a_joblist_lock(c);
@@ -1985,7 +2002,9 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	struct nvgpu_channel_job *job;
 	struct gk20a *g;
 	bool job_finished = false;
+#ifdef NVGPU_CHANNEL_WDT
 	bool watchdog_on = false;
+#endif
 	c = nvgpu_channel_get(c);
 	if (c == NULL) {
@@ -2000,6 +2019,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	vm = c->vm;
 	g = c->g;
+#ifdef NVGPU_CHANNEL_WDT
 	/*
 	 * If !clean_all, we're in a condition where watchdog isn't supported
 	 * anyway (this would be a no-op).
@@ -2007,6 +2027,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 	if (clean_all) {
 		watchdog_on = nvgpu_channel_wdt_stop(c);
 	}
+#endif
 	/* Synchronize with abort cleanup that needs the jobs. */
 	nvgpu_mutex_acquire(&c->joblist.cleanup_lock);
@@ -2035,6 +2056,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 		completed = nvgpu_fence_is_expired(job->post_fence);
 		if (!completed) {
+#ifdef NVGPU_CHANNEL_WDT
 			/*
 			 * The watchdog eventually sees an updated gp_get if
 			 * something happened in this loop. A new job can have
@@ -2045,6 +2067,7 @@ void gk20a_channel_clean_up_jobs(struct nvgpu_channel *c,
 			if (clean_all && watchdog_on) {
 				nvgpu_channel_wdt_continue(c);
 			}
+#endif
 			break;
 		}
@@ -2298,7 +2321,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	nvgpu_spinlock_init(&c->ref_actions_lock);
 #endif
 	nvgpu_spinlock_init(&c->joblist.dynamic.lock);
+#ifdef NVGPU_CHANNEL_WDT
 	nvgpu_spinlock_init(&c->wdt.lock);
+#endif
 	nvgpu_init_list_node(&c->joblist.dynamic.jobs);
 	nvgpu_init_list_node(&c->dbg_s_list);
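
A note on the nvgpu_channel_setup_bind() hunk above: pbdma_acquire_timeout now
starts at 0ULL, meaning "acquire timeout disabled", so each branch of the
#ifdef only has to handle the enable case. A standalone sketch of that
selection, with the nvgpu state passed in as plain parameters (illustrative
signature, not the real one):

    static unsigned long long pick_acquire_timeout(int timeouts_enabled,
            int wdt_enabled, unsigned int wdt_limit_ms,
            unsigned int wdt_init_limit_ms)
    {
        unsigned long long pbdma_acquire_timeout = 0ULL; /* default: disabled */

    #ifdef NVGPU_CHANNEL_WDT
        (void)wdt_init_limit_ms;
        /* per-channel limit when the wdt is compiled in and enabled */
        if (wdt_enabled && timeouts_enabled)
            pbdma_acquire_timeout = wdt_limit_ms;
    #else
        (void)wdt_enabled;
        (void)wdt_limit_ms;
        /* without the wdt, fall back to the global init limit */
        if (timeouts_enabled)
            pbdma_acquire_timeout = wdt_init_limit_ms;
    #endif
        return pbdma_acquire_timeout;
    }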


@@ -402,11 +402,14 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 	 */
 	need_job_tracking = (flag_fence_wait ||
 			flag_fence_get ||
-			c->wdt.enabled ||
 			(nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE)
 			 && !c->deterministic) ||
 			!skip_buffer_refcounting);
+#ifdef NVGPU_CHANNEL_WDT
+	need_job_tracking = need_job_tracking || c->wdt.enabled;
+#endif
 	if (need_job_tracking) {
 		bool need_sync_framework = false;
@@ -439,9 +442,12 @@ static int nvgpu_submit_channel_gpfifo(struct nvgpu_channel *c,
 	 */
 	need_deferred_cleanup = !c->deterministic ||
 			need_sync_framework ||
-			c->wdt.enabled ||
 			!skip_buffer_refcounting;
+#ifdef NVGPU_CHANNEL_WDT
+	need_deferred_cleanup = need_deferred_cleanup || c->wdt.enabled;
+#endif
 	/*
 	 * For deterministic channels, we don't allow deferred clean_up
 	 * processing to occur. In cases we hit this, we fail the submit
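
Both submit-path hunks use the same shape: compute the flag-independent part
of the condition first, then OR the wdt term in under the guard, which keeps
the #ifdef out of the middle of a multi-line boolean expression. A minimal
sketch (illustrative names):

    static int job_tracking_needed(int fence_wait, int refcounting,
            int wdt_enabled)
    {
        int need = fence_wait || refcounting;

    #ifdef NVGPU_CHANNEL_WDT
        /* the wdt term exists only when the feature is compiled in */
        need = need || wdt_enabled;
    #else
        (void)wdt_enabled;
    #endif
        return need;
    }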


@@ -62,11 +62,15 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 {
 	nvgpu_tsg_set_error_notifier(g, tsg,
 		NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
+#ifdef NVGPU_CHANNEL_WDT
 	/*
 	 * Cancel all channels' wdt since ctxsw timeout might
 	 * trigger multiple watchdogs at a time
 	 */
 	nvgpu_channel_wdt_restart_all_channels(g);
+#endif
 	nvgpu_rc_fifo_recover(g, eng_bitmask, tsg->tsgid, true, true, debug_dump,
 		RC_TYPE_CTXSW_TIMEOUT);
 }


@@ -211,6 +211,8 @@ struct nvgpu_channel_joblist {
 	struct nvgpu_mutex cleanup_lock;
 };
+#ifdef NVGPU_CHANNEL_WDT
 struct nvgpu_channel_wdt {
 	/* lock protects the running timer state */
 	struct nvgpu_spinlock lock;
@@ -225,6 +227,8 @@ struct nvgpu_channel_wdt {
 	bool debug_dump;
 };
+#endif
 /*
  * Track refcount actions, saving their stack traces. This number specifies how
  * many most recent actions are stored in a buffer. Set to 0 to disable. 128
@@ -318,8 +322,10 @@ struct nvgpu_channel {
 	struct nvgpu_cond notifier_wq;
 	struct nvgpu_cond semaphore_wq;
+#ifdef NVGPU_CHANNEL_WDT
 	/* kernel watchdog to kill stuck jobs */
 	struct nvgpu_channel_wdt wdt;
+#endif
 	/* for job cleanup handling in the background worker */
 	struct nvgpu_list_node worker_item;


@@ -2075,9 +2075,12 @@ struct gk20a {
 	struct nvgpu_ltc *ltc;
 	struct nvgpu_channel_worker {
-		u32 watchdog_interval;
 		struct nvgpu_worker worker;
+#ifdef NVGPU_CHANNEL_WDT
+		u32 watchdog_interval;
 		struct nvgpu_timeout timeout;
+#endif
 	} channel_worker;
 	struct nvgpu_clk_arb_worker {
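
Because the guarded fields change the layout of struct nvgpu_channel_worker,
every translation unit must see the same definition of NVGPU_CHANNEL_WDT,
which is why the flag is set once in the build files rather than per source
file. A toy illustration of the layout difference (stand-in types, not the
real nvgpu ones):

    struct worker_sketch {
        int worker_state;                  /* always present */
    #ifdef NVGPU_CHANNEL_WDT
        unsigned int watchdog_interval;    /* wdt-only fields */
        long timeout_state;
    #endif
    };
    /* sizeof(struct worker_sketch) differs between the two builds, so
     * objects compiled with mismatched flags must never be linked. */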


@@ -1338,7 +1338,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 		goto err_get_gk20a_channel;
 	}
+#ifdef NVGPU_CHANNEL_WDT
 	ch->wdt.enabled = false;
+#endif
 	/* bind the channel to the vm */
 	err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch);


@@ -285,6 +285,7 @@ int gk20a_channel_free_cycle_stats_snapshot(struct nvgpu_channel *ch)
 static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 		struct nvgpu_channel_wdt_args *args)
 {
+#ifdef NVGPU_CHANNEL_WDT
 	u32 status = args->wdt_status & (NVGPU_IOCTL_CHANNEL_DISABLE_WDT |
 			NVGPU_IOCTL_CHANNEL_ENABLE_WDT);
@@ -302,6 +303,9 @@ static int gk20a_channel_set_wdt_status(struct nvgpu_channel *ch,
 		NVGPU_IOCTL_CHANNEL_WDT_FLAG_DISABLE_DUMP) == 0;
 	return 0;
+#else
+	return -EINVAL;
+#endif
 }
 static void gk20a_channel_free_error_notifiers(struct nvgpu_channel *ch)
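
With the flag compiled out, the wdt ioctl handler above fails with -EINVAL
instead of silently accepting the request, so userspace can detect that
watchdog control is unavailable. The stub pattern in isolation (a sketch;
only the -EINVAL contract comes from this change):

    #include <errno.h>

    static int set_wdt_status_sketch(unsigned int wdt_status)
    {
    #ifdef NVGPU_CHANNEL_WDT
        /* parse and apply wdt_status here, as the real handler does */
        (void)wdt_status;
        return 0;
    #else
        (void)wdt_status;
        return -EINVAL;    /* feature compiled out: reject, don't ignore */
    #endif
    }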