gpu: nvgpu: add safe channel id lookup

Add gk20a_channel_from_id() to retrieve a channel, given a raw channel
ID, with a reference taken (or NULL if the channel was dead). This makes
it harder to mistakenly use a channel that's dead and thus uncovers bugs
sooner. Convert code to use the new lookup when applicable; work remains
to convert complex uses where a ref should have been taken but hasn't.

The channel ID is also validated against FIFO_INVAL_CHANNEL_ID; NULL is
returned for such IDs. This is often useful and does not hurt when
unnecessary.

However, this does not prevent the case where a channel is closed and
reopened while someone still holds a stale channel number. In all such
conditions the caller should already hold a reference.

The only conditions where a channel can safely be looked up by an ID and
used without taking a reference are when initializing or deinitializing
the list of channels.

Jira NVGPU-1460

Change-Id: I0a30968d17c1e0784d315a676bbe69c03a73481c
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1955400
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry picked from commit 7df3d58750
in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/2008515
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Konsta Holtta
2018-11-13 15:36:19 +02:00
committed by mobile promotions
parent ed6e396090
commit 3794afbeb1
8 changed files with 59 additions and 39 deletions

View File

@@ -1172,7 +1172,7 @@ gk20a_refch_from_inst_ptr(struct gk20a *g, u64 inst_ptr)
struct channel_gk20a *ch;
u64 ch_inst_ptr;
ch = gk20a_channel_get(&f->channel[ci]);
ch = gk20a_channel_from_id(g, ci);
/* only alive channels are searched */
if (!ch) {
continue;
@@ -1959,9 +1959,9 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 chid, bool verbose, int rc_type)
gk20a_fifo_recover(g, engines, chid, false, true, verbose,
rc_type);
} else {
struct channel_gk20a *ch = &g->fifo.channel[chid];
struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
if (gk20a_channel_get(ch)) {
if (ch != NULL) {
gk20a_channel_abort(ch, false);
if (gk20a_fifo_error_ch(g, ch)) {
@@ -2710,9 +2710,9 @@ static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
id = fifo_pbdma_status_id_v(status);
if (fifo_pbdma_status_id_type_v(status)
== fifo_pbdma_status_id_type_chid_v()) {
struct channel_gk20a *ch = &f->channel[id];
struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
if (gk20a_channel_get(ch)) {
if (ch != NULL) {
g->ops.fifo.set_error_notifier(ch, error_notifier);
gk20a_fifo_recover_ch(g, id, true, RC_TYPE_PBDMA_FAULT);
gk20a_channel_put(ch);
@@ -2924,12 +2924,12 @@ void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, u32 id,
gk20a_fifo_recover_tsg(g, id, true,
RC_TYPE_PREEMPT_TIMEOUT);
} else {
struct channel_gk20a *ch = &g->fifo.channel[id];
struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
nvgpu_err(g,
"preempt channel %d timeout", id);
if (gk20a_channel_get(ch)) {
if (ch != NULL) {
g->ops.fifo.set_error_notifier(ch,
NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
gk20a_fifo_recover_ch(g, id, true,
@@ -4031,8 +4031,8 @@ void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
}
for (chid = 0; chid < f->num_channels; chid++) {
struct channel_gk20a *ch = &f->channel[chid];
if (gk20a_channel_get(ch)) {
struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
if (ch != NULL) {
ch_state[chid] =
nvgpu_kmalloc(g, sizeof(struct ch_state) +
ram_in_alloc_size_v());