linux-nvgpu/userspace/units/rc/nvgpu-rc.c
Sagar Kamble cf287a4ef5 gpu: nvgpu: retry tsg unbind if NEXT is set
The NEXT bit can remain set for a channel if its timeslice expires before
the scheduler clears it. When that happens, nvgpu fails the TSG unbind
and, in turn, nvrm_gpu fails the channel close. Checking the channel HW
state again after a short delay can then observe the NEXT bit cleared by
the scheduler.

Re-enable the TSG and return -EAGAIN to nvrm_gpu so that it retries the
unbind.
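
As a rough sketch of the flow (illustrative only: the struct and helper
names below, channel_hw_state, read_channel_hw_state() and enable_tsg(),
are placeholders for this note, not the actual nvgpu API):

    struct channel_hw_state {
            bool next;  /* NEXT bit from the channel's HW status */
    };

    /* Placeholder declarations for the illustration. */
    void read_channel_hw_state(struct nvgpu_channel *ch,
                    struct channel_hw_state *state);
    void enable_tsg(struct nvgpu_tsg *tsg);

    int tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
    {
            struct channel_hw_state state;

            read_channel_hw_state(ch, &state);
            if (state.next) {
                    /*
                     * Timeslice expired before the scheduler cleared
                     * NEXT. Re-enable the TSG and let nvrm_gpu retry
                     * the unbind; a later read should see NEXT cleared.
                     */
                    enable_tsg(tsg);
                    return -EAGAIN;
            }
            /* ... normal unbind sequence ... */
            return 0;
    }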

Bug 3144960

Change-Id: I35f417f02270e371a4e632986b73a00f8a4f921a
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2468391
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
2021-01-18 23:11:57 -08:00

/*
 * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <unistd.h>
#include <unit/unit.h>
#include <unit/io.h>
#include <nvgpu/types.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/hal_init.h>
#include <nvgpu/dma.h>
#include <nvgpu/posix/io.h>
#include <os/posix/os_posix.h>
#include <nvgpu/posix/posix-fault-injection.h>
#include <nvgpu/posix/posix-nvhost.h>
#include <nvgpu/posix/posix-channel.h>
#include <nvgpu/runlist.h>
#include <nvgpu/device.h>
#include <nvgpu/channel.h>
#include <nvgpu/rc.h>
#include <nvgpu/pbdma_status.h>
#include <nvgpu/error_notifier.h>
#include "../fifo/nvgpu-fifo-common.h"
#include "../fifo/nvgpu-fifo-gv11b.h"
#include "nvgpu-rc.h"

#define NV_PMC_BOOT_0_ARCHITECTURE_GV110        (0x00000015 << \
                                        NVGPU_GPU_ARCHITECTURE_SHIFT)
#define NV_PMC_BOOT_0_IMPLEMENTATION_B          0xB

#define assert(cond)    unit_assert(cond, goto done)
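
/* Stub HAL: report a fixed count of 8 SMs so gr init needs no real HW. */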
static u32 stub_gv11b_gr_init_get_no_of_sm(struct gk20a *g)
{
        return 8;
}

static struct nvgpu_channel *ch = NULL;
static struct nvgpu_tsg *tsg = NULL;
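
/*
 * Check that the posix channel's error notifier was set to the expected
 * error code with the "set" status (0xffff).
 */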
static int verify_error_notifier(struct nvgpu_channel *ch, u32 error_notifier)
{
        struct nvgpu_posix_channel *cp = ch->os_priv;

        if (cp == NULL) {
                return UNIT_FAIL;
        } else if (cp->err_notifier.error == error_notifier &&
                        cp->err_notifier.status == 0xffff) {
                return UNIT_SUCCESS;
        } else {
                return UNIT_FAIL;
        }
}
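
/* Reset the error notifier so stale values cannot leak between tests. */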
static void clear_error_notifier(struct nvgpu_channel *ch)
{
        struct nvgpu_posix_channel *cp = ch->os_priv;

        if (cp != NULL) {
                cp->err_notifier.error = 0U;
                cp->err_notifier.status = 0U;
        }
}
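
/*
 * Bring up a minimal gv11b register space plus FIFO and MM support, then
 * open a TSG and a channel and bind them together for the recovery tests
 * below.
 */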
int test_rc_init(struct unit_module *m, struct gk20a *g, void *args)
{
        int ret = 0;
        struct nvgpu_posix_channel *posix_channel = NULL;

        ret = test_fifo_setup_gv11b_reg_space(m, g);
        if (ret != 0) {
                unit_return_fail(m, "fifo reg_space failure");
        }

        ret = nvgpu_pd_cache_init(g);
        if (ret != 0) {
                unit_return_fail(m, "PD cache initialization failure");
        }

        nvgpu_device_init(g);

        g->ops.gr.init.get_no_of_sm = stub_gv11b_gr_init_get_no_of_sm;
        g->ops.ecc.ecc_init_support(g);
        g->ops.mm.init_mm_support(g);

        ret = nvgpu_fifo_init_support(g);
        nvgpu_assert(ret == 0);

        /* Do not allocate from vidmem */
        nvgpu_set_enabled(g, NVGPU_MM_UNIFIED_MEMORY, true);

        ret = nvgpu_runlist_setup_sw(g);
        nvgpu_assert(ret == 0);

        tsg = nvgpu_tsg_open(g, getpid());
        nvgpu_assert(tsg != NULL);

        ch = nvgpu_channel_open_new(g, NVGPU_INVALID_RUNLIST_ID, false,
                        getpid(), getpid());
        if (ch == NULL) {
                ret = UNIT_FAIL;
                unit_err(m, "failed channel open");
                goto clear_tsg;
        }

        posix_channel = malloc(sizeof(struct nvgpu_posix_channel));
        if (posix_channel == NULL) {
                ret = UNIT_FAIL;
                unit_err(m, "failed to allocate memory for posix channel");
                goto clear_channel;
        }
        ch->os_priv = posix_channel;

        ret = nvgpu_tsg_bind_channel(tsg, ch);
        if (ret != 0) {
                ret = UNIT_FAIL;
                unit_err(m, "failed to bind channel");
                goto clear_posix_channel;
        }

        return UNIT_SUCCESS;

clear_posix_channel:
        free(posix_channel);
        ch->os_priv = NULL;
clear_channel:
        nvgpu_channel_close(ch);
        ch = NULL;
clear_tsg:
        nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
        tsg = NULL;
        return ret;
}
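
/*
 * Tear down the objects created in test_rc_init: unbind and close the
 * channel, release the TSG, and remove FIFO support.
 */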
int test_rc_deinit(struct unit_module *m, struct gk20a *g, void *args)
{
        struct nvgpu_posix_channel *posix_channel = ch->os_priv;
        int ret = nvgpu_tsg_force_unbind_channel(tsg, ch);

        if (ret != 0) {
                ret = UNIT_FAIL;
                unit_err(m, "failed to force unbind channel");
        }

        if (ch != NULL && posix_channel != NULL) {
                free(posix_channel);
        }
        if (ch != NULL) {
                nvgpu_channel_close(ch);
        }
        if (tsg != NULL) {
                nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
        }
        if (g->fifo.remove_support) {
                g->fifo.remove_support(&g->fifo);
        }

        return ret;
}
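
/* Call nvgpu_rc_fifo_recover() with SW quiesce pending to cover the
 * quiesce-pending path. */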
int test_rc_fifo_recover(struct unit_module *m, struct gk20a *g, void *args)
{
        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);
        nvgpu_rc_fifo_recover(g, 0U, 0U, false, false, false, 0U);
        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}
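
/*
 * A ctxsw timeout on the TSG must set the FIFO idle-timeout error
 * notifier on the bound channel.
 */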
int test_rc_ctxsw_timeout(struct unit_module *m, struct gk20a *g, void *args)
{
        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);
        nvgpu_rc_ctxsw_timeout(g, 0U, tsg, false);
        g->sw_quiesce_pending = false;

        return verify_error_notifier(ch,
                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
}
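
/* Cover nvgpu_rc_runlist_update() with SW quiesce pending. */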
int test_rc_runlist_update(struct unit_module *m, struct gk20a *g, void *args)
{
        g->sw_quiesce_pending = true;
        nvgpu_rc_runlist_update(g, 0U);
        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}
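
/*
 * A preempt timeout on the TSG must set the FIFO idle-timeout error
 * notifier on the bound channel.
 */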
int test_rc_preempt_timeout(struct unit_module *m, struct gk20a *g, void *args)
{
        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);
        nvgpu_rc_preempt_timeout(g, tsg);
        g->sw_quiesce_pending = false;

        return verify_error_notifier(ch,
                        NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
}
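
/* Cover nvgpu_rc_gr_fault() for a TSG/channel pair with SW quiesce
 * pending. */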
int test_rc_gr_fault(struct unit_module *m, struct gk20a *g, void *args)
{
        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);
        nvgpu_rc_gr_fault(g, tsg, ch);
        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}
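
/* Cover nvgpu_rc_sched_error_bad_tsg() with SW quiesce pending. */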
int test_rc_sched_error_bad_tsg(struct unit_module *m, struct gk20a *g,
                void *args)
{
        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);
        nvgpu_rc_sched_error_bad_tsg(g);
        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}
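
/* Cover nvgpu_rc_tsg_and_related_engines() for a sched error with SW
 * quiesce pending. */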
int test_rc_tsg_and_related_engines(struct unit_module *m, struct gk20a *g,
                void *args)
{
        g->sw_quiesce_pending = true;
        nvgpu_rc_tsg_and_related_engines(g, tsg, false, RC_TYPE_SCHED_ERR);
        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}

#define F_RC_MMU_FAULT_ID_INVALID       0
#define F_RC_MMU_FAULT_ID_TYPE_TSG      1
#define F_RC_MMU_FAULT_ID_TYPE_NOT_TSG  2

static const char *f_rc_mmu_fault[] = {
        "id_invalid",
        "id_type_tsg",
        "id_type_not_tsg",
};
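
/*
 * Walk nvgpu_rc_mmu_fault() through three id/id_type combinations:
 * invalid id, TSG id with TSG id-type, and TSG id with unknown id-type.
 */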
int test_rc_mmu_fault(struct unit_module *m, struct gk20a *g, void *args)
{
        u32 branches;
        u32 id = NVGPU_INVALID_TSG_ID;
        u32 id_type = ID_TYPE_UNKNOWN;

        g->sw_quiesce_pending = true;
        clear_error_notifier(ch);

        for (branches = 0U; branches <= F_RC_MMU_FAULT_ID_TYPE_NOT_TSG;
                        branches++) {
                if (branches != F_RC_MMU_FAULT_ID_INVALID) {
                        id = tsg->tsgid;
                        id_type = ID_TYPE_UNKNOWN;
                }
                if (branches == F_RC_MMU_FAULT_ID_TYPE_TSG) {
                        id_type = ID_TYPE_TSG;
                }

                unit_info(m, "%s branch: %s\n", __func__,
                                f_rc_mmu_fault[branches]);
                nvgpu_rc_mmu_fault(g, 0U, id, id_type, RC_TYPE_MMU_FAULT,
                                NULL);
        }

        g->sw_quiesce_pending = false;

        return UNIT_SUCCESS;
}

#define F_RC_IS_CHSW_VALID_OR_SAVE      0U
#define F_RC_IS_CHSW_LOAD_OR_SWITCH     1U
#define F_RC_IS_CHSW_INVALID            2U

#define F_RC_ID_TYPE_TSG                0U
#define F_RC_ID_TYPE_CH                 1U
#define F_RC_ID_TYPE_INVALID            2U

#define F_RC_ID_TYPE_CH_NULL_CHANNEL    0U
#define F_RC_ID_TYPE_CH_NULL_TSG        1U
#define F_RC_ID_TYPE_CH_FULL            2U

static const char *f_rc_chsw[] = {
        "is_chsw_valid_or_save",
        "is_chsw_load_or_switch",
        "is_chsw_invalid",
};

static const char *f_rc_id_type[] = {
        "id_type_tsg",
        "id_type_ch",
        "id_type_invalid",
};

static const char *f_rc_id_ch_subbranch[] = {
        "null_channel",
        "null_tsg",
        "full",
};
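
/*
 * Populate the PBDMA status info for the requested chsw/id-type branch.
 * VALID/SAVE branches set the current id fields, LOAD/SWITCH branches set
 * the next id fields; the channel sub-branches pick an invalid channel id,
 * a channel without a TSG, or the fully bound channel.
 */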
static void set_pbdma_info_id_type(u32 chsw_branches,
                struct nvgpu_pbdma_status_info *info,
                struct nvgpu_channel *ch_without_tsg,
                u32 id_type_branches,
                u32 id_type_ch_branches)
{
        if (id_type_branches == F_RC_ID_TYPE_TSG) {
                info->id = (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                        tsg->tsgid : PBDMA_STATUS_ID_INVALID;
                info->id_type =
                        (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                        PBDMA_STATUS_ID_TYPE_TSGID :
                        PBDMA_STATUS_ID_TYPE_INVALID;
                info->next_id =
                        (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                        tsg->tsgid : PBDMA_STATUS_ID_INVALID;
                info->next_id_type =
                        (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                        PBDMA_STATUS_NEXT_ID_TYPE_TSGID :
                        PBDMA_STATUS_NEXT_ID_TYPE_INVALID;
        } else if (id_type_branches == F_RC_ID_TYPE_CH) {
                if (id_type_ch_branches == F_RC_ID_TYPE_CH_NULL_CHANNEL) {
                        info->id = NVGPU_INVALID_CHANNEL_ID;
                        info->id_type = PBDMA_STATUS_ID_TYPE_CHID;
                        info->next_id = NVGPU_INVALID_CHANNEL_ID;
                        info->next_id_type = PBDMA_STATUS_NEXT_ID_TYPE_CHID;
                } else if (id_type_ch_branches == F_RC_ID_TYPE_CH_NULL_TSG) {
                        /* Use ch_without_tsg for the NULL TSG branch */
                        info->id =
                                (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                                ch_without_tsg->chid : PBDMA_STATUS_ID_INVALID;
                        info->id_type =
                                (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                                PBDMA_STATUS_ID_TYPE_CHID :
                                PBDMA_STATUS_ID_TYPE_INVALID;
                        info->next_id =
                                (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                                ch_without_tsg->chid : PBDMA_STATUS_ID_INVALID;
                        info->next_id_type =
                                (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                                PBDMA_STATUS_NEXT_ID_TYPE_CHID :
                                PBDMA_STATUS_NEXT_ID_TYPE_INVALID;
                } else {
                        /* Use the fully bound ch for the full path */
                        info->id =
                                (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                                ch->chid : PBDMA_STATUS_ID_INVALID;
                        info->id_type =
                                (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) ?
                                PBDMA_STATUS_ID_TYPE_CHID :
                                PBDMA_STATUS_ID_TYPE_INVALID;
                        info->next_id =
                                (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                                ch->chid : PBDMA_STATUS_ID_INVALID;
                        info->next_id_type =
                                (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
                                PBDMA_STATUS_NEXT_ID_TYPE_CHID :
                                PBDMA_STATUS_NEXT_ID_TYPE_INVALID;
                }
        } else {
                info->id_type = PBDMA_STATUS_ID_TYPE_INVALID;
                info->next_id_type = PBDMA_STATUS_NEXT_ID_TYPE_INVALID;
        }
}
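
/*
 * Drive nvgpu_rc_pbdma_fault() through every chsw-status branch, and for
 * each one through the TSG, channel (invalid / TSG-less / bound) and
 * invalid id-type branches.
 */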
int test_rc_pbdma_fault(struct unit_module *m, struct gk20a *g, void *args)
{
        u32 chsw_branches, id_type_branches;
        u32 chsw_subbranch;
        struct nvgpu_channel *ch_without_tsg = NULL;

        ch_without_tsg = nvgpu_channel_open_new(g, NVGPU_INVALID_RUNLIST_ID,
                        false, getpid(), getpid());
        if (ch_without_tsg == NULL) {
                unit_err(m, "failed channel open");
                return UNIT_FAIL;
        }

        g->sw_quiesce_pending = true;

        for (chsw_branches = F_RC_IS_CHSW_VALID_OR_SAVE;
                        chsw_branches <= F_RC_IS_CHSW_INVALID;
                        chsw_branches++) {
                struct nvgpu_pbdma_status_info info = {0};

                if (chsw_branches == F_RC_IS_CHSW_INVALID) {
                        info.chsw_status = NVGPU_PBDMA_CHSW_STATUS_INVALID;
                        unit_info(m, "%s branch: %s\n", __func__,
                                        f_rc_chsw[chsw_branches]);
                        nvgpu_rc_pbdma_fault(g, 0U,
                                        NVGPU_ERR_NOTIFIER_PBDMA_ERROR,
                                        &info);
                        continue;
                }

                for (chsw_subbranch = 0U; chsw_subbranch < 2U;
                                chsw_subbranch++) {
                        if (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) {
                                info.chsw_status = (chsw_subbranch *
                                        NVGPU_PBDMA_CHSW_STATUS_VALID) +
                                        ((1U - chsw_subbranch) *
                                        NVGPU_PBDMA_CHSW_STATUS_SAVE);
                        } else {
                                info.chsw_status = (chsw_subbranch *
                                        NVGPU_PBDMA_CHSW_STATUS_LOAD) +
                                        ((1U - chsw_subbranch) *
                                        NVGPU_PBDMA_CHSW_STATUS_SWITCH);
                        }

                        for (id_type_branches = F_RC_ID_TYPE_TSG;
                                        id_type_branches <= F_RC_ID_TYPE_INVALID;
                                        id_type_branches++) {
                                u32 id_type_ch_sub_branches = 0U;

                                if (id_type_branches != F_RC_ID_TYPE_CH) {
                                        set_pbdma_info_id_type(chsw_branches,
                                                &info, ch_without_tsg,
                                                id_type_branches,
                                                id_type_ch_sub_branches);
                                        unit_info(m, "%s branch: %s - %s\n",
                                                __func__,
                                                f_rc_chsw[chsw_branches],
                                                f_rc_id_type[id_type_branches]);
                                        nvgpu_rc_pbdma_fault(g, 0U,
                                                NVGPU_ERR_NOTIFIER_PBDMA_ERROR,
                                                &info);
                                        continue;
                                }

                                for (id_type_ch_sub_branches =
                                                F_RC_ID_TYPE_CH_NULL_CHANNEL;
                                                id_type_ch_sub_branches <=
                                                F_RC_ID_TYPE_CH_FULL;
                                                id_type_ch_sub_branches++) {
                                        set_pbdma_info_id_type(chsw_branches,
                                                &info, ch_without_tsg,
                                                id_type_branches,
                                                id_type_ch_sub_branches);
                                        unit_info(m, "%s branch: %s - %s - %s\n",
                                                __func__,
                                                f_rc_chsw[chsw_branches],
                                                f_rc_id_type[id_type_branches],
                                                f_rc_id_ch_subbranch[id_type_ch_sub_branches]);
                                        nvgpu_rc_pbdma_fault(g, 0U,
                                                NVGPU_ERR_NOTIFIER_PBDMA_ERROR,
                                                &info);
                                }
                        }
                }
        }

        g->sw_quiesce_pending = false;
        nvgpu_channel_close(ch_without_tsg);

        return UNIT_SUCCESS;
}

struct unit_module_test nvgpu_rc_tests[] = {
        UNIT_TEST(rc_init, test_rc_init, NULL, 0),
        UNIT_TEST(rc_fifo_recover, test_rc_fifo_recover, NULL, 0),
        UNIT_TEST(rc_ctxsw_timeout, test_rc_ctxsw_timeout, NULL, 0),
        UNIT_TEST(rc_runlist_update, test_rc_runlist_update, NULL, 0),
        UNIT_TEST(rc_preempt_timeout, test_rc_preempt_timeout, NULL, 0),
        UNIT_TEST(rc_gr_fault, test_rc_gr_fault, NULL, 0),
        UNIT_TEST(rc_sched_error_bad_tsg, test_rc_sched_error_bad_tsg, NULL, 0),
        UNIT_TEST(rc_tsg_and_related_engines, test_rc_tsg_and_related_engines, NULL, 0),
        UNIT_TEST(rc_mmu_fault, test_rc_mmu_fault, NULL, 0),
        UNIT_TEST(rc_pbdma_fault, test_rc_pbdma_fault, NULL, 0),
        UNIT_TEST(rc_deinit, test_rc_deinit, NULL, 0),
};

UNIT_MODULE(nvgpu-rc, nvgpu_rc_tests, UNIT_PRIO_NVGPU_TEST);