From cce1d7ad845185df07ea2022e33ca6f87e12c394 Mon Sep 17 00:00:00 2001 From: Antony Clince Alex Date: Tue, 26 Oct 2021 10:09:20 +0000 Subject: [PATCH] gpu: nvgpu: update device management framework to remove unusable engines On certain platforms, not all copy engine instances are usable. The user shouldn't submit any work to these engines. To enforce this, remove these engines from active/host_engine list, this should ensure that these engines do not get advertised to userspace. In order to accomplish this introduce the following functions: - nvgpu_engine_remove_one_dev: This function removes the specified device entry from following device lists: fifo->host_engines, fifo->active_engines, runlist->rl_dev_list, runlist->eng_bitmask. Replace iteration over LCE device type entries using nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE), along with this introduce macro nvgpu_device_for_each_safe. Introduce gpu_dbg_ce flag for CE debugging. Bug 3370462 Change-Id: I2e21f18363c6e53630d129da241c8fece106cd33 Signed-off-by: Antony Clince Alex Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2616711 Reviewed-by: Seema Khowala Reviewed-by: Vaibhav Kachore Reviewed-by: mobile promotions Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-cert Tested-by: mobile promotions GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/common/ce/ce.c | 10 ++++ drivers/gpu/nvgpu/common/device.c | 7 +-- drivers/gpu/nvgpu/common/fifo/engines.c | 54 +++++++++++++++++-- drivers/gpu/nvgpu/hal/ce/ce_ga10b_fusa.c | 16 ++---- .../gpu/nvgpu/hal/fifo/engines_gp10b_fusa.c | 14 ++--- drivers/gpu/nvgpu/include/nvgpu/engines.h | 12 +++++ drivers/gpu/nvgpu/include/nvgpu/log_common.h | 1 + 7 files changed, 82 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/nvgpu/common/ce/ce.c b/drivers/gpu/nvgpu/common/ce/ce.c index 7debdff9f..928fa4c8f 100644 --- a/drivers/gpu/nvgpu/common/ce/ce.c +++ b/drivers/gpu/nvgpu/common/ce/ce.c @@ -37,6 +37,16 @@ int nvgpu_ce_init_support(struct gk20a *g) 
g->ops.ce.set_pce2lce_mapping(g); } + /* + * Bug 1895019 + * Each time PCE2LCE config is updated and if it happens to + * map a LCE which was previously unmapped, then ELCG would have turned + * off the clock to the unmapped LCE and when the LCE config is updated, + * a race occurs between the config update and ELCG turning on the clock + * to that LCE, this might result in LCE dropping the config update. + * To avoid such a race, each time PCE2LCE config is updated toggle + * resets for all LCEs. + */ err = nvgpu_mc_reset_devtype(g, NVGPU_DEVTYPE_LCE); if (err != 0) { nvgpu_err(g, "NVGPU_DEVTYPE_LCE reset failed"); diff --git a/drivers/gpu/nvgpu/common/device.c b/drivers/gpu/nvgpu/common/device.c index 690678df5..ba2f3de42 100644 --- a/drivers/gpu/nvgpu/common/device.c +++ b/drivers/gpu/nvgpu/common/device.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -277,10 +277,7 @@ static u32 nvgpu_device_do_get_copies(struct gk20a *g, } } - for (i = 0; i < nvgpu_device_count(g, NVGPU_DEVTYPE_LCE); i++) { - dev = nvgpu_device_get(g, NVGPU_DEVTYPE_LCE, i); - nvgpu_assert(dev != NULL); - + nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE) { if (async_only && dev->runlist_id == gr_dev->runlist_id) { /* It's a GRCE, skip it per async_only. */ diff --git a/drivers/gpu/nvgpu/common/fifo/engines.c b/drivers/gpu/nvgpu/common/fifo/engines.c index 61599ee99..877231f5c 100644 --- a/drivers/gpu/nvgpu/common/fifo/engines.c +++ b/drivers/gpu/nvgpu/common/fifo/engines.c @@ -170,10 +170,7 @@ u32 nvgpu_ce_engine_interrupt_mask(struct gk20a *g) /* * Now take care of LCEs. 
*/ - for (i = 0U; i < nvgpu_device_count(g, NVGPU_DEVTYPE_LCE); i++) { - dev = nvgpu_device_get(g, NVGPU_DEVTYPE_LCE, i); - nvgpu_assert(dev != NULL); - + nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE) { mask |= BIT32(dev->intr_id); } @@ -803,6 +800,55 @@ static int nvgpu_engine_init_one_dev(struct nvgpu_fifo *f, return 0; } +void nvgpu_engine_remove_one_dev(struct nvgpu_fifo *f, + const struct nvgpu_device *dev) +{ + u32 i, j; + struct gk20a *g = f->g; + + /* + * First remove the engine from fifo->host_engines list, for this, it + * suffices to set the entry corresponding to the dev->engine_id to + * NULL, this should prevent the entry from being used. + */ + f->host_engines[dev->engine_id] = NULL; +#if defined(CONFIG_NVGPU_NON_FUSA) + /* + * Remove the device from the runlist device list. + */ + f->runlists[dev->runlist_id]->rl_dev_list[dev->rleng_id] = NULL; + + /* + * Remove the engine id from runlist->eng_bitmask + */ + f->runlists[dev->runlist_id]->eng_bitmask &= (~BIT32(dev->engine_id)); +#endif + /* + * For fifo->active_engines, we have to figure out the index of the + * device to be removed and shift the remaining elements up to that + * index. + */ + for (i = 0U; i < f->num_engines; i++) { + if (f->active_engines[i] == dev) { + nvgpu_log(g, gpu_dbg_device, "deleting device with" + " engine_id(%d) from active_engines list", + dev->engine_id); + for (j = i; j < nvgpu_safe_sub_u32(f->num_engines, 1U); + j++) { + f->active_engines[j] = f->active_engines[ + nvgpu_safe_add_u32(j, 1U)]; + } + break; + } + } + /* + * Update f->num_engines if a device was removed from f->active_engines + * list. + */ + f->num_engines = (i < f->num_engines) ? 
+ nvgpu_safe_sub_u32(f->num_engines, 1U) : f->num_engines; +} + int nvgpu_engine_init_info(struct nvgpu_fifo *f) { int err; diff --git a/drivers/gpu/nvgpu/hal/ce/ce_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/ce/ce_ga10b_fusa.c index 888f50928..2e53334e8 100644 --- a/drivers/gpu/nvgpu/hal/ce/ce_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/ce/ce_ga10b_fusa.c @@ -131,15 +131,11 @@ static void ga10b_ce_intr_stall_nonstall_enable(struct gk20a *g, void ga10b_ce_init_hw(struct gk20a *g) { - u32 i = 0U; u32 nonstall_vectorid_tree[NVGPU_CIC_INTR_VECTORID_SIZE_MAX]; u32 num_nonstall_vectors = 0; + const struct nvgpu_device *dev; - for (i = 0U; i < nvgpu_device_count(g, NVGPU_DEVTYPE_LCE); i++) { - const struct nvgpu_device *dev = - nvgpu_device_get(g, NVGPU_DEVTYPE_LCE, i); - nvgpu_assert(dev != NULL); - + nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE) { /* * The intr_id in dev info is broken for non-stall interrupts * from grce0,1. Therefore, instead read the vectors from the @@ -163,13 +159,9 @@ void ga10b_ce_init_hw(struct gk20a *g) void ga10b_ce_intr_enable(struct gk20a *g, bool enable) { - u32 i = 0U; - - for (i = 0U; i < nvgpu_device_count(g, NVGPU_DEVTYPE_LCE); i++) { - const struct nvgpu_device *dev = - nvgpu_device_get(g, NVGPU_DEVTYPE_LCE, i); - nvgpu_assert(dev != NULL); + const struct nvgpu_device *dev; + nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE) { ga10b_ce_intr_stall_nonstall_enable(g, dev, enable); } } diff --git a/drivers/gpu/nvgpu/hal/fifo/engines_gp10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/engines_gp10b_fusa.c index 400571a17..c43998476 100644 --- a/drivers/gpu/nvgpu/hal/fifo/engines_gp10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/engines_gp10b_fusa.c @@ -34,19 +34,11 @@ int gp10b_engine_init_ce_info(struct nvgpu_fifo *f) { struct gk20a *g = f->g; - u32 i; bool found; + const struct nvgpu_device *dev; - for (i = 0; i < nvgpu_device_count(g, NVGPU_DEVTYPE_LCE); i++) { - const struct nvgpu_device *dev; - struct nvgpu_device *dev_rw; - - dev = nvgpu_device_get(g, 
NVGPU_DEVTYPE_LCE, i); - if (dev == NULL) { - nvgpu_err(g, "Failed to get LCE device %u", i); - return -EINVAL; - } - dev_rw = (struct nvgpu_device *)dev; + nvgpu_device_for_each(g, dev, NVGPU_DEVTYPE_LCE) { + struct nvgpu_device *dev_rw = (struct nvgpu_device *)dev; /* * vGPU consideration. Not present in older chips. See diff --git a/drivers/gpu/nvgpu/include/nvgpu/engines.h b/drivers/gpu/nvgpu/include/nvgpu/engines.h index 65a1b9a15..3b76c5472 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/engines.h +++ b/drivers/gpu/nvgpu/include/nvgpu/engines.h @@ -410,4 +410,16 @@ u32 nvgpu_engine_mmu_fault_id_to_veid(struct gk20a *g, u32 mmu_fault_id, */ void nvgpu_engine_mmu_fault_id_to_eng_ve_pbdma_id(struct gk20a *g, u32 mmu_fault_id, u32 *engine_id, u32 *veid, u32 *pbdma_id); +/** + * @brief Remove a device entry from engine list. + * + * @param f [in] The FIFO struct (struct nvgpu_fifo) holding the engine lists. + * @param dev [in] A device. + * + * Remove the device entry \a dev from fifo->host_engines, fifo->active_engines. + * The device entry is retained in g->devs->devlist_heads list to ensure device + * reset. + */ +void nvgpu_engine_remove_one_dev(struct nvgpu_fifo *f, + const struct nvgpu_device *dev); #endif /*NVGPU_ENGINE_H*/ diff --git a/drivers/gpu/nvgpu/include/nvgpu/log_common.h b/drivers/gpu/nvgpu/include/nvgpu/log_common.h index 7a898dbf5..b6a93d08a 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log_common.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log_common.h @@ -81,5 +81,6 @@ enum nvgpu_log_type { #define gpu_dbg_mm BIT(41) /* Memory management debugging. */ #define gpu_dbg_hwpm BIT(42) /* GPU HWPM. */ #define gpu_dbg_verbose BIT(43) /* More verbose logs. */ +#define gpu_dbg_ce BIT(44) /* Copy Engine debugging */ #endif