gpu: nvgpu: MIG mode selection at runtime

Add code to select the MIG mode at runtime and boot
the GPU with the selected MIG config.

To test MIG after the system boots:

1. Write the desired config id to mig_mode_config:
     echo x > /sys/devices/gpu.0/mig_mode_config for igpu
     echo x > /sys/devices/platform/14100000.pcie/pci0001:00/0001:00:00.0/0001:01:00.0/mig_mode_config for dgpu

2. Then run any nvgpu* tests or nvrm_gpu_info.
If the mig_mode needs to be changed, note down the supported
configs with "cat mig_mode_config_list" and reboot the system.

3. Follow steps 1 and 2.

example output:

"cat mig_mode_config" 2

"cat mig_mode_config_list"

+++++++++ Config list Start ++++++++++

 CONFIG_ID : 0 for CONFIG NAME : 2 GPU instances each with 4 GPCs

 CONFIG_ID : 1 for CONFIG NAME : 4 GPU instances each with 2 GPCs

 CONFIG_ID : 2 for CONFIG NAME : 7 GPU instances - 1 GPU instance with 2
GPCs + 6 GPU instances each with 1 GPC

 CONFIG_ID : 3 for CONFIG NAME : 5 GPU instances - 1 GPU instance with 4
GPCs + 4 GPU instances each with 1 GPC

 CONFIG_ID : 4 for CONFIG NAME : 4 GPU instances - 1 GPU instance with 2
GPCs + 2 GPU instances each with 1 GPC + 1 GPU instance with 4 GPCs

 CONFIG_ID : 5 for CONFIG NAME : 6 GPU instances - 2 GPU instances each
with 2 GPCs + 4 GPU instances each with 1 GPC

 CONFIG_ID : 6 for CONFIG NAME : 5 GPU instances -  1 GPU instance with
2 GPCs + 2 GPU instances each with 1 GPC + 2 GPU instances with 2 GPCs

 CONFIG_ID : 7 for CONFIG NAME : 5 GPU instances - 2 GPU instances each
with 2 GPCs + 1 GPC instance with 2 GPCs + 2 GPU instances with 1 GPC

 CONFIG_ID : 8 for CONFIG NAME : 5 GPU instances - 1 GPC instance with 2
GPCs + 2 GPU instances each with 1 GPC + 2 GPU instances each with 2
GPCs

 CONFIG_ID : 9 for CONFIG NAME : 1 GPU instance with 8 GPCs

++++++++++ Config list End +++++++++++

JIRA NVGPU-6633

Change-Id: I3e56f8c836e1ced8753a60f328da63916faa7696
Signed-off-by: dt <dt@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2522821
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
dt
2021-04-30 06:13:50 +00:00
committed by mobile promotions
parent e60d373fab
commit be507aea50
4 changed files with 37 additions and 89 deletions

View File

@@ -42,7 +42,6 @@ int nvgpu_init_gr_manager(struct gk20a *g)
g->mig.gpc_count = g->ops.priv_ring.get_gpc_count(g);
nvgpu_assert(g->mig.gpc_count > 0U);
g->mig.num_gpu_instances = 1U;
g->mig.current_gpu_instance_config_id = 0U;
g->mig.is_nongr_engine_sharable = false;
gpu_instance->gpu_instance_id = 0U;

View File

@@ -221,11 +221,6 @@ struct nvgpu_mig {
struct nvgpu_mutex gr_syspipe_lock;
/** Gpu instance configuration id. */
u32 current_gpu_instance_config_id;
/**
* current mig_instance_config.
*/
const struct nvgpu_mig_gpu_instance_config
*current_mig_gpu_instance_config;
/**
* Flag to indicate whether nonGR(CE) engine is sharable
* between gr syspipes or not.

View File

@@ -231,24 +231,6 @@ static char *nvgpu_pci_devnode_v2(struct device *dev, umode_t *mode)
dev_name(dev));
}
static char *nvgpu_mig_phys_devnode(struct device *dev, umode_t *mode)
{
struct nvgpu_cdev_class_priv_data *priv_data;
if (mode) {
*mode = S_IRUSR | S_IWUSR;
}
priv_data = dev_get_drvdata(dev);
if (priv_data->pci) {
return kasprintf(GFP_KERNEL, "nvgpu/dgpu-%s/%s",
dev_name(dev->parent), dev_name(dev));
}
return kasprintf(GFP_KERNEL, "nvgpu/igpu0/%s", dev_name(dev));
}
static char *nvgpu_mig_fgpu_devnode(struct device *dev, umode_t *mode)
{
struct nvgpu_cdev_class_priv_data *priv_data;
@@ -388,60 +370,19 @@ struct nvgpu_mig_static_info {
u32 minor_instance_id;
};
static const struct nvgpu_mig_static_info nvgpu_default_mig_static_info[] =
{
{
.instance_type = NVGPU_MIG_TYPE_PHYSICAL,
},
{
.instance_type = NVGPU_MIG_TYPE_MIG,
.major_instance_id = 0,
.minor_instance_id = 0,
},
{
.instance_type = NVGPU_MIG_TYPE_MIG,
.major_instance_id = 0,
.minor_instance_id = 1,
},
};
static const struct nvgpu_mig_static_info nvgpu_default_pci_mig_static_info[] =
{
{
.instance_type = NVGPU_MIG_TYPE_PHYSICAL,
},
{
.instance_type = NVGPU_MIG_TYPE_MIG,
.major_instance_id = 1,
.minor_instance_id = 0,
},
{
.instance_type = NVGPU_MIG_TYPE_MIG,
.major_instance_id = 2,
.minor_instance_id = 4,
},
};
static int nvgpu_prepare_mig_dev_node_class_list(struct gk20a *g, u32 *num_classes)
{
u32 class_count = 0U;
const struct nvgpu_mig_static_info *info;
struct nvgpu_class *class;
u32 i;
u32 num_instances;
struct nvgpu_cdev_class_priv_data *priv_data;
if (g->pci_class != 0U) {
info = &nvgpu_default_pci_mig_static_info[0];
num_instances = sizeof(nvgpu_default_pci_mig_static_info) /
sizeof(nvgpu_default_pci_mig_static_info[0]);
} else {
info = &nvgpu_default_mig_static_info[0];
num_instances = sizeof(nvgpu_default_mig_static_info) /
sizeof(nvgpu_default_mig_static_info[0]);
}
for (i = 0U; i < num_instances; i++) {
num_instances = g->mig.num_gpu_instances;
/*
* TODO: i=0 need to be added after ctrl node fixup.
*/
for (i = 1U; i < num_instances; i++) {
priv_data = nvgpu_kzalloc(g, sizeof(*priv_data));
if (priv_data == NULL) {
return -ENOMEM;
@@ -456,20 +397,16 @@ static int nvgpu_prepare_mig_dev_node_class_list(struct gk20a *g, u32 *num_class
kfree(priv_data);
return -ENOMEM;
}
class_count++;
class->class->devnode = nvgpu_mig_fgpu_devnode;
priv_data->major_instance_id = g->mig.gpu_instance[i].gpu_instance_id;
priv_data->minor_instance_id = g->mig.gpu_instance[i].gr_syspipe.gr_syspipe_id;
class->instance_type = NVGPU_MIG_TYPE_MIG;
if (info[i].instance_type == NVGPU_MIG_TYPE_PHYSICAL) {
class->class->devnode = nvgpu_mig_phys_devnode;
} else {
class->class->devnode = nvgpu_mig_fgpu_devnode;
}
priv_data->local_instance_id = i;
priv_data->major_instance_id = info[i].major_instance_id;
priv_data->minor_instance_id = info[i].minor_instance_id;
priv_data->pci = (g->pci_class != 0U);
class->priv_data = priv_data;
class->instance_type = info[i].instance_type;
priv_data->local_instance_id = i;
priv_data->pci = (g->pci_class != 0U);
}
*num_classes = class_count;
@@ -568,7 +505,7 @@ static int nvgpu_prepare_dev_node_class_list(struct gk20a *g, u32 *num_classes,
static bool check_valid_dev_node(struct gk20a *g, struct nvgpu_class *class,
const struct nvgpu_dev_node *node)
{
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_grmgr_is_multi_gr_enabled(g)) {
if ((class->instance_type == NVGPU_MIG_TYPE_PHYSICAL) &&
!node->mig_physical_node) {
return false;
@@ -584,7 +521,7 @@ static bool check_valid_class(struct gk20a *g, struct nvgpu_class *class)
return false;
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_grmgr_is_multi_gr_enabled(g)) {
if ((class->instance_type == NVGPU_MIG_TYPE_PHYSICAL)) {
return false;
}
@@ -681,7 +618,19 @@ int gk20a_user_init(struct device *dev)
}
num_cdevs = sizeof(dev_node_list) / sizeof(dev_node_list[0]);
total_cdevs = num_cdevs * num_classes;
if (nvgpu_grmgr_is_multi_gr_enabled(g)) {
/**
* As mig physical node needs the ctrl node only.
* We need to add total_cdevs + 1 when we enable ctrl node.
*/
total_cdevs = (num_cdevs - 1) * (num_classes - 1);
} else {
/*
* As the power node is already created, we need to
* reduced devs by by one.
*/
total_cdevs = (num_cdevs - 1) * num_classes;
}
err = alloc_chrdev_region(&devno, 0, total_cdevs, dev_name(dev));
if (err) {
@@ -742,7 +691,7 @@ u32 nvgpu_get_gpu_instance_id_from_cdev(struct gk20a *g, struct nvgpu_cdev *cdev
{
struct nvgpu_cdev_class_priv_data *priv_data;
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (nvgpu_grmgr_is_multi_gr_enabled(g)) {
priv_data = dev_get_drvdata(cdev->node);
return priv_data->local_instance_id;
}

View File

@@ -1086,15 +1086,20 @@ static ssize_t mig_mode_config_list_show(struct device *dev,
u32 num_config = 0;
struct gk20a *g = get_gk20a(dev);
const struct nvgpu_mig_gpu_instance_config *mig_gpu_instance_config;
char *power_on_string = "MIG list will be displayed after gpu power"
const char *power_on_string = "MIG list will be displayed after gpu power"
" on with default MIG mode \n Boot with config id zero\n"
" Get the available configs \n"
" Change the init script and reboot";
const char *error_on_nullconfig = "MIG list can't be displayed";
if (nvgpu_is_powered_on(g) &&
(g->mig.current_mig_gpu_instance_config != NULL)) {
if (nvgpu_is_powered_on(g)) {
mig_gpu_instance_config =
g->mig.current_mig_gpu_instance_config;
(g->ops.grmgr.get_mig_config_ptr != NULL) ?
g->ops.grmgr.get_mig_config_ptr(g) : NULL;
if (mig_gpu_instance_config == NULL) {
res += sprintf(&buf[res], "%s", error_on_nullconfig);
return res;
}
} else {
res += sprintf(&buf[res], "%s", power_on_string);
return res;