gpu: nvgpu: ga10b: Use active ltcs count for cbc init

This patch fixes a bug in the CBC initialization code for ga10b, where it was erroneously assumed, for historical reasons, that a fixed LTC count of one should be used. For Volta and later, the full LTC count should be used in CBC-related computation.

Ensure that:
- the CBC base address is 64K aligned
- the CBC start address lies within the CBC allocated memory

Perform the "CBC is marked safe" check only on silicon platforms.

Bug 3353418

Change-Id: I5edee2a78dc9e8c149e111a9f088a57e0154f5c2
Signed-off-by: Vedashree Vidwans <vvidwans@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2585778
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2021-08-25 21:58:21 -07:00
parent cc7b048641
commit a3e2283cf2
13 changed files with 113 additions and 67 deletions
--- a/drivers/gpu/nvgpu/hal/cbc/cbc_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/cbc/cbc_ga10b.c
@@ -33,17 +33,6 @@

 #include <nvgpu/hw/ga10b/hw_ltc_ga10b.h>

-u64 ga10b_cbc_get_base_divisor(struct gk20a *g)
-{
-	/*
-	 * For Tegra, the addressing works differently. Unlike DGPU, all
-	 * partitions talk to the same memory.
-	 */
-	u64 ltc_count = 1ULL;
-
-	return ltc_count << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
-}
-
 int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)
 {
 	/*
@@ -83,11 +72,7 @@ int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)
 		ltc_ltcs_ltss_cbc_param_bytes_per_comptagline_per_slice_v(
 			nvgpu_readl(g, ltc_ltcs_ltss_cbc_param_r()));

-	/*
-	 * For Tegra, the addressing works differently. Unlike DGPU, all
-	 * partitions talk to the same memory.
-	 */
-	u32 ltc_count = 1U;
+	u64 base_divisor = 0ULL;

 	/* check if vidmem is present */
 	bool alloc_vidmem = g->ops.fb.get_vidmem_size != NULL ? true : false;
@@ -111,7 +96,8 @@ int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)
 	/* Memory required for comptag lines in all slices of all ltcs */
 	compbit_backing_size =  nvgpu_safe_mult_u32(
 		nvgpu_safe_mult_u32(max_comptag_lines,
-			nvgpu_ltc_get_slices_per_ltc(g)), ltc_count);
+			nvgpu_ltc_get_slices_per_ltc(g)),
+		nvgpu_ltc_get_ltc_count(g));

 	/* Total memory required for compstatus */
 	compbit_backing_size =  nvgpu_safe_mult_u32(
@@ -119,13 +105,41 @@ int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)
 			gobs_per_comptagline_per_slice), compstatus_per_gob);

 	/* aligned to 2KB * ltc_count */
-	compbit_backing_size += nvgpu_safe_cast_u64_to_u32(
-		g->ops.cbc.get_base_divisor(g));
+	g->ops.fb.cbc_get_alignment(g, &base_divisor, NULL);
+	compbit_backing_size = nvgpu_safe_cast_u64_to_u32(
+		nvgpu_safe_add_u64(compbit_backing_size, base_divisor));

 	/* must be a multiple of 64KB */
 	compbit_backing_size = round_up(compbit_backing_size, SZ_64K);

-	err = nvgpu_cbc_alloc(g, compbit_backing_size, alloc_vidmem);
+	/*
+	 * Address calculation for CBC applies swizzle to the lower 16 bits
+	 * of physical address. So, CBC start and end address should be 64KB
+	 * aligned.
+	 * Memory allocated is aligned corresponding to PAGE_SIZE and can be
+	 * seen as:
+	 *
+	 * ------------------------ Allocated physical memory end address
+	 * ^     -------------- 64KB aligned CBC end address
+	 * |             ^
+	 * | allocated   |
+	 * | physical    |
+	 * | address     | CBC occupied
+	 * | space       | address space
+	 * |             |
+	 * |             v
+	 * v     -------------- 64KB aligned CBC start address
+	 * ------------------------ Allocated physical memory start address
+	 *
+	 * With PAGE_SIZE other than 64KB, the physical memory start address
+	 * may not be 64KB aligned. So, choose the CBC start address to be the
+	 * lowest 64KB-aligned address within the allocated memory.
+	 * However, offsetting the start address would push the end of the CBC
+	 * memory beyond the allocated space. Hence, request 64KB of additional
+	 * memory to accommodate the offset.
+	 */
+
+	err = nvgpu_cbc_alloc(g, (compbit_backing_size + SZ_64K), alloc_vidmem);
 	if (err != 0) {
 		return err;
 	}
@@ -144,8 +158,6 @@ int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)

 	nvgpu_log(g, gpu_dbg_info | gpu_dbg_pte, "supported LTCs: 0x%x",
 		nvgpu_ltc_get_ltc_count(g));
-	nvgpu_log(g, gpu_dbg_info | gpu_dbg_pte,
-		"ltc_count used for calculations: 0x%x", ltc_count);
 	nvgpu_log(g, gpu_dbg_info | gpu_dbg_pte,
 		"compbit backing store size : 0x%x", compbit_backing_size);
 	nvgpu_log(g, gpu_dbg_info | gpu_dbg_pte,
--- a/drivers/gpu/nvgpu/hal/cbc/cbc_ga10b.h
+++ b/drivers/gpu/nvgpu/hal/cbc/cbc_ga10b.h
@@ -30,7 +30,6 @@
 struct gk20a;
 struct nvgpu_cbc;

-u64 ga10b_cbc_get_base_divisor(struct gk20a *g);
 int ga10b_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc);
 void ga10b_cbc_init(struct gk20a *g, struct nvgpu_cbc *cbc);

--- a/drivers/gpu/nvgpu/hal/cbc/cbc_tu104.c
+++ b/drivers/gpu/nvgpu/hal/cbc/cbc_tu104.c
@@ -1,7 +1,7 @@
 /*
 * TU104 CBC
 *
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -34,13 +34,6 @@

 #include "cbc_tu104.h"

-
-u64 tu104_cbc_get_base_divisor(struct gk20a *g)
-{
-	return (u64)nvgpu_ltc_get_ltc_count(g) <<
-		       ltc_ltcs_ltss_cbc_base_alignment_shift_v();
-}
-
 int tu104_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc)
 {
 	/* max memory size (MB) to cover */
--- a/drivers/gpu/nvgpu/hal/cbc/cbc_tu104.h
+++ b/drivers/gpu/nvgpu/hal/cbc/cbc_tu104.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -31,7 +31,6 @@ enum nvgpu_cbc_op;
 struct gk20a;
 struct nvgpu_cbc;

-u64 tu104_cbc_get_base_divisor(struct gk20a *g);
 int tu104_cbc_alloc_comptags(struct gk20a *g, struct nvgpu_cbc *cbc);
 int tu104_cbc_ctrl(struct gk20a *g, enum nvgpu_cbc_op op,
 		       u32 min, u32 max);
--- a/drivers/gpu/nvgpu/hal/fb/fb_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/fb/fb_ga10b.c
@@ -38,21 +38,23 @@
 #ifdef CONFIG_NVGPU_COMPRESSION
 void ga10b_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc)
 {
-	u64 compbit_store_base;
-	u64 compbit_store_pa;
-	u64 combit_top_size;
-	u64 combit_top;
-	u32 cbc_max_rval;
-	/* Unlike dgpu, partition swizzling is disabled for ga10b */
-	u32 num_swizzled_ltcs = 1U;
+	u64 base_divisor = 0ULL;
+	u64 top_divisor = 0ULL;
+	u64 compbit_store_base = 0ULL;
+	u64 compbit_start_pa = 0ULL;
+	u64 compbit_store_pa = 0ULL;
+	u64 combit_top_size = 0ULL;
+	u64 combit_top = 0ULL;
+	u32 cbc_max_rval = 0U;
+
+	g->ops.fb.cbc_get_alignment(g, &base_divisor, &top_divisor);

 	/*
 	 * Update CBC registers
 	 * Note: CBC Base value should be updated after CBC MAX
 	 */
 	combit_top_size = cbc->compbit_backing_size;
-	combit_top = (combit_top_size / num_swizzled_ltcs) >>
-				fb_mmu_cbc_top_alignment_shift_v();
+	combit_top = combit_top_size / top_divisor;
 	nvgpu_assert(combit_top < U64(U32_MAX));
 	nvgpu_writel(g, fb_mmu_cbc_top_r(),
 		fb_mmu_cbc_top_size_f(u64_lo32(combit_top)));
@@ -64,31 +66,44 @@ void ga10b_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc)
 	nvgpu_writel(g, fb_mmu_cbc_max_r(), cbc_max_rval);

 	compbit_store_pa = nvgpu_mem_get_addr(g, &cbc->compbit_store.mem);
-	compbit_store_base = (compbit_store_pa / num_swizzled_ltcs) >>
-				fb_mmu_cbc_base_alignment_shift_v();
+	/* must be a multiple of 64KB within allocated memory */
+	compbit_store_base = round_up(compbit_store_pa, SZ_64K);
+	/* Calculate post-divide cbc address */
+	compbit_store_base = compbit_store_base / base_divisor;
+
+	/*
+	 * CBC start address is calculated from the CBC_BASE register value
+	 * Check that CBC start address lies within cbc allocated memory.
+	 */
+	compbit_start_pa = compbit_store_base * base_divisor;
+	nvgpu_assert(compbit_start_pa >= compbit_store_pa);
+
 	nvgpu_assert(compbit_store_base < U64(U32_MAX));
 	nvgpu_writel(g, fb_mmu_cbc_base_r(),
 		fb_mmu_cbc_base_address_f(u64_lo32(compbit_store_base)));

+	if (nvgpu_platform_is_silicon(g)) {
+		/* Make sure cbc is marked safe by MMU */
+		cbc_max_rval = nvgpu_readl(g, fb_mmu_cbc_max_r());
+		if ((cbc_max_rval & fb_mmu_cbc_max_safe_m()) !=
+			fb_mmu_cbc_max_safe_true_f()) {
+			nvgpu_err(g,
+				"CBC marked unsafe by MMU, check cbc config");
+		}
+	}
+
+	cbc->compbit_store.base_hw = compbit_store_base;
+
 	nvgpu_log(g, gpu_dbg_info | gpu_dbg_map_v | gpu_dbg_pte,
 		"compbit top size: 0x%x,%08x \n",
 		(u32)(combit_top_size >> 32),
 		(u32)(combit_top_size & 0xffffffffU));

 	nvgpu_log(g, gpu_dbg_info | gpu_dbg_map_v | gpu_dbg_pte,
-		"compbit base.pa: 0x%x,%08x cbc_base:0x%llx\n",
+		"compbit mem.pa: 0x%x,%08x cbc_base:0x%llx\n",
 		(u32)(compbit_store_pa >> 32),
 		(u32)(compbit_store_pa & 0xffffffffU),
 		compbit_store_base);
-
-	/* Make sure cbc is marked safe by MMU */
-	cbc_max_rval = nvgpu_readl(g, fb_mmu_cbc_max_r());
-	if ((cbc_max_rval & fb_mmu_cbc_max_safe_m()) !=
-		fb_mmu_cbc_max_safe_true_f()) {
-		nvgpu_err(g, "CBC marked unsafe by MMU, check cbc config");
-	}
-
-	cbc->compbit_store.base_hw = compbit_store_base;
 }
 #endif

--- a/drivers/gpu/nvgpu/hal/fb/fb_tu104.c
+++ b/drivers/gpu/nvgpu/hal/fb/fb_tu104.c
@@ -104,9 +104,26 @@ int fb_tu104_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb)
 }

 #ifdef CONFIG_NVGPU_COMPRESSION
+void tu104_fb_cbc_get_alignment(struct gk20a *g,
+		u64 *base_divisor, u64 *top_divisor)
+{
+	u64 ltc_count = (u64)nvgpu_ltc_get_ltc_count(g);
+
+	if (base_divisor != NULL) {
+		*base_divisor =
+			ltc_count << fb_mmu_cbc_base_alignment_shift_v();
+	}
+
+	if (top_divisor != NULL) {
+		*top_divisor =
+			ltc_count << fb_mmu_cbc_top_alignment_shift_v();
+	}
+}
+
 void tu104_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc)
 {
 	u64 base_divisor;
+	u64 top_divisor;
 	u64 compbit_store_base;
 	u64 compbit_store_pa;
 	u64 cbc_start_addr, cbc_end_addr;
@@ -114,17 +131,14 @@ void tu104_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc)
 	u64 cbc_top_size;
 	u32 cbc_max;

+	g->ops.fb.cbc_get_alignment(g, &base_divisor, &top_divisor);
 	compbit_store_pa = nvgpu_mem_get_addr(g, &cbc->compbit_store.mem);
-	base_divisor = g->ops.cbc.get_base_divisor(g);
 	compbit_store_base = DIV_ROUND_UP(compbit_store_pa, base_divisor);

-	cbc_start_addr = (u64)nvgpu_ltc_get_ltc_count(g) *
-			(compbit_store_base <<
-			 fb_mmu_cbc_base_address_alignment_shift_v());
+	cbc_start_addr = compbit_store_base * base_divisor;
 	cbc_end_addr = cbc_start_addr + cbc->compbit_backing_size;

-	cbc_top = (cbc_end_addr / nvgpu_ltc_get_ltc_count(g)) >>
-		  fb_mmu_cbc_base_address_alignment_shift_v();
+	cbc_top = (cbc_end_addr / top_divisor);
 	cbc_top_size = u64_lo32(cbc_top) - compbit_store_base;

 	nvgpu_assert(cbc_top_size < U64(U32_MAX));
--- a/drivers/gpu/nvgpu/hal/fb/fb_tu104.h
+++ b/drivers/gpu/nvgpu/hal/fb/fb_tu104.h
@@ -31,6 +31,9 @@ struct nvgpu_mem;
 int  fb_tu104_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb);
 #ifdef CONFIG_NVGPU_COMPRESSION
 struct nvgpu_cbc;
+
+void tu104_fb_cbc_get_alignment(struct gk20a *g,
+		u64 *base_divisor, u64 *top_divisor);
 void tu104_fb_cbc_configure(struct gk20a *g, struct nvgpu_cbc *cbc);
 #endif
 int  tu104_fb_apply_pdb_cache_errata(struct gk20a *g);
--- a/drivers/gpu/nvgpu/hal/init/hal_ga100.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_ga100.c
@@ -403,7 +403,6 @@ static const struct gops_cbc ga100_ops_cbc = {
 	.cbc_init_support = nvgpu_cbc_init_support,
 	.cbc_remove_support = nvgpu_cbc_remove_support,
 	.init = tu104_cbc_init,
-	.get_base_divisor = tu104_cbc_get_base_divisor,
 	.alloc_comptags = ga100_cbc_alloc_comptags,
 	.ctrl = tu104_cbc_ctrl,
 	.fix_config = NULL,
@@ -882,6 +881,7 @@ static const struct gops_fb ga100_ops_fb = {
 #ifdef CONFIG_NVGPU_COMPRESSION
 	.is_comptagline_mode_enabled = ga100_fb_is_comptagline_mode_enabled,
 	.cbc_configure = tu104_fb_cbc_configure,
+	.cbc_get_alignment = tu104_fb_cbc_get_alignment,
 	.set_use_full_comp_tag_line = gm20b_fb_set_use_full_comp_tag_line,
 	.compression_page_size = ga100_fb_compression_page_size,
 	.compressible_page_size = gp10b_fb_compressible_page_size,
--- a/drivers/gpu/nvgpu/hal/init/hal_ga10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_ga10b.c
@@ -366,7 +366,6 @@ static const struct gops_cbc ga10b_ops_cbc = {
 	.cbc_init_support = nvgpu_cbc_init_support,
 	.cbc_remove_support = nvgpu_cbc_remove_support,
 	.init = ga10b_cbc_init,
-	.get_base_divisor = ga10b_cbc_get_base_divisor,
 	.alloc_comptags = ga10b_cbc_alloc_comptags,
 	.ctrl = tu104_cbc_ctrl,
 };
@@ -871,6 +870,7 @@ static const struct gops_fb ga10b_ops_fb = {
 	.mmu_debug_rd = gm20b_fb_mmu_debug_rd,
 #ifdef CONFIG_NVGPU_COMPRESSION
 	.cbc_configure = ga10b_fb_cbc_configure,
+	.cbc_get_alignment = tu104_fb_cbc_get_alignment,
 	.set_use_full_comp_tag_line = NULL,
 	.compression_page_size = gp10b_fb_compression_page_size,
 	.compressible_page_size = gp10b_fb_compressible_page_size,
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -341,7 +341,6 @@ static const struct gops_cbc tu104_ops_cbc = {
 	.cbc_init_support = nvgpu_cbc_init_support,
 	.cbc_remove_support = nvgpu_cbc_remove_support,
 	.init = tu104_cbc_init,
-	.get_base_divisor = tu104_cbc_get_base_divisor,
 	.alloc_comptags = tu104_cbc_alloc_comptags,
 	.ctrl = tu104_cbc_ctrl,
 	.fix_config = NULL,
@@ -797,6 +796,7 @@ static const struct gops_fb tu104_ops_fb = {
 	.mmu_debug_rd = gm20b_fb_mmu_debug_rd,
 #ifdef CONFIG_NVGPU_COMPRESSION
 	.cbc_configure = tu104_fb_cbc_configure,
+	.cbc_get_alignment = tu104_fb_cbc_get_alignment,
 	.set_use_full_comp_tag_line = gm20b_fb_set_use_full_comp_tag_line,
 	.compression_page_size = gp10b_fb_compression_page_size,
 	.compressible_page_size = gp10b_fb_compressible_page_size,
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/cbc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/cbc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -27,7 +27,6 @@ struct gops_cbc {
 	int (*cbc_init_support)(struct gk20a *g);
 	void (*cbc_remove_support)(struct gk20a *g);
 	void (*init)(struct gk20a *g, struct nvgpu_cbc *cbc);
-	u64 (*get_base_divisor)(struct gk20a *g);
 	int (*alloc_comptags)(struct gk20a *g,
 				struct nvgpu_cbc *cbc);
 	int (*ctrl)(struct gk20a *g, enum nvgpu_cbc_op op,
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/fb.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/fb.h
@@ -440,6 +440,18 @@ struct gops_fb {
 							(struct gk20a *g);
 #ifdef CONFIG_NVGPU_COMPRESSION
 	void (*cbc_configure)(struct gk20a *g, struct nvgpu_cbc *cbc);
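+	/**
+	 * @brief Get cbc base and top alignment specs.
+	 *
+	 * @param g [in]		Pointer to GPU driver struct.
+	 * @param base_divisor [out]	CBC base divisor; may be NULL if unused.
+	 * @param top_divisor [out]	CBC top divisor; may be NULL if unused.
+	 *
+	 * Calculates and returns required CBC base and top alignment values.
+	 * @return None.
+	 */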
+	void (*cbc_get_alignment)(struct gk20a *g,
+		u64 *base_divisor, u64 *top_divisor);
 	bool (*set_use_full_comp_tag_line)(struct gk20a *g);

 	/*
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_fb_tu104.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_fb_tu104.h
@@ -687,13 +687,13 @@
 #define fb_mmu_num_active_ltcs_count_f(v)               ((U32(v) & 0x1fU) << 0U)
 #define fb_mmu_num_active_ltcs_count_v(r)                  (((r) >> 0U) & 0x1fU)
 #define fb_mmu_cbc_base_r()                                        (0x00100ec4U)
+#define fb_mmu_cbc_base_alignment_shift_v()                        (0x0000000bU)
 #define fb_mmu_cbc_base_address_f(v)               ((U32(v) & 0x3ffffffU) << 0U)
 #define fb_mmu_cbc_base_address_v(r)                  (((r) >> 0U) & 0x3ffffffU)
-#define fb_mmu_cbc_base_address_alignment_shift_v()                (0x0000000bU)
 #define fb_mmu_cbc_top_r()                                         (0x00100ec8U)
+#define fb_mmu_cbc_top_alignment_shift_v()                         (0x0000000bU)
 #define fb_mmu_cbc_top_size_f(v)                      ((U32(v) & 0x7fffU) << 0U)
 #define fb_mmu_cbc_top_size_v(r)                         (((r) >> 0U) & 0x7fffU)
-#define fb_mmu_cbc_top_size_alignment_shift_v()                    (0x0000000bU)
 #define fb_mmu_cbc_max_r()                                         (0x00100eccU)
 #define fb_mmu_cbc_max_comptagline_f(v)             ((U32(v) & 0xffffffU) << 0U)
 #define fb_mmu_cbc_max_comptagline_m()                    (U32(0xffffffU) << 0U)