From 4aef10c9507a19fb288936b88b0faeb62a520817 Mon Sep 17 00:00:00 2001
From: Terje Bergstrom
Date: Mon, 19 Jan 2015 14:50:57 -0800
Subject: [PATCH] gpu: nvgpu: Set compression page per SoC

Compression page size varies depending on architecture, so query it
through a new per-SoC hook, gops->fb.compression_page_size(), instead
of hard-coding it. Make it 128kB on gk20a and gm20b. Also export some
common LTC functions from gm20b.

Bug 1592495

Change-Id: Ifb1c5b15d25fa961dab097021080055fc385fecd
Signed-off-by: Terje Bergstrom
Reviewed-on: http://git-master/r/673790
---
(An illustrative sketch of wiring the new hook for a hypothetical
future chip follows the diff, along with a note on the ctag_lines
arithmetic.)

 drivers/gpu/nvgpu/gk20a/fb_gk20a.c  |  6 ++++++
 drivers/gpu/nvgpu/gk20a/gk20a.c     |  2 +-
 drivers/gpu/nvgpu/gk20a/gk20a.h     |  1 +
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h  |  3 ---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c  | 22 +++++++++++-----------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h  |  1 -
 drivers/gpu/nvgpu/gm20b/fb_gm20b.c  |  6 ++++++
 drivers/gpu/nvgpu/gm20b/ltc_gm20b.c | 14 +++++++-------
 drivers/gpu/nvgpu/gm20b/ltc_gm20b.h |  7 +++++++
 drivers/gpu/nvgpu/vgpu/mm_vgpu.c    |  1 -
 10 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
index d5b3fd877..568aed7a7 100644
--- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -42,10 +42,16 @@ static void gk20a_fb_set_mmu_page_size(struct gk20a *g)
 	gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
 }
 
+static int gk20a_fb_compression_page_size(struct gk20a *g)
+{
+	return SZ_128K;
+}
+
 void gk20a_init_fb(struct gpu_ops *gops)
 {
 	gops->fb.reset = fb_gk20a_reset;
 	gops->fb.set_mmu_page_size = gk20a_fb_set_mmu_page_size;
+	gops->fb.compression_page_size = gk20a_fb_compression_page_size;
 	gk20a_init_uncompressed_kind_map();
 	gk20a_init_kind_attr();
 }
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 6c18c895a..57d5f09ac 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1788,7 +1788,7 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
 	gpu->bus_type = NVGPU_GPU_BUS_TYPE_AXI; /* always AXI for now */
 
 	gpu->big_page_size = g->mm.pmu.vm.big_page_size;
-	gpu->compression_page_size = g->mm.pmu.vm.compression_page_size;
+	gpu->compression_page_size = g->ops.fb.compression_page_size(g);
 	gpu->pde_coverage_bit_count = g->mm.pmu.vm.pde_stride_shift;
 	gpu->available_big_page_sizes = gpu->big_page_size;
 
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index b9796faab..4fbc25be6 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -167,6 +167,7 @@ struct gpu_ops {
 		void (*init_uncompressed_kind_map)(struct gk20a *g);
 		void (*init_kind_attr)(struct gk20a *g);
 		void (*set_mmu_page_size)(struct gk20a *g);
+		int (*compression_page_size)(struct gk20a *g);
 	} fb;
 	struct {
 		void (*slcg_bus_load_gating_prod)(struct gk20a *g, bool prod);
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index f130b8306..cd6fe9cb4 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -258,9 +258,6 @@ struct gr_gk20a {
 	u32 map_tile_count;
 	u32 map_row_offset;
 
-#define COMP_TAG_LINE_SIZE_SHIFT (17) /* one tag covers 128K */
-#define COMP_TAG_LINE_SIZE (1 << COMP_TAG_LINE_SIZE_SHIFT)
-
 	u32 max_comptag_mem; /* max memory size (MB) for comptag */
 	struct compbit_store_desc compbit_store;
 	struct gk20a_allocator comp_tags;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3bce3c74c..6b7f84a35 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1001,7 +1001,9 @@ static int setup_buffer_kind_and_compression(struct vm_gk20a *vm,
 			enum gmmu_pgsz_gk20a pgsz_idx)
 {
 	bool kind_compressible;
-	struct device *d = dev_from_gk20a(vm->mm->g);
+	struct gk20a *g = gk20a_from_vm(vm);
+	struct device *d = dev_from_gk20a(g);
+	int ctag_granularity = g->ops.fb.compression_page_size(g);
 
 	if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
 		bfr->kind_v = gmmu_pte_kind_pitch_v();
@@ -1036,8 +1038,7 @@ static int setup_buffer_kind_and_compression(struct vm_gk20a *vm,
 		kind_compressible = false;
 	}
 	if (kind_compressible)
-		bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
-			COMP_TAG_LINE_SIZE_SHIFT;
+		bfr->ctag_lines = DIV_ROUND_UP_ULL(bfr->size, ctag_granularity);
 	else
 		bfr->ctag_lines = 0;
 
@@ -1113,10 +1114,10 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 	u32 pde_lo, pde_hi;
 	struct device *d = dev_from_vm(vm);
 	struct gk20a *g = gk20a_from_vm(vm);
+	int ctag_granularity = g->ops.fb.compression_page_size(g);
 
 	if (clear_ctags && ctag_offset) {
-		u32 ctag_lines = ALIGN(size, COMP_TAG_LINE_SIZE) >>
-			COMP_TAG_LINE_SIZE_SHIFT;
+		u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
 
 		/* init/clear the ctag buffer */
 		g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
@@ -1756,7 +1757,9 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	struct scatterlist *cur_chunk;
 	unsigned int cur_offset;
 	u32 pte_w[2] = {0, 0}; /* invalid pte */
-	u32 ctag = ctag_offset * SZ_128K;
+	struct gk20a *g = gk20a_from_vm(vm);
+	u32 ctag_granularity = g->ops.fb.compression_page_size(g);
+	u32 ctag = ctag_offset * ctag_granularity;
 	u32 ctag_incr;
 	u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
 	u64 addr = 0;
@@ -1768,9 +1771,6 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 	gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
 		   pgsz_idx, pde_lo, pde_hi);
 
-	/* If ctag_offset !=0 add 1 else add 0. The idea is to avoid a branch
-	 * below (per-pte). Note: this doesn't work unless page size (when
-	 * comptags are active) is 128KB. We have checks elsewhere for that. */
 	ctag_incr = ctag_offset ? page_size : 0;
 
 	cur_offset = 0;
@@ -1843,7 +1843,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 				 >> gmmu_pte_address_shift_v());
 			pte_w[1] = gmmu_pte_aperture_video_memory_f() |
 				gmmu_pte_kind_f(kind_v) |
-				gmmu_pte_comptagline_f(ctag / SZ_128K);
+				gmmu_pte_comptagline_f(ctag
+					/ ctag_granularity);
 
 			if (rw_flag == gk20a_mem_flag_read_only) {
 				pte_w[0] |= gmmu_pte_read_only_true_f();
@@ -2161,7 +2162,6 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 
 	vm->big_pages = big_pages;
 	vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
-	vm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
 	vm->pde_stride = vm->big_page_size << 10;
 	vm->pde_stride_shift = ilog2(vm->pde_stride);
 
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 8470a7ac2..e4fc30852 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -293,7 +293,6 @@ struct vm_gk20a {
 	bool tlb_dirty;
 	bool mapped;
 
-	u32 compression_page_size;
 	u32 big_page_size;
 	u32 pde_stride;
 	u32 pde_stride_shift;
diff --git a/drivers/gpu/nvgpu/gm20b/fb_gm20b.c b/drivers/gpu/nvgpu/gm20b/fb_gm20b.c
index 7cdd776ee..deef78965 100644
--- a/drivers/gpu/nvgpu/gm20b/fb_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/fb_gm20b.c
@@ -90,10 +90,16 @@ static void gm20b_fb_set_mmu_page_size(struct gk20a *g)
 	gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
 }
 
+static int gm20b_fb_compression_page_size(struct gk20a *g)
+{
+	return SZ_128K;
+}
+
 void gm20b_init_fb(struct gpu_ops *gops)
 {
 	gops->fb.init_fs_state = fb_gm20b_init_fs_state;
 	gops->fb.set_mmu_page_size = gm20b_fb_set_mmu_page_size;
+	gops->fb.compression_page_size = gm20b_fb_compression_page_size;
 	gm20b_init_uncompressed_kind_map();
 	gm20b_init_kind_attr();
 }
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
index fe2e06d5f..0a0efe414 100644
--- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
@@ -101,8 +101,8 @@ static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
 	return 0;
 }
 
-static int gm20b_ltc_cbc_ctrl(struct gk20a *g, enum gk20a_cbc_op op,
-			      u32 min, u32 max)
+int gm20b_ltc_cbc_ctrl(struct gk20a *g, enum gk20a_cbc_op op,
+		       u32 min, u32 max)
 {
 	int err = 0;
 	struct gr_gk20a *gr = &g->gr;
@@ -170,7 +170,7 @@ out:
 	return 0;
 }
 
-static void gm20b_ltc_init_fs_state(struct gk20a *g)
+void gm20b_ltc_init_fs_state(struct gk20a *g)
 {
 	u32 reg;
 
@@ -196,7 +196,7 @@ static void gm20b_ltc_init_fs_state(struct gk20a *g)
 	gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg);
 }
 
-static void gm20b_ltc_isr(struct gk20a *g)
+void gm20b_ltc_isr(struct gk20a *g)
 {
 	u32 mc_intr, ltc_intr;
 	int ltc, slice;
@@ -221,7 +221,7 @@ static void gm20b_ltc_isr(struct gk20a *g)
 	}
 }
 
-static void gm20b_ltc_g_elpg_flush_locked(struct gk20a *g)
+void gm20b_ltc_g_elpg_flush_locked(struct gk20a *g)
 {
 	u32 data;
 	bool done[g->ltc_count];
@@ -265,7 +265,7 @@ static void gm20b_ltc_g_elpg_flush_locked(struct gk20a *g)
 		"g_elpg_flush too many retries");
 }
 
-static u32 gm20b_ltc_cbc_fix_config(struct gk20a *g, int base)
+u32 gm20b_ltc_cbc_fix_config(struct gk20a *g, int base)
 {
 	u32 val = gk20a_readl(g, ltc_ltcs_ltss_cbc_num_active_ltcs_r());
 	if (val == 2) {
@@ -281,7 +281,7 @@ static u32 gm20b_ltc_cbc_fix_config(struct gk20a *g, int base)
 /*
  * Performs a full flush of the L2 cache.
  */
-static void gm20b_flush_ltc(struct gk20a *g)
+void gm20b_flush_ltc(struct gk20a *g)
 {
 	u32 op_pending;
 	unsigned long now, timeout;
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.h b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.h
index c7524264d..288e193a7 100644
--- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.h
@@ -18,4 +18,11 @@ struct gpu_ops;
 
 void gm20b_init_ltc(struct gpu_ops *gops);
 
+void gm20b_ltc_init_fs_state(struct gk20a *g);
+int gm20b_ltc_cbc_ctrl(struct gk20a *g, enum gk20a_cbc_op op,
+		u32 min, u32 max);
+void gm20b_ltc_g_elpg_flush_locked(struct gk20a *g);
+void gm20b_ltc_isr(struct gk20a *g);
+u32 gm20b_ltc_cbc_fix_config(struct gk20a *g, int base);
+void gm20b_flush_ltc(struct gk20a *g);
 #endif
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 2dd8cb68c..6817b107b 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -39,7 +39,6 @@ static int vgpu_init_mm_setup_sw(struct gk20a *g)
 
 	/* gk20a_init_gpu_characteristics expects this to be populated */
 	vm->big_page_size = big_page_size;
-	vm->compression_page_size = big_page_size;
 	vm->pde_stride = vm->big_page_size << 10;
 	vm->pde_stride_shift = ilog2(vm->pde_stride);
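
Illustrative sketch (not part of the patch): with the granularity behind
gops->fb, a chip that compresses at a different granularity only needs
its own hook. The chip prefix "gpxxx" and the 64kB value below are
hypothetical placeholders, assuming the same gpu_ops wiring this patch
adds for gk20a and gm20b:

	/* Hypothetical chip whose comptags each cover 64kB. */
	static int gpxxx_fb_compression_page_size(struct gk20a *g)
	{
		return SZ_64K;
	}

	void gpxxx_init_fb(struct gpu_ops *gops)
	{
		/* ... this chip's other fb hooks ... */
		gops->fb.compression_page_size = gpxxx_fb_compression_page_size;
	}

All consumers (gk20a_init_gpu_characteristics, setup_buffer_kind_and_compression,
gk20a_locked_gmmu_map, update_gmmu_ptes_locked) already go through
g->ops.fb.compression_page_size(g), so no further changes would be needed.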
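
Note on the ctag_lines conversion: for the power-of-two granularity used
here the old and new forms agree, e.g. for a 300kB (307200 byte) buffer
both ALIGN(307200, SZ_128K) >> 17 and DIV_ROUND_UP_ULL(307200, SZ_128K)
yield 3 comptag lines. The division form simply stops baking in the
128kB assumption that the removed COMP_TAG_LINE_SIZE defines (and the
deleted comment in update_gmmu_ptes_locked) relied on, and would also
hold for a non-power-of-two granularity.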