gpu: nvgpu: optimize mem_desc accessor loops

Instead of going via gk20a_mem_{wr,rd}32() on each iteration, do a
direct memcpy/memset through the CPU mapping when the buffer is in
sysmem, and for vidmem batch the accesses per PRAMIN window to minimize
the enter/exit overhead.

JIRA DNVGPU-23

Change-Id: I5437e35f8393a746777a40636c1e9b5d93ced1f6
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1159524
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Author:    Konsta Holtta <kholtta@nvidia.com>
Date:      2016-06-06 16:23:06 +03:00
Committed: Terje Bergstrom
Parent:    15d241a8cb
Commit:    987de66583

2 changed files with 132 additions and 23 deletions
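The patch splits each bulk accessor into two paths keyed on the buffer's aperture: a plain memcpy/memset through the CPU mapping for sysmem, and a PRAMIN-window path for vidmem that touches the hardware once per 1 MB range instead of once per word. As a rough, hypothetical model of that dispatch (toy types and names, not the driver code):

#include <string.h>

enum toy_aperture { TOY_SYSMEM, TOY_VIDMEM };

struct toy_mem {
        enum toy_aperture aperture;
        unsigned char *cpu_va;   /* CPU mapping, valid for sysmem only */
};

/* Toy model only: sysmem takes one memcpy; vidmem defers to a batched
 * window-based writer (one callback per call here, one per 1 MB range
 * in the actual patch below). */
static void toy_wr_n(struct toy_mem *mem, unsigned offset,
                     const void *src, unsigned size,
                     void (*vidmem_batched)(unsigned, const void *, unsigned))
{
        if (mem->aperture == TOY_SYSMEM)
                memcpy(mem->cpu_va + offset, src, size);
        else
                vidmem_batched(offset, src, size);
}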


@@ -150,19 +150,101 @@ u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset)
 	return gk20a_mem_rd32(g, mem, offset / sizeof(u32));
 }
 
+/*
+ * Batch innerloop for the function below once per each PRAMIN range (some
+ * 4B..1MB at a time). "start" reg goes as-is to gk20a_{readl,writel}.
+ */
+typedef void (*pramin_access_batch_fn)(struct gk20a *g, u32 start, u32 words,
+		u32 **arg);
+
+/*
+ * The PRAMIN range is 1 MB, must change base addr if a buffer crosses that.
+ * This same loop is used for read/write/memset. Offset and size in bytes.
+ * One call to "loop" is done per range, with "arg" supplied.
+ */
+static inline void pramin_access_batched(struct gk20a *g, struct mem_desc *mem,
+		u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg)
+{
+	offset /= sizeof(u32);
+
+	while (size) {
+		u32 byteoff = gk20a_pramin_enter(g, mem, offset);
+		u32 start_reg = pram_data032_r(byteoff / sizeof(u32));
+		u32 until_end = SZ_1M - (byteoff & (SZ_1M - 1));
+		u32 n = min(size, until_end);
+
+		loop(g, start_reg, n / sizeof(u32), arg);
+
+		/* read back to synchronize accesses */
+		gk20a_readl(g, start_reg);
+
+		gk20a_pramin_exit(g, mem);
+
+		offset += n / sizeof(u32);
+		size -= n;
+	}
+}
+
+static inline void pramin_access_batch_rd_n(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, *dest_u32 = *arg;
+
+	while (words--) {
+		*dest_u32++ = gk20a_readl(g, r);
+		r += sizeof(u32);
+	}
+
+	*arg = dest_u32;
+}
+
+static inline void pramin_access_batch_wr_n(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, *src_u32 = *arg;
+
+	while (words--) {
+		gk20a_writel(g, r, *src_u32++);
+		r += sizeof(u32);
+	}
+
+	*arg = src_u32;
+}
+
+static inline void pramin_access_batch_set(struct gk20a *g, u32 start,
+		u32 words, u32 **arg)
+{
+	u32 r = start, repeat = **arg;
+
+	while (words--) {
+		gk20a_writel(g, r, repeat);
+		r += sizeof(u32);
+	}
+}
+
 void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem,
 		u32 offset, void *dest, u32 size)
 {
-	u32 i;
-	u32 *dest_u32 = dest;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
-
-	for (i = 0; i < size; i++)
-		dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i);
+
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *src = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+		memcpy(dest, src, size);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
+					src, *dest, size);
+#endif
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 *dest_u32 = dest;
+
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_rd_n, &dest_u32);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data)
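The key saving on the vidmem side is that pramin_access_batched() re-enters the PRAMIN window once per 1 MB range crossed, not once per word; the per-pass length is SZ_1M - (byteoff & (SZ_1M - 1)), i.e. the distance to the next 1 MB boundary. A small standalone sketch of just that chunking arithmetic (no hardware access; the 256 KB / 2.5 MB figures are made-up inputs, and re-basing every later pass at a window boundary is a simplification):

#include <stdio.h>

#define TOY_SZ_1M (1024u * 1024u)

/* Same chunk computation as in pramin_access_batched(): handle at most the
 * bytes remaining until the next 1 MB boundary in one pass. */
static unsigned toy_chunk(unsigned byteoff, unsigned remaining)
{
        unsigned until_end = TOY_SZ_1M - (byteoff & (TOY_SZ_1M - 1));

        return remaining < until_end ? remaining : until_end;
}

int main(void)
{
        unsigned byteoff = 256u * 1024u;  /* hypothetical: start 256 KB into a window */
        unsigned size = 2560u * 1024u;    /* hypothetical: 2.5 MB to copy */
        int pass = 0;

        while (size) {
                unsigned n = toy_chunk(byteoff, size);

                printf("pass %d: %u KB\n", ++pass, n / 1024u);
                size -= n;
                byteoff = 0;  /* simplification: next pass starts at a window base */
        }
        return 0;             /* prints 768 KB, 1024 KB, 768 KB: three window switches */
}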
@@ -195,30 +277,57 @@ void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data)
 void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
 		void *src, u32 size)
 {
-	u32 i;
-	u32 *src_u32 = src;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
-
-	for (i = 0; i < size; i++)
-		gk20a_mem_wr32(g, mem, offset + i, src_u32[i]);
+
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *dest = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
+					dest, *src, size);
+#endif
+		memcpy(dest, src, size);
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 *src_u32 = src;
+
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_wr_n, &src_u32);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
-		u32 value, u32 size)
+		u32 c, u32 size)
 {
-	u32 i;
-
 	WARN_ON(offset & 3);
 	WARN_ON(size & 3);
-	offset /= sizeof(u32);
-	size /= sizeof(u32);
+	WARN_ON(c & ~0xff);
 
-	for (i = 0; i < size; i++)
-		gk20a_mem_wr32(g, mem, offset + i, value);
+	c &= 0xff;
+
+	if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
+		u8 *dest = (u8 *)mem->cpu_va + offset;
+
+		WARN_ON(!mem->cpu_va);
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+		if (size)
+			gk20a_dbg(gpu_dbg_mem, " %p = 0x%x [times %d]",
+					dest, c, size);
+#endif
+		memset(dest, c, size);
+	} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		u32 repeat_value = c | (c << 8) | (c << 16) | (c << 24);
+		u32 *p = &repeat_value;
+
+		pramin_access_batched(g, mem, offset, size,
+				pramin_access_batch_set, &p);
+	} else {
+		WARN_ON("Accessing unallocated mem_desc");
+	}
 }
 
 /*
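gk20a_memset() now takes a single byte value c, like memset(); on the vidmem path the byte has to be replicated into a 32-bit fill word because PRAMIN is written one word at a time, which is what the c | (c << 8) | (c << 16) | (c << 24) expression above does. A standalone check of that replication (fill_word() is an illustrative equivalent, not a driver function):

#include <assert.h>
#include <stdint.h>

/* Equivalent to c | (c << 8) | (c << 16) | (c << 24) for a byte c. */
static uint32_t fill_word(uint8_t c)
{
        return (uint32_t)c * 0x01010101u;
}

int main(void)
{
        assert(fill_word(0x00) == 0x00000000u);
        assert(fill_word(0xa5) == 0xa5a5a5a5u);
        assert(fill_word(0xff) == 0xffffffffu);
        return 0;
}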


@@ -458,9 +458,9 @@ void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data);
 /* memcpy from cpu, offset and size in bytes (32b-aligned) */
 void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
 		void *src, u32 size);
-/* size and offset in bytes (32b-aligned), filled with u32s */
+/* size and offset in bytes (32b-aligned), filled with the constant byte c */
 void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
-		u32 value, u32 size);
+		u32 c, u32 size);
 
 #if 0 /*related to addr bits above, concern below TBD on which is accurate */
 #define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\
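With the header change above, callers keep passing byte-based, 32-bit-aligned offsets and sizes, and gk20a_memset() takes a constant byte rather than a u32 pattern. A hypothetical caller fragment under that contract (the mem_desc name and values are made up; the nvgpu headers providing struct gk20a, struct mem_desc, u32 and the prototypes above are assumed):

/* Hypothetical usage sketch, not part of the patch. */
static void example_init_block(struct gk20a *g, struct mem_desc *block)
{
        u32 magic[4] = { 0xdeadbeef, 0xcafef00d, 0x00000000, 0x00000001 };

        /* zero the first 256 bytes; the same call works for sysmem and vidmem */
        gk20a_memset(g, block, 0, 0, 256);

        /* copy 16 bytes of CPU data in at byte offset 64 */
        gk20a_mem_wr_n(g, block, 64, magic, sizeof(magic));
}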