gpu: nvgpu: Use busy looping on memory ops

Use busy looping on L2 and TLB maintenance operations. This speeds
them up by an order of magnitude.

Also add trace points to measure the performance of memory ops and
interrupt processing.

Change-Id: Ic4a8525d3d946b2b8f57b4b8ddcfc61605619399
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/681640
This commit is contained in:
Terje Bergstrom
2015-02-05 10:05:56 -08:00
committed by Dan Willemsen
parent 5b6e8995b2
commit 24ddf71b90
4 changed files with 103 additions and 7 deletions

View File

@@ -26,6 +26,7 @@
#include <linux/dma-mapping.h>
#include <linux/firmware.h>
#include <linux/nvhost.h>
#include <trace/events/gk20a.h>
#include "gk20a.h"
#include "kind_gk20a.h"
@@ -4998,6 +4999,8 @@ static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
{
gk20a_dbg_fn("");
trace_gr_gk20a_handle_sw_method(g->dev->name);
if (class_num == KEPLER_COMPUTE_A) {
switch (offset << 2) {
case NVA0C0_SET_SHADER_EXCEPTIONS:

View File

@@ -14,6 +14,7 @@
*/
#include <linux/types.h>
#include <trace/events/gk20a.h>
#include "gk20a.h"
#include "mc_gk20a.h"
@@ -23,6 +24,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g)
{
u32 mc_intr_0;
trace_mc_gk20a_intr_stall(g->dev->name);
if (!g->power_on)
return IRQ_NONE;
@@ -37,6 +40,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g)
/* flush previous write */
gk20a_readl(g, mc_intr_en_0_r());
trace_mc_gk20a_intr_stall_done(g->dev->name);
return IRQ_WAKE_THREAD;
}
@@ -67,6 +72,8 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g)
gk20a_dbg(gpu_dbg_intr, "interrupt thread launched");
trace_mc_gk20a_intr_thread_stall(g->dev->name);
mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0);
@@ -92,6 +99,8 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g)
/* flush previous write */
gk20a_readl(g, mc_intr_en_0_r());
trace_mc_gk20a_intr_thread_stall_done(g->dev->name);
return IRQ_HANDLED;
}

View File

@@ -27,6 +27,7 @@
#include <linux/vmalloc.h>
#include <linux/dma-buf.h>
#include <uapi/linux/nvgpu.h>
#include <trace/events/gk20a.h>
#include "gk20a.h"
#include "mm_gk20a.h"
@@ -2816,6 +2817,9 @@ int gk20a_mm_fb_flush(struct gk20a *g)
/* Make sure all previous writes are committed to the L2. There's no
guarantee that writes are to DRAM. This will be a sysmembar internal
to the L2. */
trace_gk20a_mm_fb_flush(g->dev->name);
gk20a_writel(g, flush_fb_flush_r(),
flush_fb_flush_pending_busy_f());
@@ -2828,7 +2832,7 @@ int gk20a_mm_fb_flush(struct gk20a *g)
flush_fb_flush_pending_busy_v()) {
gk20a_dbg_info("fb_flush 0x%x", data);
retry--;
usleep_range(20, 40);
udelay(5);
} else
break;
} while (retry >= 0 || !tegra_platform_is_silicon());
@@ -2839,6 +2843,8 @@ int gk20a_mm_fb_flush(struct gk20a *g)
ret = -EBUSY;
}
trace_gk20a_mm_fb_flush_done(g->dev->name);
mutex_unlock(&mm->l2_op_lock);
return ret;
@@ -2849,6 +2855,8 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
u32 data;
s32 retry = 200;
trace_gk20a_mm_l2_invalidate(g->dev->name);
/* Invalidate any clean lines from the L2 so subsequent reads go to
DRAM. Dirty lines are not affected by this operation. */
gk20a_writel(g, flush_l2_system_invalidate_r(),
@@ -2864,7 +2872,7 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
gk20a_dbg_info("l2_system_invalidate 0x%x",
data);
retry--;
usleep_range(20, 40);
udelay(5);
} else
break;
} while (retry >= 0 || !tegra_platform_is_silicon());
@@ -2872,6 +2880,8 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
if (retry < 0)
gk20a_warn(dev_from_gk20a(g),
"l2_system_invalidate too many retries");
trace_gk20a_mm_l2_invalidate_done(g->dev->name);
}
void gk20a_mm_l2_invalidate(struct gk20a *g)
@@ -2900,6 +2910,8 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
mutex_lock(&mm->l2_op_lock);
trace_gk20a_mm_l2_flush(g->dev->name);
/* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
as clean, so subsequent reads might hit in the L2. */
gk20a_writel(g, flush_l2_flush_dirty_r(),
@@ -2914,7 +2926,7 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
flush_l2_flush_dirty_pending_busy_v()) {
gk20a_dbg_info("l2_flush_dirty 0x%x", data);
retry--;
usleep_range(20, 40);
udelay(5);
} else
break;
} while (retry >= 0 || !tegra_platform_is_silicon());
@@ -2923,6 +2935,8 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
gk20a_warn(dev_from_gk20a(g),
"l2_flush_dirty too many retries");
trace_gk20a_mm_l2_flush_done(g->dev->name);
if (invalidate)
gk20a_mm_l2_invalidate_locked(g);
@@ -2964,7 +2978,7 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->mm->g,
vm->pdes.sgt->sgl) >> 12);
u32 data;
s32 retry = 200;
s32 retry = 2000;
static DEFINE_MUTEX(tlb_lock);
gk20a_dbg_fn("");
@@ -2986,11 +3000,14 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
}
mutex_lock(&tlb_lock);
trace_gk20a_mm_tlb_invalidate(g->dev->name);
do {
data = gk20a_readl(g, fb_mmu_ctrl_r());
if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
break;
usleep_range(20, 40);
udelay(2);
retry--;
} while (retry >= 0 || !tegra_platform_is_silicon());
@@ -3014,13 +3031,15 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
fb_mmu_ctrl_pri_fifo_empty_false_f())
break;
retry--;
usleep_range(20, 40);
udelay(2);
} while (retry >= 0 || !tegra_platform_is_silicon());
if (retry < 0)
gk20a_warn(dev_from_gk20a(g),
"mmu invalidate too many retries");
trace_gk20a_mm_tlb_invalidate_done(g->dev->name);
out:
mutex_unlock(&tlb_lock);
vm->tlb_dirty = false;

View File

@@ -1,7 +1,7 @@
/*
* gk20a event logging to ftrace.
*
* Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -65,6 +65,71 @@ DEFINE_EVENT(gk20a, gk20a_gpfifo_submit_wait_for_space_done,
TP_ARGS(name)
);
/*
 * L2 invalidate begin/end markers. gk20a_mm_l2_invalidate is emitted on
 * entry to gk20a_mm_l2_invalidate_locked(), before the invalidate register
 * is written; gk20a_mm_l2_invalidate_done is emitted at the end of that
 * function, after the polling loop and its retry warning. Callers pass the
 * device name (g->dev->name) as @name.
 */
DEFINE_EVENT(gk20a, gk20a_mm_l2_invalidate,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, gk20a_mm_l2_invalidate_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * L2 dirty-line flush begin/end markers. gk20a_mm_l2_flush is emitted in
 * gk20a_mm_l2_flush() after l2_op_lock is taken and before the flush
 * register is written; gk20a_mm_l2_flush_done is emitted after the polling
 * loop and its retry warning, before the optional follow-up invalidate.
 * Callers pass the device name (g->dev->name) as @name.
 */
DEFINE_EVENT(gk20a, gk20a_mm_l2_flush,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, gk20a_mm_l2_flush_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * TLB invalidate begin/end markers. gk20a_mm_tlb_invalidate is emitted in
 * gk20a_mm_tlb_invalidate() right after tlb_lock is taken;
 * gk20a_mm_tlb_invalidate_done is emitted after the final polling loop and
 * its retry warning, immediately before the "out:" label.
 * NOTE(review): because the done event sits before "out:", any path that
 * jumps to "out" would skip it - confirm begin/done pairing in the caller.
 * Callers pass the device name (g->dev->name) as @name.
 */
DEFINE_EVENT(gk20a, gk20a_mm_tlb_invalidate,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, gk20a_mm_tlb_invalidate_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * FB flush begin/end markers. gk20a_mm_fb_flush is emitted in
 * gk20a_mm_fb_flush() just before the flush register is written;
 * gk20a_mm_fb_flush_done is emitted after the polling loop, right before
 * l2_op_lock is released. Callers pass the device name (g->dev->name) as
 * @name.
 */
DEFINE_EVENT(gk20a, gk20a_mm_fb_flush,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, gk20a_mm_fb_flush_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * Threaded stall-interrupt handler begin/end markers.
 * mc_gk20a_intr_thread_stall is emitted in mc_gk20a_intr_thread_stall()
 * before the pending-interrupt register (mc_intr_0) is read;
 * mc_gk20a_intr_thread_stall_done is emitted just before the handler
 * returns IRQ_HANDLED. Callers pass the device name (g->dev->name) as
 * @name.
 */
DEFINE_EVENT(gk20a, mc_gk20a_intr_thread_stall,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, mc_gk20a_intr_thread_stall_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * Stall-interrupt ISR begin/end markers. mc_gk20a_intr_stall is emitted on
 * entry to mc_gk20a_isr_stall(), before the power_on check, so it also
 * fires when the ISR bails out with IRQ_NONE; mc_gk20a_intr_stall_done is
 * emitted just before the ISR returns IRQ_WAKE_THREAD. Callers pass the
 * device name (g->dev->name) as @name.
 */
DEFINE_EVENT(gk20a, mc_gk20a_intr_stall,
TP_PROTO(const char *name),
TP_ARGS(name)
);
DEFINE_EVENT(gk20a, mc_gk20a_intr_stall_done,
TP_PROTO(const char *name),
TP_ARGS(name)
);
/*
 * Emitted on entry to gr_gk20a_handle_sw_method(), before the class/offset
 * dispatch. There is no matching "_done" event. Callers pass the device
 * name (g->dev->name) as @name.
 */
DEFINE_EVENT(gk20a, gr_gk20a_handle_sw_method,
TP_PROTO(const char *name),
TP_ARGS(name)
);
TRACE_EVENT(gk20a_channel_update,
TP_PROTO(const void *channel),
TP_ARGS(channel),