gpu: nvgpu: Implement Thermal Alert for PG189

PG189 has multiple sensors which can raise an interrupt when the board temperature reaches a programmed threshold. This interrupt is handled in nvgpu, which provides events via clk_arb. Support is enabled for TU104 with the NVGPU_SUPPORT_DGPU_THERMAL_ALERT flag. Board-specific config is added in DT, which will be parsed by nvgpu. Nvgpu does the following: 1. Read the GPIO line number, interrupt type, and event delay from DT. 2. Call kernel methods to register the interrupt with the kernel. 3. Create a work queue which will process the interrupt in process context. 4. When the interrupt occurs, disable the interrupt and add work to the work queue. 5. In the work queue, post events, sleep for the delay time, then re-enable the interrupt. Bug 2492512 Change-Id: Ic5694fe366ca492f8afe8a67de4350e9a51af2af Signed-off-by: Abdul Salam <absalam@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2119411 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2019-05-23 23:44:05 +05:30
parent 20fcf813dd
commit 25eb392fd1
7 changed files with 122 additions and 5 deletions
--- a/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c
+++ b/drivers/gpu/nvgpu/common/clk_arb/clk_arb.c
@@ -472,8 +472,12 @@ static void nvgpu_clk_arb_schedule_alarm(struct gk20a *g, u32 alarm)

 void nvgpu_clk_arb_send_thermal_alarm(struct gk20a *g)
 {
-	nvgpu_clk_arb_schedule_alarm(g,
-		BIT32(NVGPU_EVENT_ALARM_THERMAL_ABOVE_THRESHOLD));
+	/*
+	 * Only schedule the thermal-above-threshold alarm when g->clk_arb
+	 * is set; with no arbiter this call does nothing.
+	 */
+	if (g->clk_arb != NULL) {
+		nvgpu_clk_arb_schedule_alarm(g,
+			BIT32(NVGPU_EVENT_ALARM_THERMAL_ABOVE_THRESHOLD));
+	}
 }

 void nvgpu_clk_arb_worker_deinit(struct gk20a *g)
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -1453,6 +1453,7 @@ int tu104_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_PMU_SUPER_SURFACE, true);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE, true);
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_DGPU_THERMAL_ALERT, true);

 	/* for now */
 	gops->clk.support_clk_freq_controller = false;
--- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
@@ -203,10 +203,14 @@ struct gk20a;

 /* NVGPU_GPU_IOCTL_SET_MMU_DEBUG_MODE is available */
 #define NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE	78
+
+/* DGPU Thermal Alert */
+#define NVGPU_SUPPORT_DGPU_THERMAL_ALERT	79
+
 /*
 * Must be greater than the largest bit offset in the above list.
 */
-#define NVGPU_MAX_ENABLED_BITS			79U
+#define NVGPU_MAX_ENABLED_BITS			80U

 /**
 * nvgpu_is_enabled - Check if the passed flag is enabled.
--- a/drivers/gpu/nvgpu/os/linux/ioctl_clk_arb.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_clk_arb.c
@@ -175,7 +175,7 @@ static inline u32 __pending_event(struct nvgpu_clk_dev *dev,
 	if (_WRAPGTEQ(tail, head) && info) {
 		head++;
 		p_notif = &dev->queue.notifications[head % dev->queue.size];
-		events |= nvgpu_convert_gpu_event(p_notif->notification);
+		events = p_notif->notification;
 		info->event_id = ffs(events) - 1;
 		info->timestamp = p_notif->timestamp;
 		nvgpu_atomic_set(&dev->queue.head, head);
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -28,6 +28,7 @@
 #include <linux/notifier.h>
 #include <linux/platform/tegra/common.h>
 #include <linux/pci.h>
+#include <linux/of_gpio.h>

 #include <uapi/linux/nvgpu.h>
 #include <dt-bindings/soc/gm20b-fuse.h>
@@ -284,6 +285,78 @@ void gk20a_init_linux_characteristics(struct gk20a *g)
 	}
 }

+/*
+ * Work handler for the dGPU thermal alert; queued by therm_irq().
+ *
+ * Sends the thermal alarm through clk_arb, sleeps for event_delay
+ * (scaled by 1000 for nvgpu_msleep()), and only then re-enables the
+ * alert interrupt that therm_irq() disabled.
+ */
+static void therm_alert_work_queue(struct work_struct *work)
+{
+	struct dgpu_thermal_alert *alert =
+		container_of(work, struct dgpu_thermal_alert, work);
+	struct nvgpu_os_linux *l =
+		container_of(alert, struct nvgpu_os_linux, thermal_alert);
+
+	nvgpu_clk_arb_send_thermal_alarm(&l->g);
+	nvgpu_msleep(alert->event_delay * 1000U);
+	enable_irq(alert->therm_alert_irq);
+}
+
+/*
+ * Interrupt handler for the dGPU thermal alert line.
+ *
+ * @irq:    interrupt number that fired.
+ * @dev_id: the struct nvgpu_os_linux passed when the IRQ was requested.
+ *
+ * Disables the interrupt without waiting and hands the event to the
+ * thermal alert workqueue; therm_alert_work_queue() re-enables it.
+ */
+static irqreturn_t therm_irq(int irq, void *dev_id)
+{
+	struct nvgpu_os_linux *l = dev_id;
+	struct dgpu_thermal_alert *alert = &l->thermal_alert;
+
+	disable_irq_nosync(irq);
+	queue_work(alert->workqueue, &alert->work);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Set up the dGPU thermal alert interrupt described by the "nvgpu" DT node.
+ *
+ * Reads the alert GPIO ("nvgpu-therm-gpios"), the optional trigger type
+ * ("alert-interrupt-level", default IRQ_TYPE_NONE) and the optional event
+ * interval ("alert-event-interval", default 10), creates the workqueue that
+ * services the alert, and registers therm_irq() for the GPIO's interrupt.
+ *
+ * A non-NULL thermal_alert.workqueue marks the setup as done, so repeated
+ * calls are no-ops. On any failure the workqueue is left NULL so that a
+ * later call retries the whole setup.
+ *
+ * Returns 0 on success (or if already set up), -ENOENT when the DT node is
+ * missing, -ENOMEM when the workqueue cannot be allocated, or the negative
+ * error returned by the GPIO/IRQ helpers.
+ */
+static int nvgpu_request_therm_irq(struct nvgpu_os_linux *l)
+{
+	struct device_node *np;
+	int ret = 0;
+	int gpio;
+	int irq;
+	int index = 0;
+	u32 irq_flags = IRQ_TYPE_NONE;
+	u32 event_delay = 10U;
+
+	if (l->thermal_alert.workqueue != NULL) {
+		return ret;
+	}
+
+	np = of_find_node_by_name(NULL, "nvgpu");
+	if (!np) {
+		return -ENOENT;
+	}
+
+	gpio = of_get_named_gpio(np, "nvgpu-therm-gpios", index);
+	if (gpio < 0) {
+		nvgpu_err(&l->g, "failed to get GPIO %d ", gpio);
+		ret = gpio;
+		goto put_node;
+	}
+
+	/* gpio_to_irq() returns a negative errno on failure; check it
+	 * before it is narrowed into the unsigned therm_alert_irq field.
+	 */
+	irq = gpio_to_irq(gpio);
+	if (irq < 0) {
+		nvgpu_err(&l->g, "failed to map GPIO %d to IRQ %d", gpio, irq);
+		ret = irq;
+		goto put_node;
+	}
+	l->thermal_alert.therm_alert_irq = (u32)irq;
+
+	if (of_property_read_u32(np, "alert-interrupt-level", &irq_flags)) {
+		nvgpu_info(&l->g, "Missing interrupt-level "
+				"prop using %u", irq_flags);
+	}
+	if (of_property_read_u32(np, "alert-event-interval", &event_delay)) {
+		nvgpu_info(&l->g, "Missing event-interval "
+				"prop using %u seconds ", event_delay);
+	}
+
+	l->thermal_alert.event_delay = event_delay;
+
+	/* The work item must be ready before the IRQ can fire. */
+	l->thermal_alert.workqueue = alloc_workqueue("%s",
+				WQ_HIGHPRI, 1, "dgpu_thermal_alert");
+	if (l->thermal_alert.workqueue == NULL) {
+		nvgpu_err(&l->g, "failed to allocate thermal alert workqueue");
+		ret = -ENOMEM;
+		goto put_node;
+	}
+	INIT_WORK(&l->thermal_alert.work, therm_alert_work_queue);
+
+	ret = devm_request_irq(l->dev, l->thermal_alert.therm_alert_irq,
+			therm_irq, irq_flags, "dgpu_therm", l);
+	if (ret != 0) {
+		nvgpu_err(&l->g, "IRQ request failed");
+		/* Undo the workqueue so the next call does not mistake a
+		 * half-finished setup for a completed one.
+		 */
+		destroy_workqueue(l->thermal_alert.workqueue);
+		l->thermal_alert.workqueue = NULL;
+	}
+
+put_node:
+	/* Drop the reference taken by of_find_node_by_name(). */
+	of_node_put(np);
+	return ret;
+}
+
+
 int gk20a_pm_finalize_poweron(struct device *dev)
 {
 	struct gk20a *g = get_gk20a(dev);
@@ -331,6 +404,16 @@ int gk20a_pm_finalize_poweron(struct device *dev)
 			g->sim->sim_init_late(g);
 	}

+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_DGPU_THERMAL_ALERT) &&
+		nvgpu_platform_is_silicon(g)) {
+		err = nvgpu_request_therm_irq(l);
+		if (err) {
+			nvgpu_err(g, "thermal interrupt request failed %d",
+				err);
+			goto done;
+		}
+	}
+
 	err = gk20a_finalize_poweron(g);
 	if (err)
 		goto done;
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -62,10 +62,17 @@ struct nvgpu_os_linux_ops {
 	} s_param;
 };

+/* State for the dGPU board thermal alert interrupt. */
+struct dgpu_thermal_alert {
+	/* Workqueue on which the thermal IRQ handler queues @work. */
+	struct workqueue_struct *workqueue;
+	/* Work item that posts the thermal alarm and re-enables the IRQ. */
+	struct work_struct work;
+	/* IRQ number obtained from the alert GPIO via gpio_to_irq(). */
+	u32 therm_alert_irq;
+	/* Delay, in seconds, before the IRQ is re-enabled after an alert. */
+	u32 event_delay;
+};
+
 struct nvgpu_os_linux {
 	struct gk20a g;
 	struct device *dev;
-
+	struct dgpu_thermal_alert thermal_alert;
 	struct {
 		struct cdev cdev;
 		struct device *node;
--- a/drivers/gpu/nvgpu/os/linux/pci.c
+++ b/drivers/gpu/nvgpu/os/linux/pci.c
@@ -558,6 +558,19 @@ err_free_l:
 	return err;
 }

+/*
+ * Tear down the dGPU thermal alert interrupt and its workqueue.
+ *
+ * Does nothing when thermal_alert.workqueue is NULL, i.e. when the alert
+ * was never set up, so the IRQ is never freed without having been requested.
+ *
+ * Teardown order:
+ *  1. disable_irq() masks the line and waits for a running handler, so no
+ *     new work can be queued;
+ *  2. cancel_work_sync() drops pending work and waits for running work,
+ *     whose enable_irq() only balances the handler's own disable;
+ *  3. the IRQ is freed with the same dev_id (l) it was requested with;
+ *  4. the workqueue is destroyed and cleared.
+ */
+static void nvgpu_thermal_deinit(struct gk20a *g)
+{
+	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct device *dev = dev_from_gk20a(g);
+
+	if (l->thermal_alert.workqueue == NULL) {
+		return;
+	}
+
+	disable_irq(l->thermal_alert.therm_alert_irq);
+	cancel_work_sync(&l->thermal_alert.work);
+	devm_free_irq(dev, l->thermal_alert.therm_alert_irq, l);
+	destroy_workqueue(l->thermal_alert.workqueue);
+	l->thermal_alert.workqueue = NULL;
+}
+
 static void nvgpu_pci_remove(struct pci_dev *pdev)
 {
 	struct gk20a *g = get_gk20a(&pdev->dev);
@@ -576,6 +589,11 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)

 	gk20a_driver_start_unload(g);

+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_DGPU_THERMAL_ALERT) &&
+			nvgpu_platform_is_silicon(g)) {
+		nvgpu_thermal_deinit(g);
+	}
+
 	err = nvgpu_quiesce(g);
 	/* TODO: handle failure to idle */
 	WARN(err, "gpu failed to idle during driver removal");