gpu: nvgpu: decrease refcount when sync-unmap fails

When nvgpu_vm_unmap_sync_buffer fails, nvgpu_vm_unmap currently bails out without decreasing the buffer refcount. This prevents the buffer from being released if a deferred job completes after the timeout (which was observed twice during overnight stress tests). It also means that the fixed address cannot be reused. Emit a warning when nvgpu_vm_unmap_sync_buffer fails, but proceed with decreasing the refcount. Bug 200578193 Change-Id: Ie0cc7caa7d12ca0a3b42123a5f7a28bda72dabbc Signed-off-by: ddutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2291352 (cherry picked from commit bb2c8ef511) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2307940 Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: Amulya Yarlagadda <ayarlagadda@nvidia.com> GVS: Gerrit_Virtual_Submit Tested-by: Amulya Yarlagadda <ayarlagadda@nvidia.com>
2025-12-23 18:16:01 +03:00 · 2020-03-03 14:49:36 +05:30
parent aaecfae53f
commit af9af422c6
1 changed files with 14 additions and 13 deletions
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -1180,6 +1180,7 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
 {
 	struct nvgpu_timeout timeout;
 	int ret = 0;
 	bool done = false;
 	nvgpu_mutex_release(&vm->update_gmmu_lock);
@@ -1189,16 +1190,18 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
 	nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER);
 	do {
-		if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) == 1) {
+		if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) <= 1) {
-			break;
+			done = true;
-		}
+		} else if (nvgpu_timeout_expired_msg(&timeout,
 		nvgpu_msleep(10);
 	} while (nvgpu_timeout_expired_msg(&timeout,
 			    "sync-unmap failed on 0x%llx",
-			    mapped_buffer->addr) == 0);
+			    mapped_buffer->addr) != 0) {
 			done = true;
 		} else {
 			nvgpu_msleep(10);
 		}
 	} while (!done);
-	if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) != 1 &&
+	if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) > 1) {
 			nvgpu_timeout_expired(&timeout)) {
 		ret = -ETIMEDOUT;
 	}
@@ -1221,11 +1224,9 @@ void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset,
 	if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) {
 		if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) {
-			/*
+			nvgpu_warn(vm->mm->g, "%d references remaining on 0x%llx",
-			 * Looks like we have failed... Better not continue in
+				nvgpu_atomic_read(&mapped_buffer->ref.refcount),
-			 * case the buffer is in use.
+				mapped_buffer->addr);
 			 */
 			goto done;
 		}
 	}