gpu: nvgpu: decrease refcount when sync-unmap fails

When nvgpu_vm_unmap_sync_buffer fails, nvgpu_vm_unmap currently bails
out without decreasing the buffer refcount. This prevents the buffer
from being released if a deferred job completes after the
timeout (which was observed twice during overnight
stress tests). It also means that the fixed address cannot be
reused.

Emit a warning when nvgpu_vm_unmap_sync_buffer fails, but proceed
with decreasing the refcount.

Bug 200578193

Change-Id: Ie0cc7caa7d12ca0a3b42123a5f7a28bda72dabbc
Signed-off-by: ddutta <ddutta@nvidia.com>
(cherry picked from commit a433f26d5b
in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2291352
Tested-by: Naveen Kumar S <nkumars@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
ddutta
2020-03-03 14:49:36 +05:30
committed by mobile promotions
parent fbad02d5e0
commit bb2c8ef511

View File

@@ -1180,6 +1180,7 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
{ {
struct nvgpu_timeout timeout; struct nvgpu_timeout timeout;
int ret = 0; int ret = 0;
bool done = false;
nvgpu_mutex_release(&vm->update_gmmu_lock); nvgpu_mutex_release(&vm->update_gmmu_lock);
@@ -1189,16 +1190,18 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER); nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER);
do { do {
if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) == 1) { if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) <= 1) {
break; done = true;
} } else if (nvgpu_timeout_expired_msg(&timeout,
nvgpu_msleep(10);
} while (nvgpu_timeout_expired_msg(&timeout,
"sync-unmap failed on 0x%llx", "sync-unmap failed on 0x%llx",
mapped_buffer->addr) == 0); mapped_buffer->addr) != 0) {
done = true;
} else {
nvgpu_msleep(10);
}
} while (!done);
if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) != 1 && if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) > 1) {
nvgpu_timeout_expired(&timeout)) {
ret = -ETIMEDOUT; ret = -ETIMEDOUT;
} }
@@ -1221,11 +1224,9 @@ void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset,
if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) { if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) {
if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) { if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) {
/* nvgpu_warn(vm->mm->g, "%d references remaining on 0x%llx",
* Looks like we have failed... Better not continue in nvgpu_atomic_read(&mapped_buffer->ref.refcount),
* case the buffer is in use. mapped_buffer->addr);
*/
goto done;
} }
} }