From a540cf96f4ae3d87da1388c527889c384022a9b1 Mon Sep 17 00:00:00 2001
From: rawrmonster17
Date: Fri, 19 Jun 2026 01:06:48 -0500
Subject: [PATCH] ce: fix CeUtils scheduling left paused on error paths in
 kceTopLevelPceLceMappingsUpdate

cePauseCeUtilsScheduling() is called at the start of
kceTopLevelPceLceMappingsUpdate_IMPL() to block RM-internal CE
submissions while PCE-LCE mappings are being updated. However, two
error paths return without calling the matching
ceResumeCeUtilsScheduling():

1. NV_ASSERT_OK_OR_RETURN() on rmapiControlCacheFreeForControl()
   returns immediately on failure, skipping the resume.

2. The early return on NV2080_CTRL_CMD_CE_UPDATE_PCE_LCE_MAPPINGS_V2
   failure likewise skips the resume.

When either path fires, CeUtils submission stays permanently paused for
the lifetime of the GPU instance. Subsequent RM-internal CE operations
(memory scrubbing, allocation init) stall or fail.

Fix by converting both early returns to goto cleanup so that
ceResumeCeUtilsScheduling() is always called after the pause,
regardless of which error path is taken. Also convert the
NV_ASSERT_OK_OR_RETURN() to an explicit status check so the error is
captured in status before branching to cleanup.

Signed-off-by: rawrmonster17 --- src/nvidia/src/kernel/gpu/ce/kernel_ce.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/nvidia/src/kernel/gpu/ce/kernel_ce.c b/src/nvidia/src/kernel/gpu/ce/kernel_ce.c index 9f308645f..338be611f 100644 --- a/src/nvidia/src/kernel/gpu/ce/kernel_ce.c +++ b/src/nvidia/src/kernel/gpu/ce/kernel_ce.c @@ -805,9 +805,10 @@ NV_STATUS kceTopLevelPceLceMappingsUpdate_IMPL(OBJGPU *pGpu, KernelCE *pKCe) params.exposeCeMask = exposeCeMask; params.bUpdateNvlinkPceLce = bUpdateNvlinkPceLce; - NV_ASSERT_OK_OR_RETURN( - rmapiControlCacheFreeForControl(gpuGetInstance(pGpu), - NV2080_CTRL_CMD_CE_GET_CE_PCE_MASK)); + status = rmapiControlCacheFreeForControl(gpuGetInstance(pGpu), + NV2080_CTRL_CMD_CE_GET_CE_PCE_MASK); + if (status != NV_OK) + goto cleanup; // For GSP clients, the update needs to be routed through ctrl call params.shimInstance = pKCe->shimInstance; @@ -822,7 +823,7 @@ NV_STATUS kceTopLevelPceLceMappingsUpdate_IMPL(OBJGPU *pGpu, KernelCE *pKCe) { NV_PRINTF(LEVEL_ERROR, "Failed to update PCE-LCE mappings. Return\n"); - return status; + goto cleanup; } // @@ -833,6 +834,7 @@ NV_STATUS kceTopLevelPceLceMappingsUpdate_IMPL(OBJGPU *pGpu, KernelCE *pKCe) // status = kceUpdateClassDB_HAL(pGpu, pKCe); +cleanup: ceResumeCeUtilsScheduling(pGpu); return status;