From 6067a7871c68b0919b669f10ea777eea042c7f44 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 10 Jun 2026 21:09:30 +0000 Subject: [PATCH 1/4] CI: require N consecutive nvidia-smi successes after device cycle Multi-GPU Windows rows (observed on 2x H100 MCDM after #2176 landed) keep failing the "Ensure GPU is working" step with `Failed to initialize NVML: Not Found`. Root cause: after `pnputil` cycles both display devices, NVML briefly reports success mid-init then flaps back to "Not Found" a couple of seconds later. The existing poll exits on the *first* `nvidia-smi` exit code 0, so the loop bails ~2 seconds in and the next workflow step hits the flap window. Scale the consecutive-success requirement to the number of cycled NVIDIA devices (1 for single-GPU rows, 2 for the H100 pair) and bump the inter-iteration sleep from 2 to 3 seconds. Single-GPU rows pay an extra 1-sec floor; multi-GPU rows now need two consecutive successful probes 3 sec apart (a ~6-sec minimum) before moving on. The 60-sec deadline is unchanged; the loop still bails (and the script fails loudly) if NVML doesn't settle in time. --- ci/tools/configure_driver_mode.ps1 | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 index 42e0914935d..5494995bce1 100644 --- a/ci/tools/configure_driver_mode.ps1 +++ b/ci/tools/configure_driver_mode.ps1 @@ -30,8 +30,10 @@ function Set-DriverMode { exit 1 } - # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) - $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" + # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA). + # @(...) forces an array even when there's a single device, so .Count works. 
+ $nvidia_devices = @(Get-PnpDevice -Class Display -FriendlyName "NVIDIA*") + $gpu_count = $nvidia_devices.Count foreach ($device in $nvidia_devices) { Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" pnputil /disable-device "$($device.InstanceId)" @@ -39,17 +41,22 @@ function Set-DriverMode { } # Poll nvidia-smi until NVML can initialize, or give up after ~60s. - # A fixed sleep is not enough on slower-coming-back-up multi-GPU rows - # (e.g. 2x H100 MCDM) where pnputil enable returns before NVML is - # ready. Pattern borrowed from the runner-team `nvgha-driver.ps1`. + # Require N consecutive successes where N == number of GPUs we just + # cycled: on multi-GPU rows (observed on 2x H100 MCDM), NVML briefly + # reports "ok" mid-init then flaps back to "Not Found", so a single + # success isn't enough. Scaling the consecutive-success requirement + # to the device count gives the settle window room to grow with the + # hardware. Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." 
$deadline = (Get-Date).AddSeconds(60) + $consecutive_ok = 0 do { - Start-Sleep -Seconds 2 + Start-Sleep -Seconds 3 & nvidia-smi.exe 2>&1 | Out-Null - } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline) - if ($LASTEXITCODE -ne 0) { - Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle" + if ($LASTEXITCODE -eq 0) { $consecutive_ok++ } else { $consecutive_ok = 0 } + } while ($consecutive_ok -lt $gpu_count -and (Get-Date) -lt $deadline) + if ($consecutive_ok -lt $gpu_count) { + Write-Error "nvidia-smi did not return cleanly $gpu_count times in a row within 60s of the device cycle" exit 1 } } From 3777a8da20acdb7eadf60ce6f8b7ff62eab407e5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 10 Jun 2026 21:51:56 +0000 Subject: [PATCH 2/4] CI: restore the pre-#2176 5-sec unconditional settle before the poll Pre-#2176, every Windows row ran install_gpu_driver.ps1 unconditionally and that script ended with a fixed `Start-Sleep -Seconds 5` after the pnputil cycle. #2176 dropped that floor (the poll exits on the first nvidia-smi success at ~2 sec on single-GPU, ~2 sec mid-flap on the H100 pair). Put the 5-sec floor back, before the consecutive-success poll, so we never settle for less than the known-good baseline. --- ci/tools/configure_driver_mode.ps1 | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 index 5494995bce1..d8038fc81a7 100644 --- a/ci/tools/configure_driver_mode.ps1 +++ b/ci/tools/configure_driver_mode.ps1 @@ -40,13 +40,20 @@ function Set-DriverMode { pnputil /enable-device "$($device.InstanceId)" } - # Poll nvidia-smi until NVML can initialize, or give up after ~60s. - # Require N consecutive successes where N == number of GPUs we just - # cycled: on multi-GPU rows (observed on 2x H100 MCDM), NVML briefly - # reports "ok" mid-init then flaps back to "Not Found", so a single - # success isn't enough. 
Scaling the consecutive-success requirement - # to the device count gives the settle window room to grow with the - # hardware. + # Restore the unconditional 5-sec settle that the pre-#2176 + # install_gpu_driver.ps1 had after the cycle. With #2176, on + # DRIVER=latest rows we no longer run an install (which previously + # acted as additional warm-up), so the floor had effectively + # dropped to whatever the poll's first iteration was. + Start-Sleep -Seconds 5 + + # Then poll nvidia-smi until NVML can initialize, or give up after + # ~60s. Require N consecutive successes where N == number of GPUs + # we just cycled: on multi-GPU rows (observed on 2x H100 MCDM) + # NVML briefly reports "ok" mid-init then flaps back to "Not + # Found", so a single success isn't enough. Scaling the + # consecutive-success requirement to the device count gives the + # settle window room to grow with the hardware. Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." $deadline = (Get-Date).AddSeconds(60) $consecutive_ok = 0 From cf5bfd1f248bba5388f5b7e0e937bec7daca6b34 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 10 Jun 2026 21:55:08 +0000 Subject: [PATCH 3/4] CI: trim configure_driver_mode.ps1 comments for portability --- ci/tools/configure_driver_mode.ps1 | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 index d8038fc81a7..f4947f53606 100644 --- a/ci/tools/configure_driver_mode.ps1 +++ b/ci/tools/configure_driver_mode.ps1 @@ -40,20 +40,11 @@ function Set-DriverMode { pnputil /enable-device "$($device.InstanceId)" } - # Restore the unconditional 5-sec settle that the pre-#2176 - # install_gpu_driver.ps1 had after the cycle. With #2176, on - # DRIVER=latest rows we no longer run an install (which previously - # acted as additional warm-up), so the floor had effectively - # dropped to whatever the poll's first iteration was. 
+ # Initial settle after the device cycle. Start-Sleep -Seconds 5 - # Then poll nvidia-smi until NVML can initialize, or give up after - # ~60s. Require N consecutive successes where N == number of GPUs - # we just cycled: on multi-GPU rows (observed on 2x H100 MCDM) - # NVML briefly reports "ok" mid-init then flaps back to "Not - # Found", so a single success isn't enough. Scaling the - # consecutive-success requirement to the device count gives the - # settle window room to grow with the hardware. + # Poll nvidia-smi for N consecutive successes (N == cycled GPUs) + # so a mid-init "ok" flap doesn't fool the loop; bail after ~60s. Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." $deadline = (Get-Date).AddSeconds(60) $consecutive_ok = 0 From 4909e110f3bc2819c18ce5ff95410d31537eba59 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 10 Jun 2026 21:56:11 +0000 Subject: [PATCH 4/4] CI: drop redundant @(...) comment --- ci/tools/configure_driver_mode.ps1 | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 index f4947f53606..b055ca39102 100644 --- a/ci/tools/configure_driver_mode.ps1 +++ b/ci/tools/configure_driver_mode.ps1 @@ -31,7 +31,6 @@ function Set-DriverMode { } # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA). - # @(...) forces an array even when there's a single device, so .Count works. $nvidia_devices = @(Get-PnpDevice -Class Display -FriendlyName "NVIDIA*") $gpu_count = $nvidia_devices.Count foreach ($device in $nvidia_devices) {