From 77151ce138fe523b969febe1f89aaae56ed8a205 Mon Sep 17 00:00:00 2001 From: Joey Freeland Date: Mon, 15 Jun 2026 15:58:14 -0400 Subject: [PATCH 1/3] fix(pg_upgrade): retry nix-store -r to avoid hangs on stalled S3 fetches --- Review note (not part of the commit message): this patch replaces the single nix-store -r call with up to three attempts, each wrapped in timeout. The 120-second limit covers the whole nix-store -r run, so an attempt that is still making progress at 120 s is terminated and retried the same way a stalled one is; -k 10 follows up with a KILL 10 s later if the process has not exited. The closing [ "$nix_store_ok" = "true" ] test is what reports overall failure; per the code comment it relies on the script's ERR trap, which is defined outside this hunk (TODO confirm against the full script). .../pg_upgrade_scripts/initiate.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh index 64020bf04..ebe356b3c 100755 --- a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh +++ b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh @@ -348,8 +348,22 @@ EXTRA_NIX_CONF echo "Store path: $STORE_PATH" - # Realize from binary cache (no nix evaluation needed!) - nix-store -r "$STORE_PATH" + # Realize the closure from the binary cache. + # + # nix-store -r can stall indefinitely on a dropped S3 connection without + # erroring out (its own download timeout doesn't reliably fire), so guard each + # attempt with a timeout and retry. nix-store is resumable, so a retry only + # re-fetches the unfinished paths. Failing as a normal command (not exit 1) + # lets the ERR trap run cleanup and record "failed" instead of hanging. 
+ nix_store_ok="false" + for attempt in 1 2 3; do + if timeout -k 10 120 nix-store -r "$STORE_PATH"; then + nix_store_ok="true" + break + fi + echo "WARNING: nix-store -r attempt ${attempt}/3 for $STORE_PATH failed or stalled (>120s); retrying" + done + [ "$nix_store_ok" = "true" ] PG_UPGRADE_BIN_DIR="$STORE_PATH" PGSHARENEW="$PG_UPGRADE_BIN_DIR/share/postgresql" From 22d31beb698dc8c283b2f18d5ff60d66762435b2 Mon Sep 17 00:00:00 2001 From: Joey Freeland Date: Tue, 16 Jun 2026 12:37:18 -0400 Subject: [PATCH 2/3] fix: cleaner log, comment --- Review note (not part of the commit message): no change to the retry logic. This patch rewords the explanatory comment (already-registered paths are skipped on retry, but an in-flight NAR restarts from 0% instead of resuming) and splits the log line so that attempts 1 and 2 print the WARNING with "retrying" while the final attempt prints an ERROR instead. .../pg_upgrade_scripts/initiate.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh index ebe356b3c..ed7ed7f95 100755 --- a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh +++ b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh @@ -352,16 +352,21 @@ EXTRA_NIX_CONF # # nix-store -r can stall indefinitely on a dropped S3 connection without # erroring out (its own download timeout doesn't reliably fire), so guard each - # attempt with a timeout and retry. nix-store is resumable, so a retry only - # re-fetches the unfinished paths. Failing as a normal command (not exit 1) - # lets the ERR trap run cleanup and record "failed" instead of hanging. + # attempt with a timeout and retry. Each path downloads atomically: already- + # registered paths are skipped on retry but an in-flight NAR restarts from + # 0%. Failing as a normal command (not exit 1) lets the ERR trap run cleanup + # and record "failed" instead of hanging. 
nix_store_ok="false" for attempt in 1 2 3; do if timeout -k 10 120 nix-store -r "$STORE_PATH"; then nix_store_ok="true" break fi - echo "WARNING: nix-store -r attempt ${attempt}/3 for $STORE_PATH failed or stalled (>120s); retrying" + if [ "$attempt" -lt 3 ]; then + echo "WARNING: nix-store -r attempt ${attempt}/3 for $STORE_PATH failed or stalled (>120s); retrying" + else + echo "ERROR: nix-store -r failed after 3 attempts for $STORE_PATH" + fi done [ "$nix_store_ok" = "true" ] From 6422c43f45821c37a976c157d10f140a3a1d3464 Mon Sep 17 00:00:00 2001 From: Joey Freeland <30938344+jfreeland@users.noreply.github.com> Date: Tue, 16 Jun 2026 12:54:14 -0400 Subject: [PATCH 3/3] chore: valid copilot feedback Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- Review note (not part of the commit message): this patch adds an explicit "s" unit suffix to both timeout durations (the numeric values 10 and 120 are unchanged) and rewords the WARNING text to mention the 10 s kill grace period that can follow the 120 s limit. The attempt count and control flow are untouched. .../files/admin_api_scripts/pg_upgrade_scripts/initiate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh index ed7ed7f95..c09fa1f33 100755 --- a/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh +++ b/ansible/files/admin_api_scripts/pg_upgrade_scripts/initiate.sh @@ -358,12 +358,12 @@ EXTRA_NIX_CONF # and record "failed" instead of hanging. nix_store_ok="false" for attempt in 1 2 3; do - if timeout -k 10 120 nix-store -r "$STORE_PATH"; then + if timeout -k 10s 120s nix-store -r "$STORE_PATH"; then nix_store_ok="true" break fi if [ "$attempt" -lt 3 ]; then - echo "WARNING: nix-store -r attempt ${attempt}/3 for $STORE_PATH failed or stalled (>120s); retrying" + echo "WARNING: nix-store -r attempt ${attempt}/3 for $STORE_PATH failed or stalled (>=120s + up to 10s kill grace); retrying" else echo "ERROR: nix-store -r failed after 3 attempts for $STORE_PATH" fi