From 1af213dce557a9aedbb95a70e6e1be9ba11afabe Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sat, 6 Jun 2026 17:25:20 -0400 Subject: [PATCH 1/5] Makefile: skip rollout restart on fresh deploy-bink On a fresh cluster, deploy-bink was doing a rollout restart immediately after the initial deploy, which is pointless. Make it conditional on the Deployment already existing, so fresh deploys proceed without interruption. (Yeah, probably going to move off of Makefile and to Justfile in a follow-up...) Assisted-by: Pi (Claude Opus 4.6) --- Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 350faa0..1f6f8dd 100644 --- a/Makefile +++ b/Makefile @@ -116,8 +116,13 @@ deploy-bink: kustomize ## Deploy to a bink cluster (idempotent, requires: buildi bink cluster start --cluster-name $(BINK_CLUSTER_NAME) --node-name controller --api-port 0 --expose $(KUBECONFIG_BINK) \ $(if $(BINK_NODE_IMAGE),--node-image $(BINK_NODE_IMAGE)) kubectl --kubeconfig $(KUBECONFIG_BINK) wait --for=condition=Ready node/controller --timeout=5m - $(MAKE) deploy KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) IMG=$(IMG_BINK) - kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout restart deployment/bootc-operator-controller-manager + # On re-deploy, restart the rollout to force a re-pull of the :latest tag. + # On fresh deploy, skip the restart -- the pod is already pulling the correct image. + @existed=$$(kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator get deploy bootc-operator-controller-manager -o name 2>/dev/null || true) && \ + $(MAKE) deploy KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) IMG=$(IMG_BINK) && \ + if [ -n "$$existed" ]; then \ + kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout restart deployment/bootc-operator-controller-manager; \ + fi kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout status deployment/bootc-operator-controller-manager --timeout=3m .PHONY: teardown-bink From 03294f5d99e830d20e505a778822df36f742d486 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sat, 6 Jun 2026 22:31:14 -0400 Subject: [PATCH 2/5] Makefile: split out start-bink target from deploy-bink deploy-bink was doing two distinct things: 1. make sure a bink cluster is up and running 2. redeploy the bootc-operator controller Let's split out the first part into a separate `start-bink` target. That'll allow us to split the steps more in CI but also is prep for error-handling related stuff. --- Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 1f6f8dd..6812ed8 100644 --- a/Makefile +++ b/Makefile @@ -108,14 +108,17 @@ undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/. # Note the :latest tag here: this makes the pull policy be Always. IMG_BINK ?= registry.cluster.local:5000/bootc-operator-e2e:latest -.PHONY: deploy-bink -deploy-bink: kustomize ## Deploy to a bink cluster (idempotent, requires: buildimg). +.PHONY: start-bink +start-bink: ## Start a bink cluster (idempotent). bink registry start - podman push --tls-verify=false $(IMG) localhost:5000/bootc-operator-e2e:latest bink cluster list 2>&1 | grep -qw $(BINK_CLUSTER_NAME) || \ bink cluster start --cluster-name $(BINK_CLUSTER_NAME) --node-name controller --api-port 0 --expose $(KUBECONFIG_BINK) \ $(if $(BINK_NODE_IMAGE),--node-image $(BINK_NODE_IMAGE)) kubectl --kubeconfig $(KUBECONFIG_BINK) wait --for=condition=Ready node/controller --timeout=5m + +.PHONY: deploy-bink +deploy-bink: start-bink kustomize ## Deploy to a bink cluster (requires: buildimg). + podman push --tls-verify=false $(IMG) localhost:5000/bootc-operator-e2e:latest # On re-deploy, restart the rollout to force a re-pull of the :latest tag. # On fresh deploy, skip the restart -- the pod is already pulling the correct image. @existed=$$(kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator get deploy bootc-operator-controller-manager -o name 2>/dev/null || true) && \ From 1969c94917cfe0e22dd8e40edf25348bdca373eb Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sat, 6 Jun 2026 18:01:31 -0400 Subject: [PATCH 3/5] Makefile: add `gather-bink` for log gathering Add a new `hack/gather-logs.sh` for log gathering from the bink cluster and the controller and a new `gather-bink` Makefile target that calls it. Generally useful but also prep for hooking it up to CI. Assisted-by: Pi (Claude Opus 4.6) --- .gitignore | 1 + Makefile | 6 +++++ hack/gather-logs.sh | 55 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100755 hack/gather-logs.sh diff --git a/.gitignore b/.gitignore index 2f20c60..092058e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +_output/ bin/ *.out kubeconfig-* diff --git a/Makefile b/Makefile index 6812ed8..285aad9 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ CONTAINER_TOOL ?= podman # To use a separate dev cluster: make deploy-bink BINK_CLUSTER_NAME=dev BINK_CLUSTER_NAME ?= e2e KUBECONFIG_BINK ?= ./kubeconfig-$(BINK_CLUSTER_NAME) +ARTIFACTS ?= $(abspath _output/logs) # YEAR defines the year value used for substituting the YEAR placeholder in the boilerplate header. YEAR ?= $(shell date +%Y) @@ -128,6 +129,11 @@ deploy-bink: start-bink kustomize ## Deploy to a bink cluster (requires: buildim fi kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout status deployment/bootc-operator-controller-manager --timeout=3m +.PHONY: gather-bink +gather-bink: ## Gather diagnostic logs from the bink cluster. + KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) BINK_CLUSTER_NAME=$(BINK_CLUSTER_NAME) \ + hack/gather-logs.sh $(ARTIFACTS)/gather-bink controller + .PHONY: teardown-bink teardown-bink: ## Tear down the bink cluster. bink cluster stop --remove-data --cluster-name $(BINK_CLUSTER_NAME) diff --git a/hack/gather-logs.sh b/hack/gather-logs.sh new file mode 100755 index 0000000..d6ef7cf --- /dev/null +++ b/hack/gather-logs.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Gather diagnostic logs from a bink cluster. +# +# Usage: hack/gather-logs.sh [node-names...] +# +# Expects KUBECONFIG and BINK_CLUSTER_NAME from environment. +# Each command's output is written to a separate file in . +# Individual command failures are non-fatal. + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [node-names...]" >&2 + exit 1 +fi + +: "${KUBECONFIG:?must be set}" +: "${BINK_CLUSTER_NAME:?must be set}" + +output_dir="$1" +shift +nodes=("$@") + +mkdir -p "${output_dir}" + +echo "Gathering logs to ${output_dir}..." + +run() { + local filename="$1" + shift + echo " ${filename}" + "$@" > "${output_dir}/${filename}" 2>&1 || true +} + +# Cluster-wide commands +run "k-get-pods.txt" kubectl get pods -n bootc-operator -o wide +run "k-describe-pods.txt" kubectl describe pods -n bootc-operator +run "k-get-deployment.yaml" kubectl get deployment -n bootc-operator -o yaml +run "k-describe-bootcnodepools.txt" kubectl describe bootcnodepools +run "k-describe-bootcnodes.txt" kubectl describe bootcnodes +run "k-get-events.txt" kubectl get events -n bootc-operator --sort-by=.lastTimestamp + +# Pod logs +for pod in $(kubectl get pods -n bootc-operator -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do + run "k-logs-${pod}.log" kubectl logs -n bootc-operator "${pod}" --all-containers + run "k-logs-${pod}-previous.log" kubectl logs -n bootc-operator "${pod}" --all-containers --previous +done + +# Per-node commands +for node in "${nodes[@]}"; do + run "k-describe-node-${node}.txt" kubectl describe node "${node}" + run "journal-${node}.txt" bink node ssh "${node}" --cluster-name "${BINK_CLUSTER_NAME}" -- journalctl --no-pager +done + +echo "Done." From ed5a586b6c30789f4c3323b489d2def124e44a79 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sat, 6 Jun 2026 22:04:45 -0400 Subject: [PATCH 4/5] e2eutil: gather diagnostic logs in cleanup Call hack/gather-logs.sh from cleanup() before deleting pools and removing nodes, so that the cluster state is captured while everything is still alive. The script is called on every test run (pass or fail) to provide a complete record of cluster state. Log gathering is gated on the ARTIFACTS environment variable. When unset (e.g. running go test directly), gathering is silently skipped. Assisted-by: Pi (Claude Opus 4.6) --- Makefile | 2 ++ test/e2e/e2eutil/env.go | 30 +++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 285aad9..c76131b 100644 --- a/Makefile +++ b/Makefile @@ -59,8 +59,10 @@ e2e: ## Run e2e tests (requires: make deploy-bink). V=1 for verbose. RUN= # actually gives us streaming output (otherwise, it spawns a subprocess for # each package, even though we just have one here--but I really like streaming # output...). + rm -rf $(ARTIFACTS) cd test/e2e && KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) BINK_CLUSTER_NAME=$(BINK_CLUSTER_NAME) \ $(if $(BINK_NODE_IMAGE),BINK_NODE_IMAGE=$(BINK_NODE_IMAGE)) \ + ARTIFACTS=$(ARTIFACTS) \ go test -timeout 10m -count=1 $(if $(V),-v) $(if $(RUN),-run $(RUN)) . ##@ Build diff --git a/test/e2e/e2eutil/env.go b/test/e2e/e2eutil/env.go index 928d062..0472851 100644 --- a/test/e2e/e2eutil/env.go +++ b/test/e2e/e2eutil/env.go @@ -13,6 +13,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "testing" "time" @@ -168,8 +169,11 @@ func (e *Env) TestLabels() map[string]string { return map[string]string{LabelE2ETest: e.testID} } -// cleanup deletes test-scoped resources and bink nodes. +// cleanup gathers diagnostic logs, then deletes test-scoped resources +// and bink nodes. func (e *Env) cleanup(t *testing.T) { + e.gatherLogs(t) + ctx := context.Background() t.Logf("Removing pools with label %s=%s...", LabelE2ETest, e.testID) if err := e.Client.DeleteAllOf(ctx, &bootcv1alpha1.BootcNodePool{}, client.MatchingLabels(e.TestLabels())); err != nil { @@ -183,6 +187,30 @@ func (e *Env) cleanup(t *testing.T) { } } +// gatherLogs calls hack/gather-logs.sh to collect diagnostic logs into +// $ARTIFACTS//. Skipped if ARTIFACTS is not set. +func (e *Env) gatherLogs(t *testing.T) { + artifactsDir := os.Getenv("ARTIFACTS") + if artifactsDir == "" { + return + } + + outputDir := filepath.Join(artifactsDir, e.testID) + + // The test runs from test/e2e/, so resolve the script relative + // to the repo root. + args := []string{"../../hack/gather-logs.sh", outputDir} + args = append(args, e.nodes...) + + t.Logf("Gathering logs to %s...", outputDir) + cmd := exec.Command("bash", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + t.Logf("WARNING: gather-logs.sh failed: %v", err) + } +} + // sanitizeTestName lowercases a test name for use in k8s object names. // Panics if the result exceeds 63 characters (k8s label value limit). func sanitizeTestName(name string) string { From 117a39f1be3f3f984a50568813fd12a50fd85296 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sat, 6 Jun 2026 22:45:16 -0400 Subject: [PATCH 5/5] ci: split steps more and upload logs We need better visibility into failing e2e tests so let's capture logs. --- .github/workflows/ci.yaml | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 74d6b33..0dde973 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -138,8 +138,29 @@ jobs: - name: Start podman socket run: systemctl --user start podman.socket + - name: Build operator image + run: make buildimg + + - name: Start bink cluster + run: make start-bink + + - name: Deploy to bink cluster + run: make deploy-bink + + - name: Gather deploy logs + if: failure() + run: make gather-bink + - name: Run e2e tests - run: make buildimg deploy-bink e2e V=1 + run: make e2e V=1 + + - name: Upload logs + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: e2e-logs + path: _output/logs/ + if-no-files-found: ignore - name: Push to GHCR if: github.event_name == 'push'