diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 74d6b33..0dde973 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -138,8 +138,29 @@ jobs: - name: Start podman socket run: systemctl --user start podman.socket + - name: Build operator image + run: make buildimg + + - name: Start bink cluster + run: make start-bink + + - name: Deploy to bink cluster + run: make deploy-bink + + - name: Gather deploy logs + if: failure() + run: make gather-bink + - name: Run e2e tests - run: make buildimg deploy-bink e2e V=1 + run: make e2e V=1 + + - name: Upload logs + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: e2e-logs + path: _output/logs/ + if-no-files-found: ignore - name: Push to GHCR if: github.event_name == 'push' diff --git a/.gitignore b/.gitignore index 2f20c60..092058e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +_output/ bin/ *.out kubeconfig-* diff --git a/Makefile b/Makefile index 350faa0..c76131b 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ CONTAINER_TOOL ?= podman # To use a separate dev cluster: make deploy-bink BINK_CLUSTER_NAME=dev BINK_CLUSTER_NAME ?= e2e KUBECONFIG_BINK ?= ./kubeconfig-$(BINK_CLUSTER_NAME) +ARTIFACTS ?= $(abspath _output/logs) # YEAR defines the year value used for substituting the YEAR placeholder in the boilerplate header. YEAR ?= $(shell date +%Y) @@ -58,8 +59,10 @@ e2e: ## Run e2e tests (requires: make deploy-bink). V=1 for verbose. RUN= # actually gives us streaming output (otherwise, it spawns a subprocess for # each package, even though we just have one here--but I really like streaming # output...). + rm -rf $(ARTIFACTS) cd test/e2e && KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) BINK_CLUSTER_NAME=$(BINK_CLUSTER_NAME) \ $(if $(BINK_NODE_IMAGE),BINK_NODE_IMAGE=$(BINK_NODE_IMAGE)) \ + ARTIFACTS=$(ARTIFACTS) \ go test -timeout 10m -count=1 $(if $(V),-v) $(if $(RUN),-run $(RUN)) . ##@ Build @@ -108,18 +111,31 @@ undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/. # Note the :latest tag here: this makes the pull policy be Always. IMG_BINK ?= registry.cluster.local:5000/bootc-operator-e2e:latest -.PHONY: deploy-bink -deploy-bink: kustomize ## Deploy to a bink cluster (idempotent, requires: buildimg). +.PHONY: start-bink +start-bink: ## Start a bink cluster (idempotent). bink registry start - podman push --tls-verify=false $(IMG) localhost:5000/bootc-operator-e2e:latest bink cluster list 2>&1 | grep -qw $(BINK_CLUSTER_NAME) || \ bink cluster start --cluster-name $(BINK_CLUSTER_NAME) --node-name controller --api-port 0 --expose $(KUBECONFIG_BINK) \ $(if $(BINK_NODE_IMAGE),--node-image $(BINK_NODE_IMAGE)) kubectl --kubeconfig $(KUBECONFIG_BINK) wait --for=condition=Ready node/controller --timeout=5m - $(MAKE) deploy KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) IMG=$(IMG_BINK) - kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout restart deployment/bootc-operator-controller-manager + +.PHONY: deploy-bink +deploy-bink: start-bink kustomize ## Deploy to a bink cluster (requires: buildimg). + podman push --tls-verify=false $(IMG) localhost:5000/bootc-operator-e2e:latest + # On re-deploy, restart the rollout to force a re-pull of the :latest tag. + # On fresh deploy, skip the restart -- the pod is already pulling the correct image. + @existed=$$(kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator get deploy bootc-operator-controller-manager -o name 2>/dev/null || true) && \ + $(MAKE) deploy KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) IMG=$(IMG_BINK) && \ + if [ -n "$$existed" ]; then \ + kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout restart deployment/bootc-operator-controller-manager; \ + fi kubectl --kubeconfig $(KUBECONFIG_BINK) -n bootc-operator rollout status deployment/bootc-operator-controller-manager --timeout=3m +.PHONY: gather-bink +gather-bink: ## Gather diagnostic logs from the bink cluster. + KUBECONFIG=$(abspath $(KUBECONFIG_BINK)) BINK_CLUSTER_NAME=$(BINK_CLUSTER_NAME) \ + hack/gather-logs.sh $(ARTIFACTS)/gather-bink controller + .PHONY: teardown-bink teardown-bink: ## Tear down the bink cluster. bink cluster stop --remove-data --cluster-name $(BINK_CLUSTER_NAME) diff --git a/hack/gather-logs.sh b/hack/gather-logs.sh new file mode 100755 index 0000000..d6ef7cf --- /dev/null +++ b/hack/gather-logs.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Gather diagnostic logs from a bink cluster. +# +# Usage: hack/gather-logs.sh [node-names...] +# +# Expects KUBECONFIG and BINK_CLUSTER_NAME from environment. +# Each command's output is written to a separate file in . +# Individual command failures are non-fatal. + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [node-names...]" >&2 + exit 1 +fi + +: "${KUBECONFIG:?must be set}" +: "${BINK_CLUSTER_NAME:?must be set}" + +output_dir="$1" +shift +nodes=("$@") + +mkdir -p "${output_dir}" + +echo "Gathering logs to ${output_dir}..." + +run() { + local filename="$1" + shift + echo " ${filename}" + "$@" > "${output_dir}/${filename}" 2>&1 || true +} + +# Cluster-wide commands +run "k-get-pods.txt" kubectl get pods -n bootc-operator -o wide +run "k-describe-pods.txt" kubectl describe pods -n bootc-operator +run "k-get-deployment.yaml" kubectl get deployment -n bootc-operator -o yaml +run "k-describe-bootcnodepools.txt" kubectl describe bootcnodepools +run "k-describe-bootcnodes.txt" kubectl describe bootcnodes +run "k-get-events.txt" kubectl get events -n bootc-operator --sort-by=.lastTimestamp + +# Pod logs +for pod in $(kubectl get pods -n bootc-operator -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do + run "k-logs-${pod}.log" kubectl logs -n bootc-operator "${pod}" --all-containers + run "k-logs-${pod}-previous.log" kubectl logs -n bootc-operator "${pod}" --all-containers --previous +done + +# Per-node commands +for node in "${nodes[@]}"; do + run "k-describe-node-${node}.txt" kubectl describe node "${node}" + run "journal-${node}.txt" bink node ssh "${node}" --cluster-name "${BINK_CLUSTER_NAME}" -- journalctl --no-pager +done + +echo "Done." diff --git a/test/e2e/e2eutil/env.go b/test/e2e/e2eutil/env.go index 928d062..0472851 100644 --- a/test/e2e/e2eutil/env.go +++ b/test/e2e/e2eutil/env.go @@ -13,6 +13,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "testing" "time" @@ -168,8 +169,11 @@ func (e *Env) TestLabels() map[string]string { return map[string]string{LabelE2ETest: e.testID} } -// cleanup deletes test-scoped resources and bink nodes. +// cleanup gathers diagnostic logs, then deletes test-scoped resources +// and bink nodes. func (e *Env) cleanup(t *testing.T) { + e.gatherLogs(t) + ctx := context.Background() t.Logf("Removing pools with label %s=%s...", LabelE2ETest, e.testID) if err := e.Client.DeleteAllOf(ctx, &bootcv1alpha1.BootcNodePool{}, client.MatchingLabels(e.TestLabels())); err != nil { @@ -183,6 +187,30 @@ func (e *Env) cleanup(t *testing.T) { } } +// gatherLogs calls hack/gather-logs.sh to collect diagnostic logs into +// $ARTIFACTS//. Skipped if ARTIFACTS is not set. +func (e *Env) gatherLogs(t *testing.T) { + artifactsDir := os.Getenv("ARTIFACTS") + if artifactsDir == "" { + return + } + + outputDir := filepath.Join(artifactsDir, e.testID) + + // The test runs from test/e2e/, so resolve the script relative + // to the repo root. + args := []string{"../../hack/gather-logs.sh", outputDir} + args = append(args, e.nodes...) + + t.Logf("Gathering logs to %s...", outputDir) + cmd := exec.Command("bash", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + t.Logf("WARNING: gather-logs.sh failed: %v", err) + } +} + // sanitizeTestName lowercases a test name for use in k8s object names. // Panics if the result exceeds 63 characters (k8s label value limit). func sanitizeTestName(name string) string {