Merged
2 changes: 1 addition & 1 deletion .github/workflows/README.md
@@ -12,4 +12,4 @@

PRs run a reduced test matrix (Python 3.10 + 3.13). Main branch runs the full matrix (3.10–3.13). Documentation-only changes skip the full test suite.

The performance workflow posts a sticky benchmark summary comment on same-repo PRs and uploads raw benchmark JSON as workflow artifacts.
The performance workflow posts a sticky benchmark summary comment on same-repo PRs, uploads benchmark JSON plus Markdown summaries as workflow artifacts, and reports regressions or missing benchmarks without blocking the PR.
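The comparison the README describes can be sketched as follows. This is an illustrative simplification, not the actual `scripts/benchmark_report.py`: it assumes pytest-benchmark's JSON layout (a top-level `"benchmarks"` list whose entries carry a `"fullname"` and a `"stats"` dict with a `"mean"` in seconds), and the function names and 15% threshold semantics are this sketch's own.

```python
"""Hypothetical sketch of the benchmark regression check.

Assumes pytest-benchmark's --benchmark-json output format; the helper
names below are illustrative, not the real benchmark_report.py API.
"""
import json


def load_means(path):
    """Map each benchmark's fullname to its mean runtime in seconds."""
    with open(path) as f:
        data = json.load(f)
    return {b["fullname"]: b["stats"]["mean"] for b in data["benchmarks"]}


def find_regressions(current, baseline, threshold=0.15):
    """Return (name, base_mean, current_mean) tuples for benchmarks
    whose mean time grew by more than `threshold` (default 15%)."""
    regressions = []
    for name, base_mean in baseline.items():
        cur_mean = current.get(name)
        if cur_mean is not None and cur_mean > base_mean * (1 + threshold):
            regressions.append((name, base_mean, cur_mean))
    return regressions
```

With `--threshold 0.15` as in the workflow, a benchmark whose mean rises from 1.00s to 1.20s would be flagged, while a 10% slowdown would pass.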
97 changes: 65 additions & 32 deletions .github/workflows/perf.yml
@@ -4,14 +4,27 @@ on:
pull_request:
paths:
- "modelaudit/**"
- "tests/**"
- "tests/benchmarks/**"
- "tests/helpers/**"
- "tests/conftest.py"
- "tests/test_benchmark_report.py"
- "scripts/benchmark_report.py"
- "pyproject.toml"
- "uv.lock"
- ".github/workflows/perf.yml"
push:
branches:
- main
paths:
- "modelaudit/**"
- "tests/benchmarks/**"
- "tests/helpers/**"
- "tests/conftest.py"
- "tests/test_benchmark_report.py"
- "scripts/benchmark_report.py"
- "pyproject.toml"
- "uv.lock"
- ".github/workflows/perf.yml"
workflow_dispatch:

permissions:
@@ -42,63 +55,84 @@ jobs:
run: |
uv python pin 3.11

- name: Prepare benchmark temp directories
id: paths
run: |
artifact_dir="$RUNNER_TEMP/modelaudit-benchmarks"
base_worktree="$RUNNER_TEMP/modelaudit-base"
echo "BENCHMARK_ARTIFACT_DIR=$artifact_dir" >> "$GITHUB_ENV"
echo "BENCHMARK_BASE_WORKTREE=$base_worktree" >> "$GITHUB_ENV"
echo "artifact_dir=$artifact_dir" >> "$GITHUB_OUTPUT"
echo "base_worktree=$base_worktree" >> "$GITHUB_OUTPUT"
rm -rf "$artifact_dir" "$base_worktree"
mkdir -p "$artifact_dir"

- name: Benchmark base commit
if: github.event_name == 'pull_request'
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
run: |
git worktree add /tmp/modelaudit-base "${{ github.event.pull_request.base.sha }}"
cd /tmp/modelaudit-base
if [ ! -f tests/benchmarks/test_scan_benchmarks.py ]; then
set -euo pipefail
git worktree add --detach "$BENCHMARK_BASE_WORKTREE" "$BASE_SHA"
if [ ! -f "$BENCHMARK_BASE_WORKTREE/tests/benchmarks/test_scan_benchmarks.py" ]; then
echo "Base branch does not include the benchmark suite yet; skipping baseline run."
exit 0
fi
uv python pin 3.11
uv run --locked --with pytest-benchmark pytest \
uv run --directory "$BENCHMARK_BASE_WORKTREE" --python 3.11 --locked --with pytest-benchmark pytest \
tests/benchmarks/test_scan_benchmarks.py \
--benchmark-json=/tmp/benchmark-base.json \
--benchmark-json="$BENCHMARK_ARTIFACT_DIR/benchmark-base.json" \
-q

- name: Benchmark current commit
run: |
# Keep this lane single-process; pytest-benchmark disables itself under xdist.
uv run --locked --with pytest-benchmark pytest \
tests/benchmarks/test_scan_benchmarks.py \
--benchmark-json=/tmp/benchmark-head.json \
--benchmark-json="$BENCHMARK_ARTIFACT_DIR/benchmark-head.json" \
-q

- name: Compare against base
id: compare
if: github.event_name == 'pull_request'
continue-on-error: true
run: |
if [ -f /tmp/benchmark-base.json ]; then
set +e
{
echo "[Workflow run and artifacts](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})"
echo
} > "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
if [ -f "$BENCHMARK_ARTIFACT_DIR/benchmark-base.json" ]; then
uv run --locked python scripts/benchmark_report.py \
--current /tmp/benchmark-head.json \
--baseline /tmp/benchmark-base.json \
--current "$BENCHMARK_ARTIFACT_DIR/benchmark-head.json" \
--baseline "$BENCHMARK_ARTIFACT_DIR/benchmark-base.json" \
--threshold 0.15 \
--summary-file /tmp/benchmark-comment.md
compare_exit=$?
set -e
--summary-file "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md"
Comment on lines 103 to +107
Contributor

⚠️ Potential issue | 🟠 Major

Wire --fail-on-missing into the PR comparison.

benchmark_report.py only returns a non-zero exit code for dropped baseline benchmarks when --fail-on-missing is passed. This invocation never sets it, so the compare step stays green on exactly the case the PR is supposed to block. Capture the exit code and fail after the markdown has been appended so the PR comment and step summary still include the report.

One way to restore the missing-benchmark gate without losing the summary output
       - name: Compare against base
         id: compare
         if: github.event_name == 'pull_request'
         run: |
+          compare_status=0
           {
             echo "[Workflow run and artifacts]($BENCHMARK_RUN_URL)"
             echo
           } > "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
           if [ -f "$BENCHMARK_ARTIFACT_DIR/benchmark-base.json" ]; then
             uv run --locked python scripts/benchmark_report.py \
               --current "$BENCHMARK_ARTIFACT_DIR/benchmark-head.json" \
               --baseline "$BENCHMARK_ARTIFACT_DIR/benchmark-base.json" \
               --threshold 0.15 \
-              --summary-file "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md"
+              --summary-file "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md" \
+              --fail-on-missing || compare_status=$?
           else
             {
               echo "Base branch does not include the benchmark suite yet; showing current results only."
               echo
@@
           if [ -f "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md" ]; then
             cat "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md" >> "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
           fi
           cat "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md" >> "$GITHUB_STEP_SUMMARY"
+          exit "$compare_status"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.github/workflows/perf.yml around lines 100 - 104, The workflow invocation
of benchmark_report.py should pass the --fail-on-missing flag and capture its
exit code so missing baseline benchmarks cause the job to fail only after the
markdown summary is appended; update the uv run command that calls
benchmark_report.py to include --fail-on-missing and store the script's exit
status, then continue to append "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md"
to the PR summary and finally exit non-zero if the captured exit code indicates
missing benchmarks. Target the invocation that builds the compare (the uv run
... benchmark_report.py call and its use of
"$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md") and ensure the script’s exit
code is propagated after writing the report.

else
{
echo "Base branch does not include the benchmark suite yet; showing current results only."
echo
} > /tmp/benchmark-comment.md
} > "$BENCHMARK_ARTIFACT_DIR/benchmark-current.md"
uv run --locked python scripts/benchmark_report.py \
--current /tmp/benchmark-head.json \
--summary-file /tmp/benchmark-current.md
cat /tmp/benchmark-current.md >> /tmp/benchmark-comment.md
compare_exit=0
--current "$BENCHMARK_ARTIFACT_DIR/benchmark-head.json" \
--summary-file "$BENCHMARK_ARTIFACT_DIR/benchmark-current-report.md"
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-current.md" >> "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-current-report.md" >> "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
fi
if [ -f "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md" ]; then
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-compare.md" >> "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md"
fi
cat /tmp/benchmark-comment.md >> "$GITHUB_STEP_SUMMARY"
exit "$compare_exit"
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-comment.md" >> "$GITHUB_STEP_SUMMARY"

- name: Summarize current results
if: github.event_name != 'pull_request'
run: |
{
echo "[Workflow run and artifacts](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})"
echo
} > "$BENCHMARK_ARTIFACT_DIR/benchmark-summary.md"
uv run --locked python scripts/benchmark_report.py \
--current /tmp/benchmark-head.json \
--summary-file "$GITHUB_STEP_SUMMARY"
--current "$BENCHMARK_ARTIFACT_DIR/benchmark-head.json" \
--summary-file "$BENCHMARK_ARTIFACT_DIR/benchmark-current.md"
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-current.md" >> "$BENCHMARK_ARTIFACT_DIR/benchmark-summary.md"
cat "$BENCHMARK_ARTIFACT_DIR/benchmark-summary.md" >> "$GITHUB_STEP_SUMMARY"

- name: Comment benchmark summary on PR
if: >
@@ -107,7 +141,7 @@
github.event.pull_request.head.repo.full_name == github.repository
uses: actions/github-script@v8
env:
COMMENT_BODY_PATH: /tmp/benchmark-comment.md
COMMENT_BODY_PATH: ${{ steps.paths.outputs.artifact_dir }}/benchmark-comment.md
with:
script: |
const fs = require("fs")
@@ -152,12 +186,11 @@ jobs:
uses: actions/upload-artifact@v7
with:
name: benchmark-results-python-3.11
path: |
/tmp/benchmark-head.json
/tmp/benchmark-base.json
path: ${{ steps.paths.outputs.artifact_dir }}
if-no-files-found: error
retention-days: 14

- name: Fail on benchmark regression
if: github.event_name == 'pull_request' && steps.compare.outcome == 'failure'
run: exit 1
- name: Cleanup benchmark base worktree
if: always() && github.event_name == 'pull_request'
run: |
git worktree remove --force "$BENCHMARK_BASE_WORKTREE" || true