Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions .github/workflows/evals-report.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# E2E Evals Report
#
# Runs after the "E2E Evals" workflow completes and posts (or updates) a single
# summary comment on the pull request that triggered it. The summary is built
# from the eval-* JSON artifacts uploaded by that run.
#
# Security note: this workflow is triggered by workflow_run and holds a token
# with pull-requests: write. Values taken from the triggering run (branch name,
# head owner) are therefore passed to the script through env vars only, never
# expanded inline into the shell source.
name: E2E Evals Report
on:
  workflow_run:
    workflows: ["E2E Evals"]
    types: [completed]

# One report per triggering run; the group key is the triggering run's id.
concurrency:
  group: evals-report-${{ github.event.workflow_run.id }}
  cancel-in-progress: true

jobs:
  report:
    runs-on: ubicloud-standard-2
    # Only report for runs that were started by a pull_request event.
    if: ${{ github.event.workflow_run.event == 'pull_request' }}
    timeout-minutes: 5
    permissions:
      # actions: read is needed to download artifacts that belong to a
      # different workflow run (the one that triggered this report).
      actions: read
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Download all eval artifacts
        uses: actions/download-artifact@v4
        with:
          # run-id is only honoured when github-token is also supplied;
          # without it the action looks in the current run, which has no
          # eval artifacts.
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          pattern: eval-*
          path: /tmp/eval-results
          merge-multiple: true

      - name: Post PR comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          # Untrusted for fork PRs: keep these in env vars, not in the script.
          HEAD_OWNER: ${{ github.event.workflow_run.head_repository.owner.login }}
          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
        run: |
          # Resolve the PR number from the head owner/branch of the triggering run.
          # The value is sent as a query parameter so gh URL-encodes it.
          PR_NUMBER=$(gh api --method GET "repos/${REPO}/pulls" \
            -f "head=${HEAD_OWNER}:${HEAD_BRANCH}" --jq '.[0].number')
          if [ -z "$PR_NUMBER" ]; then
            echo "Could not find PR for branch ${HEAD_BRANCH}"
            exit 0
          fi

          # Collect every result file in a stable order; nothing to do if none.
          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Aggregate totals and build one markdown table row per suite.
          TOTAL=0; PASSED=0; FAILED=0; COST="0"
          SUITE_LINES=""
          while IFS= read -r f; do
            if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
              echo "Skipping malformed JSON: $f"
              continue
            fi
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Suites that ran no tests are left out of the table and totals.
            if [ "$T" -eq 0 ]; then continue; fi
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # Costs are decimals, so sum them with bc instead of shell arithmetic.
            COST=$(echo "$COST + $C" | bc)
            STATUS_ICON="✅"
            if [ "$F" -gt 0 ]; then STATUS_ICON="❌"; fi
            # Real newline instead of a literal backslash-n, so JSON text is
            # never passed through echo -e escape processing.
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |"$'\n'
          done <<< "$RESULTS"

          STATUS="✅ PASS"
          if [ "$FAILED" -gt 0 ]; then STATUS="❌ FAIL"; fi

          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          $(printf '%s' "$SUITE_LINES")

          ---
          *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"

          # Append a bullet list of individual failing tests, if any.
          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            while IFS= read -r f; do
              if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
              F=$(jq -r '.failed // 0' "$f")
              if [ "$F" -eq 0 ]; then continue; fi
              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
              FAILURES="${FAILURES}${FAILS}"$'\n'
            done <<< "$RESULTS"
            BODY="${BODY}

          ### Failures
          $(printf '%s' "$FAILURES")"
          fi

          # Update existing comment or create new one. --paginate walks every
          # page of comments so an earlier report is found on busy PRs too.
          COMMENT_ID=$(gh api --paginate "repos/${REPO}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${REPO}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --repo "$REPO" --body "$BODY"
          fi
93 changes: 0 additions & 93 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,96 +145,3 @@ jobs:
name: eval-${{ matrix.suite.name }}
path: ~/.gstack-dev/evals/*.json
retention-days: 90

# Old in-workflow report job (shown as removed in this diff): after the evals
# job, collect the eval-* artifacts of the same run and post/update a PR comment.
report:
runs-on: ubicloud-standard-2
needs: evals
# always() lets the report run even when the evals job failed; PR events only.
if: always() && github.event_name == 'pull_request'
timeout-minutes: 5
permissions:
contents: read
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

# No run-id is given, so artifacts come from the current workflow run.
- name: Download all eval artifacts
uses: actions/download-artifact@v4
with:
pattern: eval-*
path: /tmp/eval-results
merge-multiple: true

- name: Post PR comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Gather all result files in sorted order; exit quietly when there are none.
# shellcheck disable=SC2086,SC2059
RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
if [ -z "$RESULTS" ]; then
echo "No eval results found"
exit 0
fi

# Running totals across suites plus one markdown table row per suite.
TOTAL=0; PASSED=0; FAILED=0; COST="0"
SUITE_LINES=""
for f in $RESULTS; do
# Files without a usable .total_tests field are reported and skipped.
if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
echo "Skipping malformed JSON: $f"
continue
fi
T=$(jq -r '.total_tests // 0' "$f")
P=$(jq -r '.passed // 0' "$f")
F=$(jq -r '.failed // 0' "$f")
C=$(jq -r '.total_cost_usd // 0' "$f")
TIER=$(jq -r '.tier // "unknown"' "$f")
# Suites with zero tests contribute nothing to the table or totals.
[ "$T" -eq 0 ] && continue
TOTAL=$((TOTAL + T))
PASSED=$((PASSED + P))
FAILED=$((FAILED + F))
# Cost is summed with bc because it may be a decimal value.
COST=$(echo "$COST + $C" | bc)
STATUS_ICON="✅"
[ "$F" -gt 0 ] && STATUS_ICON="❌"
# The literal \n is expanded later by echo -e when the body is assembled.
SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
done

# Overall status is FAIL as soon as any suite reported a failed test.
STATUS="✅ PASS"
[ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

# Markdown comment body; its first line doubles as the marker used below
# to find a previously posted report comment.
BODY="## E2E Evals: ${STATUS}

**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

| Suite | Result | Status | Cost |
|-------|--------|--------|------|
$(echo -e "$SUITE_LINES")

---
*12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"

# When anything failed, append one bullet per failing test; a file whose
# .tests list cannot be read yields a "parse error" bullet instead.
if [ "$FAILED" -gt 0 ]; then
FAILURES=""
for f in $RESULTS; do
if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
F=$(jq -r '.failed // 0' "$f")
[ "$F" -eq 0 ] && continue
FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
FAILURES="${FAILURES}${FAILS}\n"
done
BODY="${BODY}

### Failures
$(echo -e "$FAILURES")"
fi

# Update existing comment or create new one
# (tail -1 picks the last matching comment id returned by the API call).
COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
--jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

if [ -n "$COMMENT_ID" ]; then
gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \
-X PATCH -f body="$BODY"
else
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
fi
67 changes: 65 additions & 2 deletions setup
Original file line number Diff line number Diff line change
Expand Up @@ -534,20 +534,83 @@ link_factory_skill_dirs() {
return 1
fi

# Copy skills as REAL directories into ~/.agents/skills/.
# Droid reads skill files from ~/.agents/skills/. This is required because
# skills in ~/gstack/.factory/skills/ (a project subdirectory) are never
# scanned by Droid.
mkdir -p "$HOME/.agents/skills"
for skill_dir in "$factory_dir"/gstack*/; do
if [ -f "$skill_dir/SKILL.md" ]; then
skill_name="$(basename "$skill_dir")"
dest="$HOME/.agents/skills/$skill_name"
rm -rf "$dest"
cp -r "$skill_dir" "$dest"
fi
done

# Create RELATIVE symlinks in ~/.factory/skills/ pointing to ../../.agents/skills/.
# Droid scans ~/.factory/skills/ and follows relative symlinks (not absolute paths).
# Relative symlinks are required; absolute paths are ignored by Droid.
local linked=()
for skill_dir in "$factory_dir"/gstack*/; do
if [ -f "$skill_dir/SKILL.md" ]; then
skill_name="$(basename "$skill_dir")"
[ "$skill_name" = "gstack" ] && continue
target="$skills_dir/$skill_name"
if [ -L "$target" ] || [ ! -e "$target" ]; then
ln -snf "$skill_dir" "$target"
ln -snf "../../.agents/skills/$skill_name" "$target"
linked+=("$skill_name")
fi
fi
done
if [ ${#linked[@]} -gt 0 ]; then
echo " linked skills: ${linked[*]}"
fi

# Register skills in ~/.agents/.skill-lock.json with sourceType=github.
# Droid's lockfile ONLY loads entries with sourceType=github. Entries with
# sourceType=local are completely ignored — and no working skill uses it.
if [ -f "$HOME/.agents/.skill-lock.json" ]; then
python3 -c "
import json, hashlib, os, datetime
lock_path = '$HOME/.agents/.skill-lock.json'
agents_skills = '$HOME/.agents/skills'
with open(lock_path) as f:
lock = json.load(f)
now_str = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.') + '000Z'
added = []
for entry in os.listdir(agents_skills):
if not entry.startswith('gstack'):
continue
skill_md = os.path.join(agents_skills, entry, 'SKILL.md')
if not os.path.isfile(skill_md):
continue
hasher = hashlib.sha1()
for root, dirs, files in os.walk(os.path.join(agents_skills, entry)):
dirs.sort()
for fname in sorted(files):
fpath = os.path.join(root, fname)
try:
with open(fpath, 'rb') as fh:
hasher.update(fh.read())
except:
pass
lock['skills'][entry] = {
'source': 'gstack/' + entry,
'sourceType': 'github',
'sourceUrl': 'file://' + os.path.join(agents_skills, entry),
'skillPath': 'SKILL.md',
'skillFolderHash': hasher.hexdigest(),
'installedAt': now_str,
'updatedAt': now_str
}
added.append(entry)
with open(lock_path, 'w') as f:
json.dump(lock, f, indent=2)
print(' registered skills: ' + ' '.join(added))
"
else
echo " warning: ~/.agents/.skill-lock.json not found — skill registration skipped" >&2
fi
}

# 4. Install for Claude (default)
Expand Down
Loading