Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"fastapi[all]",
"uvicorn",
"jinja2",
"openai",
]

[project.optional-dependencies]
Expand Down
28 changes: 28 additions & 0 deletions src/kernelbot/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,34 @@ async def admin_update_problems(
}


@app.get("/admin/audits")
async def admin_get_audits(
    _: Annotated[None, Depends(require_admin)],
    is_cheating: Optional[bool] = None,
    reviewed: Optional[bool] = None,
    limit: int = Query(50, ge=1, le=500),
    db_context=Depends(get_db),
) -> dict:
    """Return submission audit records for admins.

    Optional query filters: ``is_cheating`` (verdict) and ``reviewed``
    (human-review state). ``limit`` is capped at 500 by FastAPI validation.
    """
    with db_context as db:
        rows = db.get_audits(is_cheating=is_cheating, reviewed=reviewed, limit=limit)
    return {"status": "ok", "audits": rows}


@app.post("/admin/audits/{submission_id}/reviewed")
async def admin_mark_audit_reviewed(
    submission_id: int,
    _: Annotated[None, Depends(require_admin)],
    db_context=Depends(get_db),
) -> dict:
    """Flag the audit for *submission_id* as human-reviewed.

    Responds 404 when the submission has no audit record.
    """
    with db_context as db:
        if not db.mark_audit_reviewed(submission_id):
            raise HTTPException(status_code=404, detail="Audit not found for this submission")
    return {"status": "ok", "submission_id": submission_id}


@app.get("/leaderboards")
async def get_leaderboards(db_context=Depends(get_db)):
"""An endpoint that returns all leaderboards.
Expand Down
126 changes: 126 additions & 0 deletions src/libkernelbot/audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import asyncio
import json
import os

from libkernelbot.leaderboard_db import LeaderboardDB
from libkernelbot.utils import setup_logging

logger = setup_logging(__name__)

# OpenRouter exposes an OpenAI-compatible HTTP API; the openai client is
# pointed at this base URL in audit_submission().
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
# Model slug (OpenRouter format) used for every audit; also recorded in the
# submission_audit row so verdicts are traceable to the model that made them.
AUDIT_MODEL = "openai/gpt-4o-mini"
# Upper bound, in seconds, on a single audit LLM call (env-overridable).
AUDIT_TIMEOUT_SECONDS = int(os.getenv("AUDIT_TIMEOUT_SECONDS", "20"))
# Truncation limits keep the prompt bounded for large tasks/submissions
# (env-overridable); see _truncate().
AUDIT_MAX_REFERENCE_CHARS = int(os.getenv("AUDIT_MAX_REFERENCE_CHARS", "120000"))
AUDIT_MAX_SUBMISSION_CHARS = int(os.getenv("AUDIT_MAX_SUBMISSION_CHARS", "60000"))

# System prompt sent verbatim to the audit model; the model is instructed to
# answer with a bare JSON object, which _parse_audit_result() decodes.
AUDIT_SYSTEM = """You are a code auditor for a GPU kernel programming competition.

You will be given reference/evaluation code and a user's submitted solution.

Your job is to determine if the submission is cheating. Common cheating patterns:
- Hardcoding expected outputs instead of computing them
- Bypassing the evaluation/correctness checks
- Exploiting the test harness rather than solving the problem
- Returning precomputed values without doing real computation
- Patching or monkey-patching evaluation functions

Respond with ONLY a JSON object (no markdown, no extra text):
{"is_cheating": true/false, "explanation": "brief explanation"}"""


def _truncate(content: str, max_chars: int) -> str:
if len(content) <= max_chars:
return content
omitted = len(content) - max_chars
return f"{content[:max_chars]}\n\n... [TRUNCATED {omitted} CHARS] ..."


def _parse_audit_result(raw_content: str) -> dict:
content = raw_content.strip()
if content.startswith("```"):
lines = content.splitlines()
if lines and lines[0].startswith("```"):
lines = lines[1:]
if lines and lines[-1].startswith("```"):
lines = lines[:-1]
content = "\n".join(lines).strip()
if content.startswith("json"):
content = content[4:].strip()
parsed = json.loads(content)
if not isinstance(parsed, dict):
raise ValueError("Audit response is not a JSON object")
return parsed


async def audit_submission(submission_id: int, db: LeaderboardDB) -> dict | None:
    """Audit a submission for cheating via an LLM on OpenRouter.

    Loads the submission and its leaderboard's reference task from *db*,
    asks AUDIT_MODEL whether the submitted code cheats, and upserts the
    verdict into leaderboard.submission_audit.

    Returns a dict with ``is_cheating``, ``explanation`` and ``model`` on
    success; returns None (never raises) when auditing is disabled (no
    OPENROUTER_API_KEY), the submission/task is missing, the call times out,
    or any other step fails.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        logger.debug("OPENROUTER_API_KEY not set, skipping audit for submission %s", submission_id)
        return None

    try:
        # Imported lazily so the bot still runs when the optional openai
        # package is absent (the except below swallows ImportError).
        import openai

        with db:
            submission = db.get_submission_by_id(submission_id)
            if submission is None:
                logger.warning("Submission %s not found for audit", submission_id)
                return None

            task_json = db.get_leaderboard_task_by_id(submission["leaderboard_id"])

        if not task_json:
            logger.warning("No task found for leaderboard %s", submission["leaderboard_id"])
            return None

        # Extract reference code from the task files, labeling each section
        # with its file name (previously the name was dropped and every
        # section was labeled "(unknown)").
        reference_code = ""
        if isinstance(task_json, dict) and "files" in task_json:
            for filename, content in task_json["files"].items():
                reference_code += f"--- {filename} ---\n{content}\n\n"

        # Fall back to the raw task JSON when the task has no files mapping.
        if not reference_code:
            reference_code = json.dumps(task_json, indent=2)

        submission_code = submission["code"]
        reference_code = _truncate(reference_code, AUDIT_MAX_REFERENCE_CHARS)
        submission_code = _truncate(submission_code, AUDIT_MAX_SUBMISSION_CHARS)
        user_msg = (
            "Reference/evaluation code:\n```\n"
            + reference_code
            + "\n```\n\nSubmitted code:\n```\n"
            + submission_code
            + "\n```"
        )

        client = openai.AsyncOpenAI(api_key=api_key, base_url=OPENROUTER_BASE_URL)
        # Bound the network call so a slow provider cannot stall the caller.
        async with asyncio.timeout(AUDIT_TIMEOUT_SECONDS):
            response = await client.chat.completions.create(
                model=AUDIT_MODEL,
                messages=[
                    {"role": "system", "content": AUDIT_SYSTEM},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0,  # deterministic verdicts
                max_tokens=512,
            )

        result_text = response.choices[0].message.content
        if not result_text:
            logger.warning("Empty audit response for submission %s", submission_id)
            return None

        result = _parse_audit_result(result_text)

        is_cheating = bool(result.get("is_cheating", False))
        explanation = str(result.get("explanation", ""))

        with db:
            db.create_submission_audit(submission_id, is_cheating, explanation, AUDIT_MODEL)

        logger.info("Audit for submission %s: is_cheating=%s", submission_id, is_cheating)
        return {"is_cheating": is_cheating, "explanation": explanation, "model": AUDIT_MODEL}

    except Exception:
        # Auditing is best-effort: log and swallow so a failed audit can
        # never affect submission processing.
        logger.exception("Failed to audit submission %s", submission_id)
        return None
10 changes: 10 additions & 0 deletions src/libkernelbot/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ async def submit_full(
finally:
with self.db as db:
db.mark_submission_done(sub_id)
asyncio.create_task(self._maybe_audit(sub_id))
return sub_id, results

async def submit_leaderboard( # noqa: C901
Expand Down Expand Up @@ -238,5 +239,14 @@ async def handle_submission(

return result

    async def _maybe_audit(self, submission_id: int) -> None:
        """Fire-and-forget audit of a submission. Never raises."""
        try:
            # Imported lazily *inside* the try so a missing audit module or
            # absent optional openai dependency is swallowed, not fatal.
            from libkernelbot.audit import audit_submission

            await audit_submission(submission_id, self.db)
        except Exception:
            # Best-effort by design: auditing must never break submissions.
            logger.debug("Audit skipped or failed for submission %s", submission_id, exc_info=True)

def _get_arch(self, gpu_type: GPU):
return GPU_TO_SM[gpu_type.name]
117 changes: 117 additions & 0 deletions src/libkernelbot/leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,123 @@ def validate_cli_id(self, cli_id: str) -> Optional[dict[str, str]]:
logger.exception("Error validating CLI ID %s", cli_id, exc_info=e)
raise KernelBotError("Error validating CLI ID") from e

# ── Submission audit methods ──────────────────────────────────────

def get_leaderboard_task_by_id(self, leaderboard_id: int) -> Optional[dict]:
"""Fetch the task JSONB for a leaderboard by its ID."""
try:
self.cursor.execute(
"SELECT task FROM leaderboard.leaderboard WHERE id = %s",
(leaderboard_id,),
)
row = self.cursor.fetchone()
return row[0] if row else None
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error fetching task for leaderboard %s", leaderboard_id, exc_info=e)
return None

def create_submission_audit(
self, submission_id: int, is_cheating: bool, explanation: str, model: str
) -> Optional[int]:
"""Insert an audit record for a submission."""
try:
self.cursor.execute(
"""
INSERT INTO leaderboard.submission_audit
(submission_id, is_cheating, explanation, model)
VALUES (%s, %s, %s, %s)
ON CONFLICT (submission_id) DO UPDATE
SET is_cheating = EXCLUDED.is_cheating,
explanation = EXCLUDED.explanation,
model = EXCLUDED.model,
created_at = NOW(),
reviewed = FALSE
RETURNING id
""",
(submission_id, is_cheating, explanation, model),
)
self.connection.commit()
return self.cursor.fetchone()[0]
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error creating audit for submission %s", submission_id, exc_info=e)
raise KernelBotError("Error creating submission audit") from e

def get_audits(
self,
is_cheating: Optional[bool] = None,
reviewed: Optional[bool] = None,
limit: int = 50,
) -> List[dict]:
"""List audits with optional filters."""
limit = max(1, min(limit, 500))
try:
conditions = []
params = []
if is_cheating is not None:
conditions.append("a.is_cheating = %s")
params.append(is_cheating)
if reviewed is not None:
conditions.append("a.reviewed = %s")
params.append(reviewed)

where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
params.append(limit)

self.cursor.execute(
f"""
SELECT a.id, a.submission_id, a.is_cheating, a.explanation,
a.model, a.created_at, a.reviewed,
s.file_name, s.user_id, lb.name as leaderboard_name
FROM leaderboard.submission_audit a
JOIN leaderboard.submission s ON a.submission_id = s.id
JOIN leaderboard.leaderboard lb ON s.leaderboard_id = lb.id
{where}
ORDER BY a.created_at DESC
LIMIT %s
""",
params,
)
rows = self.cursor.fetchall()
return [
{
"id": r[0],
"submission_id": r[1],
"is_cheating": r[2],
"explanation": r[3],
"model": r[4],
"created_at": r[5],
"reviewed": r[6],
"file_name": r[7],
"user_id": r[8],
"leaderboard_name": r[9],
}
for r in rows
]
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error listing submission audits", exc_info=e)
raise KernelBotError("Error listing submission audits") from e

def mark_audit_reviewed(self, submission_id: int) -> bool:
"""Mark an audit as human-reviewed. Returns True if a row was updated."""
try:
self.cursor.execute(
"""
UPDATE leaderboard.submission_audit
SET reviewed = TRUE
WHERE submission_id = %s
""",
(submission_id,),
)
self.connection.commit()
return self.cursor.rowcount > 0
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error marking audit reviewed for submission %s", submission_id, exc_info=e)
raise KernelBotError("Error marking audit as reviewed") from e


class LeaderboardDoesNotExist(KernelBotError):
def __init__(self, name: str):
Expand Down
27 changes: 27 additions & 0 deletions src/migrations/20260301_01_audit-add-submission-audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
add-submission-audit
"""

from yoyo import step

__depends__ = {"20260225_01_aW5Bu-add-leaderboard-performance-indexes"}

steps = [
step(
"""
CREATE TABLE leaderboard.submission_audit (
id SERIAL PRIMARY KEY,
submission_id INTEGER NOT NULL REFERENCES leaderboard.submission(id) ON DELETE CASCADE,
is_cheating BOOLEAN NOT NULL,
explanation TEXT NOT NULL,
model TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
reviewed BOOLEAN NOT NULL DEFAULT FALSE,
UNIQUE(submission_id)
)
""",
"""
DROP TABLE IF EXISTS leaderboard.submission_audit
""",
),
]
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def docker_compose(project_root: Path):

def _nuke_contents(db):
db.cursor.execute(
"TRUNCATE leaderboard.code_files, leaderboard.submission, leaderboard.runs, "
"TRUNCATE leaderboard.code_files, leaderboard.submission, leaderboard.submission_audit, leaderboard.runs, "
"leaderboard.leaderboard, leaderboard.user_info, leaderboard.templates, "
"leaderboard.gpu_type RESTART IDENTITY CASCADE"
)
Expand Down
Loading
Loading