Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"fastapi[all]",
"uvicorn",
"jinja2",
"openai",
]

[project.optional-dependencies]
Expand Down
28 changes: 28 additions & 0 deletions src/kernelbot/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,34 @@ async def admin_update_problems(
}


@app.get("/admin/audits")
async def admin_get_audits(
    _: Annotated[None, Depends(require_admin)],
    is_cheating: Optional[bool] = None,
    reviewed: Optional[bool] = None,
    limit: int = Query(50, ge=1, le=500),
    db_context=Depends(get_db),
) -> dict:
    """Return submission audit records for admins.

    Optional query filters: ``is_cheating`` (verdict) and ``reviewed``
    (human-review state). ``limit`` is capped at 500 by FastAPI validation.
    """
    with db_context as db:
        rows = db.get_audits(is_cheating=is_cheating, reviewed=reviewed, limit=limit)
    return {"status": "ok", "audits": rows}


@app.post("/admin/audits/{submission_id}/reviewed")
async def admin_mark_audit_reviewed(
    submission_id: int,
    _: Annotated[None, Depends(require_admin)],
    db_context=Depends(get_db),
) -> dict:
    """Flag the audit for *submission_id* as human-reviewed.

    Responds 404 when the submission has no audit record.
    """
    with db_context as db:
        if not db.mark_audit_reviewed(submission_id):
            raise HTTPException(status_code=404, detail="Audit not found for this submission")
    return {"status": "ok", "submission_id": submission_id}


@app.get("/leaderboards")
async def get_leaderboards(db_context=Depends(get_db)):
"""An endpoint that returns all leaderboards.
Expand Down
126 changes: 126 additions & 0 deletions src/libkernelbot/audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import asyncio
import json
import os

from libkernelbot.leaderboard_db import LeaderboardDB
from libkernelbot.utils import setup_logging

logger = setup_logging(__name__)

# OpenRouter exposes an OpenAI-compatible HTTP API; the openai client is
# pointed at this base URL in audit_submission().
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
# Model slug (OpenRouter format) used for every audit; also recorded in the
# submission_audit row so verdicts are traceable to the model that made them.
AUDIT_MODEL = "openai/gpt-4o-mini"
# Upper bound, in seconds, on a single audit LLM call (env-overridable).
AUDIT_TIMEOUT_SECONDS = int(os.getenv("AUDIT_TIMEOUT_SECONDS", "20"))
# Truncation limits keep the prompt bounded for large tasks/submissions
# (env-overridable); see _truncate().
AUDIT_MAX_REFERENCE_CHARS = int(os.getenv("AUDIT_MAX_REFERENCE_CHARS", "120000"))
AUDIT_MAX_SUBMISSION_CHARS = int(os.getenv("AUDIT_MAX_SUBMISSION_CHARS", "60000"))

# System prompt sent verbatim to the audit model; the model is instructed to
# answer with a bare JSON object, which _parse_audit_result() decodes.
AUDIT_SYSTEM = """You are a code auditor for a GPU kernel programming competition.

You will be given reference/evaluation code and a user's submitted solution.

Your job is to determine if the submission is cheating. Common cheating patterns:
- Hardcoding expected outputs instead of computing them
- Bypassing the evaluation/correctness checks
- Exploiting the test harness rather than solving the problem
- Returning precomputed values without doing real computation
- Patching or monkey-patching evaluation functions

Respond with ONLY a JSON object (no markdown, no extra text):
{"is_cheating": true/false, "explanation": "brief explanation"}"""


def _truncate(content: str, max_chars: int) -> str:
if len(content) <= max_chars:
return content
omitted = len(content) - max_chars
return f"{content[:max_chars]}\n\n... [TRUNCATED {omitted} CHARS] ..."


def _parse_audit_result(raw_content: str) -> dict:
content = raw_content.strip()
if content.startswith("```"):
lines = content.splitlines()
if lines and lines[0].startswith("```"):
lines = lines[1:]
if lines and lines[-1].startswith("```"):
lines = lines[:-1]
content = "\n".join(lines).strip()
if content.startswith("json"):
content = content[4:].strip()
parsed = json.loads(content)
if not isinstance(parsed, dict):
raise ValueError("Audit response is not a JSON object")
return parsed


async def audit_submission(submission_id: int, db: LeaderboardDB) -> dict | None:
    """Audit a submission for cheating via an LLM on OpenRouter.

    Loads the submission and its leaderboard's reference task from *db*,
    asks AUDIT_MODEL whether the submitted code cheats, and upserts the
    verdict into leaderboard.submission_audit.

    Returns a dict with ``is_cheating``, ``explanation`` and ``model`` on
    success; returns None (never raises) when auditing is disabled (no
    OPENROUTER_API_KEY), the submission/task is missing, the call times out,
    or any other step fails.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        logger.debug("OPENROUTER_API_KEY not set, skipping audit for submission %s", submission_id)
        return None

    try:
        # Imported lazily so the bot still runs when the optional openai
        # package is absent (the except below swallows ImportError).
        import openai

        with db:
            submission = db.get_submission_by_id(submission_id)
            if submission is None:
                logger.warning("Submission %s not found for audit", submission_id)
                return None

            task_json = db.get_leaderboard_task_by_id(submission["leaderboard_id"])

        if not task_json:
            logger.warning("No task found for leaderboard %s", submission["leaderboard_id"])
            return None

        # Extract reference code from the task files, labeling each section
        # with its file name (previously the name was dropped and every
        # section was labeled "(unknown)").
        reference_code = ""
        if isinstance(task_json, dict) and "files" in task_json:
            for filename, content in task_json["files"].items():
                reference_code += f"--- {filename} ---\n{content}\n\n"

        # Fall back to the raw task JSON when the task has no files mapping.
        if not reference_code:
            reference_code = json.dumps(task_json, indent=2)

        submission_code = submission["code"]
        reference_code = _truncate(reference_code, AUDIT_MAX_REFERENCE_CHARS)
        submission_code = _truncate(submission_code, AUDIT_MAX_SUBMISSION_CHARS)
        user_msg = (
            "Reference/evaluation code:\n```\n"
            + reference_code
            + "\n```\n\nSubmitted code:\n```\n"
            + submission_code
            + "\n```"
        )

        client = openai.AsyncOpenAI(api_key=api_key, base_url=OPENROUTER_BASE_URL)
        # Bound the network call so a slow provider cannot stall the caller.
        async with asyncio.timeout(AUDIT_TIMEOUT_SECONDS):
            response = await client.chat.completions.create(
                model=AUDIT_MODEL,
                messages=[
                    {"role": "system", "content": AUDIT_SYSTEM},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0,  # deterministic verdicts
                max_tokens=512,
            )

        result_text = response.choices[0].message.content
        if not result_text:
            logger.warning("Empty audit response for submission %s", submission_id)
            return None

        result = _parse_audit_result(result_text)

        is_cheating = bool(result.get("is_cheating", False))
        explanation = str(result.get("explanation", ""))

        with db:
            db.create_submission_audit(submission_id, is_cheating, explanation, AUDIT_MODEL)

        logger.info("Audit for submission %s: is_cheating=%s", submission_id, is_cheating)
        return {"is_cheating": is_cheating, "explanation": explanation, "model": AUDIT_MODEL}

    except Exception:
        # Auditing is best-effort: log and swallow so a failed audit can
        # never affect submission processing.
        logger.exception("Failed to audit submission %s", submission_id)
        return None
10 changes: 10 additions & 0 deletions src/libkernelbot/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ async def submit_full(
finally:
with self.db as db:
db.mark_submission_done(sub_id)
asyncio.create_task(self._maybe_audit(sub_id))
return sub_id, results

async def submit_leaderboard( # noqa: C901
Expand Down Expand Up @@ -238,5 +239,14 @@ async def handle_submission(

return result

    async def _maybe_audit(self, submission_id: int) -> None:
        """Fire-and-forget audit of a submission. Never raises."""
        try:
            # Imported lazily *inside* the try so a missing audit module or
            # absent optional openai dependency is swallowed, not fatal.
            from libkernelbot.audit import audit_submission

            await audit_submission(submission_id, self.db)
        except Exception:
            # Best-effort by design: auditing must never break submissions.
            logger.debug("Audit skipped or failed for submission %s", submission_id, exc_info=True)

def _get_arch(self, gpu_type: GPU):
return GPU_TO_SM[gpu_type.name]
117 changes: 117 additions & 0 deletions src/libkernelbot/leaderboard_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,123 @@ def validate_cli_id(self, cli_id: str) -> Optional[dict[str, str]]:
logger.exception("Error validating CLI ID %s", cli_id, exc_info=e)
raise KernelBotError("Error validating CLI ID") from e

# ── Submission audit methods ──────────────────────────────────────

def get_leaderboard_task_by_id(self, leaderboard_id: int) -> Optional[dict]:
"""Fetch the task JSONB for a leaderboard by its ID."""
try:
self.cursor.execute(
"SELECT task FROM leaderboard.leaderboard WHERE id = %s",
(leaderboard_id,),
)
row = self.cursor.fetchone()
return row[0] if row else None
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error fetching task for leaderboard %s", leaderboard_id, exc_info=e)
return None

def create_submission_audit(
self, submission_id: int, is_cheating: bool, explanation: str, model: str
) -> Optional[int]:
"""Insert an audit record for a submission."""
try:
self.cursor.execute(
"""
INSERT INTO leaderboard.submission_audit
(submission_id, is_cheating, explanation, model)
VALUES (%s, %s, %s, %s)
ON CONFLICT (submission_id) DO UPDATE
SET is_cheating = EXCLUDED.is_cheating,
explanation = EXCLUDED.explanation,
model = EXCLUDED.model,
created_at = NOW(),
reviewed = FALSE
RETURNING id
""",
(submission_id, is_cheating, explanation, model),
)
self.connection.commit()
return self.cursor.fetchone()[0]
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error creating audit for submission %s", submission_id, exc_info=e)
raise KernelBotError("Error creating submission audit") from e

def get_audits(
self,
is_cheating: Optional[bool] = None,
reviewed: Optional[bool] = None,
limit: int = 50,
) -> List[dict]:
"""List audits with optional filters."""
limit = max(1, min(limit, 500))
try:
conditions = []
params = []
if is_cheating is not None:
conditions.append("a.is_cheating = %s")
params.append(is_cheating)
if reviewed is not None:
conditions.append("a.reviewed = %s")
params.append(reviewed)

where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
params.append(limit)

self.cursor.execute(
f"""
SELECT a.id, a.submission_id, a.is_cheating, a.explanation,
a.model, a.created_at, a.reviewed,
s.file_name, s.user_id, lb.name as leaderboard_name
FROM leaderboard.submission_audit a
JOIN leaderboard.submission s ON a.submission_id = s.id
JOIN leaderboard.leaderboard lb ON s.leaderboard_id = lb.id
{where}
ORDER BY a.created_at DESC
LIMIT %s
""",
params,
)
rows = self.cursor.fetchall()
return [
{
"id": r[0],
"submission_id": r[1],
"is_cheating": r[2],
"explanation": r[3],
"model": r[4],
"created_at": r[5],
"reviewed": r[6],
"file_name": r[7],
"user_id": r[8],
"leaderboard_name": r[9],
}
for r in rows
]
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error listing submission audits", exc_info=e)
raise KernelBotError("Error listing submission audits") from e

def mark_audit_reviewed(self, submission_id: int) -> bool:
"""Mark an audit as human-reviewed. Returns True if a row was updated."""
try:
self.cursor.execute(
"""
UPDATE leaderboard.submission_audit
SET reviewed = TRUE
WHERE submission_id = %s
""",
(submission_id,),
)
self.connection.commit()
return self.cursor.rowcount > 0
except psycopg2.Error as e:
self.connection.rollback()
logger.exception("Error marking audit reviewed for submission %s", submission_id, exc_info=e)
raise KernelBotError("Error marking audit as reviewed") from e


class LeaderboardDoesNotExist(KernelBotError):
def __init__(self, name: str):
Expand Down
27 changes: 27 additions & 0 deletions src/migrations/20260301_01_audit-add-submission-audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
add-submission-audit
"""

from yoyo import step

__depends__ = {"20260225_01_aW5Bu-add-leaderboard-performance-indexes"}

steps = [
step(
"""
CREATE TABLE leaderboard.submission_audit (
id SERIAL PRIMARY KEY,
submission_id INTEGER NOT NULL REFERENCES leaderboard.submission(id) ON DELETE CASCADE,
is_cheating BOOLEAN NOT NULL,
explanation TEXT NOT NULL,
model TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
reviewed BOOLEAN NOT NULL DEFAULT FALSE,
UNIQUE(submission_id)
)
""",
"""
DROP TABLE IF EXISTS leaderboard.submission_audit
""",
),
]
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def docker_compose(project_root: Path):

def _nuke_contents(db):
db.cursor.execute(
"TRUNCATE leaderboard.code_files, leaderboard.submission, leaderboard.runs, "
"TRUNCATE leaderboard.code_files, leaderboard.submission, leaderboard.submission_audit, leaderboard.runs, "
"leaderboard.leaderboard, leaderboard.user_info, leaderboard.templates, "
"leaderboard.gpu_type RESTART IDENTITY CASCADE"
)
Expand Down
Loading
Loading