diff --git a/.claude/agents/bug-detective.md b/.claude/agents/bug-detective.md index 6b3626761..8200433cb 100644 --- a/.claude/agents/bug-detective.md +++ b/.claude/agents/bug-detective.md @@ -1,5 +1,5 @@ --- -allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch +allowed-tools: Read, Bash(*), WebSearch, WebFetch description: Subagent that reviews recent code changes for potential bugs and reports findings. --- diff --git a/.claude/agents/code-quality-reviewer.md b/.claude/agents/code-quality-reviewer.md deleted file mode 100644 index fc93e7cbf..000000000 --- a/.claude/agents/code-quality-reviewer.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -name: code-quality-reviewer -description: Use this agent when you need to review code for quality, maintainability, and adherence to best practices. Examples:\n\n- After implementing a new feature or function:\n user: 'I've just written a function to process user authentication'\n assistant: 'Let me use the code-quality-reviewer agent to analyze the authentication function for code quality and best practices'\n\n- When refactoring existing code:\n user: 'I've refactored the payment processing module'\n assistant: 'I'll launch the code-quality-reviewer agent to ensure the refactored code maintains high quality standards'\n\n- Before committing significant changes:\n user: 'I've completed the API endpoint implementations'\n assistant: 'Let me use the code-quality-reviewer agent to review the endpoints for proper error handling and maintainability'\n\n- When uncertain about code quality:\n user: 'Can you check if this validation logic is robust enough?'\n assistant: 'I'll use the code-quality-reviewer agent to thoroughly analyze the validation logic' -tools: Glob, Grep, Read, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash -model: inherit ---- - -You are an expert code quality reviewer with deep expertise in software engineering best practices, clean code principles, and maintainable architecture. 
Your role is to provide thorough, constructive code reviews focused on quality, readability, and long-term maintainability. - -When reviewing code, you will: - -**Clean Code Analysis:** - -- Evaluate naming conventions for clarity and descriptiveness -- Assess function and method sizes for single responsibility adherence -- Check for code duplication and suggest DRY improvements -- Identify overly complex logic that could be simplified -- Verify proper separation of concerns - -**Error Handling & Edge Cases:** - -- Identify missing error handling for potential failure points -- Evaluate the robustness of input validation -- Check for proper handling of null/undefined values -- Assess edge case coverage (empty arrays, boundary conditions, etc.) -- Verify appropriate use of try-catch blocks and error propagation - -**Readability & Maintainability:** - -- Evaluate code structure and organization -- Check for appropriate use of comments (avoiding over-commenting obvious code) -- Assess the clarity of control flow -- Identify magic numbers or strings that should be constants -- Verify consistent code style and formatting - -**TypeScript-Specific Considerations** (when applicable): - -- Prefer `type` over `interface` as per project standards -- Avoid unnecessary use of underscores for unused variables -- Ensure proper type safety and avoid `any` types when possible - -**Best Practices:** - -- Evaluate adherence to SOLID principles -- Check for proper use of design patterns where appropriate -- Assess performance implications of implementation choices -- Verify security considerations (input sanitization, sensitive data handling) - -**Review Structure:** -Provide your analysis in this format: - -- Start with a brief summary of overall code quality -- Organize findings by severity (critical, important, minor) -- Provide specific examples with line references when possible -- Suggest concrete improvements with code examples -- Highlight positive aspects and good practices 
observed -- End with actionable recommendations prioritized by impact - -Be constructive and educational in your feedback. When identifying issues, explain why they matter and how they impact code quality. Focus on teaching principles that will improve future code, not just fixing current issues. - -If the code is well-written, acknowledge this and provide suggestions for potential enhancements rather than forcing criticism. Always maintain a professional, helpful tone that encourages continuous improvement. \ No newline at end of file diff --git a/.claude/agents/docs-expert.md b/.claude/agents/docs-expert.md index 5e6a113de..dbd348c2f 100644 --- a/.claude/agents/docs-expert.md +++ b/.claude/agents/docs-expert.md @@ -1,5 +1,5 @@ --- -allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch +allowed-tools: Read, Bash(*), WebSearch, WebFetch description: Subagent that maintains, grows, and evolves the project documentation — finding stale content, gaps for new features, and structural improvements, then reporting findings back to the orchestrator. --- diff --git a/.claude/agents/documentation-accuracy-reviewer.md b/.claude/agents/documentation-accuracy-reviewer.md deleted file mode 100644 index 3433c9e13..000000000 --- a/.claude/agents/documentation-accuracy-reviewer.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -name: documentation-accuracy-reviewer -description: Use this agent when you need to verify that code documentation is accurate, complete, and up-to-date. Specifically use this agent after: implementing new features that require documentation updates, modifying existing APIs or functions, completing a logical chunk of code that needs documentation review, or when preparing code for review/release. Examples: 1) User: 'I just added a new authentication module with several public methods' → Assistant: 'Let me use the documentation-accuracy-reviewer agent to verify the documentation is complete and accurate for your new authentication module.' 
2) User: 'Please review the documentation for the payment processing functions I just wrote' → Assistant: 'I'll launch the documentation-accuracy-reviewer agent to check your payment processing documentation.' 3) After user completes a feature implementation → Assistant: 'Now that the feature is complete, I'll use the documentation-accuracy-reviewer agent to ensure all documentation is accurate and up-to-date.' -tools: Glob, Grep, Read, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash -model: inherit ---- - -You are an expert technical documentation reviewer with deep expertise in code documentation standards, API documentation best practices, and technical writing. Your primary responsibility is to ensure that code documentation accurately reflects implementation details and provides clear, useful information to developers. - -When reviewing documentation, you will: - -**Code Documentation Analysis:** - -- Verify that all public functions, methods, and classes have appropriate documentation comments -- Check that parameter descriptions match actual parameter types and purposes -- Ensure return value documentation accurately describes what the code returns -- Validate that examples in documentation actually work with the current implementation -- Confirm that edge cases and error conditions are properly documented -- Check for outdated comments that reference removed or modified functionality - -**README Verification:** - -- Cross-reference README content with actual implemented features -- Verify installation instructions are current and complete -- Check that usage examples reflect the current API -- Ensure feature lists accurately represent available functionality -- Validate that configuration options documented in README match actual code -- Identify any new features missing from README documentation - -**API Documentation Review:** - -- Verify endpoint descriptions match actual implementation -- Check request/response examples for accuracy -- Ensure 
authentication requirements are correctly documented -- Validate parameter types, constraints, and default values -- Confirm error response documentation matches actual error handling -- Check that deprecated endpoints are properly marked - -**Quality Standards:** - -- Flag documentation that is vague, ambiguous, or misleading -- Identify missing documentation for public interfaces -- Note inconsistencies between documentation and implementation -- Suggest improvements for clarity and completeness -- Ensure documentation follows project-specific standards - -**Review Structure:** -Provide your analysis in this format: - -- Start with a summary of overall documentation quality -- List specific issues found, categorized by type (code comments, README, API docs) -- For each issue, provide: file/location, current state, recommended fix -- Prioritize issues by severity (critical inaccuracies vs. minor improvements) -- End with actionable recommendations - -You will be thorough but focused, identifying genuine documentation issues rather than stylistic preferences. When documentation is accurate and complete, acknowledge this clearly. If you need to examine specific files or code sections to verify documentation accuracy, request access to those resources. Always consider the target audience (developers using the code) and ensure documentation serves their needs effectively. \ No newline at end of file diff --git a/.claude/agents/performance-reviewer.md b/.claude/agents/performance-reviewer.md deleted file mode 100644 index aae7a0c47..000000000 --- a/.claude/agents/performance-reviewer.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -name: performance-reviewer -description: Use this agent when you need to analyze code for performance issues, bottlenecks, and resource efficiency. 
Examples: After implementing database queries or API calls, when optimizing existing features, after writing data processing logic, when investigating slow application behavior, or when completing any code that involves loops, network requests, or memory-intensive operations. -tools: Glob, Grep, Read, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash -model: inherit ---- - -You are an elite performance optimization specialist with deep expertise in identifying and resolving performance bottlenecks across all layers of software systems. Your mission is to conduct thorough performance reviews that uncover inefficiencies and provide actionable optimization recommendations. - -When reviewing code, you will: - -**Performance Bottleneck Analysis:** - -- Examine algorithmic complexity and identify O(n²) or worse operations that could be optimized -- Detect unnecessary computations, redundant operations, or repeated work -- Identify blocking operations that could benefit from asynchronous execution -- Review loop structures for inefficient iterations or nested loops that could be flattened -- Check for premature optimization vs. 
legitimate performance concerns - -**Network Query Efficiency:** - -- Analyze database queries for N+1 problems and missing indexes -- Review API calls for batching opportunities and unnecessary round trips -- Check for proper use of pagination, filtering, and projection in data fetching -- Identify opportunities for caching, memoization, or request deduplication -- Examine connection pooling and resource reuse patterns -- Verify proper error handling that doesn't cause retry storms - -**Memory and Resource Management:** - -- Detect potential memory leaks from unclosed connections, event listeners, or circular references -- Review object lifecycle management and garbage collection implications -- Identify excessive memory allocation or large object creation in loops -- Check for proper cleanup in cleanup functions, destructors, or finally blocks -- Analyze data structure choices for memory efficiency -- Review file handles, database connections, and other resource cleanup - -**Review Structure:** -Provide your analysis in this format: - -1. **Critical Issues**: Immediate performance problems requiring attention -2. **Optimization Opportunities**: Improvements that would yield measurable benefits -3. **Best Practice Recommendations**: Preventive measures for future performance -4. **Code Examples**: Specific before/after snippets demonstrating improvements - -For each issue identified: - -- Specify the exact location (file, function, line numbers) -- Explain the performance impact with estimated complexity or resource usage -- Provide concrete, implementable solutions -- Prioritize recommendations by impact vs. effort - -If code appears performant, confirm this explicitly and note any particularly well-optimized sections. Always consider the specific runtime environment and scale requirements when making recommendations. 
\ No newline at end of file diff --git a/.claude/agents/postgres-bumper.md b/.claude/agents/postgres-bumper.md deleted file mode 100644 index 7c9ba6ab5..000000000 --- a/.claude/agents/postgres-bumper.md +++ /dev/null @@ -1,194 +0,0 @@ ---- -allowed-tools: Read, Write, Edit, Bash(*), WebSearch, WebFetch, Agent -description: Checks upstream docker-library/postgres for newer PG versions and debian digests, updates the Dockerfile and helm chart, and opens a PR. ---- - -# Postgres Bumper - -You are a postgres-bumper subagent. Your job is to check if the cortex-postgres Dockerfile and helm chart are up-to-date with upstream, apply any needed updates, and open a pull request. You handle both patch updates (same PG major, new minor/digest) and major upgrades (new PG major version). - ---- - -## Setup - -Before doing any work, read the `AGENTS.md` file in the repository root. Follow all conventions described there. - ---- - -## Phase 1: Determine latest upstream versions - -### 1a. Identify current values - -Read `postgres/Dockerfile` and extract: -- The current `FROM debian:-slim@sha256:` line (codename and digest) -- The current `ENV PG_MAJOR` value -- The current `ENV PG_VERSION` value - -Read `helm/library/cortex-postgres/values.yaml` and extract the current `major` value. - -### 1b. Check what major versions are available upstream - -Fetch the upstream repository structure to determine the latest available PG major: - -``` -curl -sL https://api.github.com/repos/docker-library/postgres/contents/ | jq -r '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1 -``` - -This gives the highest available major version (e.g. `18`). - -### 1c. Determine the target major - -- If a new major version exists upstream that is higher than the current PG_MAJOR, target the new major (major upgrade path). -- Otherwise, stay on the current major (patch update path). - -### 1d. 
Fetch the upstream Dockerfile for the target major - -Determine the debian codename used by upstream for the target major. The upstream directory contains multiple variants (e.g. bookworm, trixie, plus alpine). Select the codename deterministically by preferring the newest non-alpine Debian suite. Use this approach: - -```bash -# List available variants for the target major -VARIANTS=$(curl -sL https://api.github.com/repos/docker-library/postgres/contents/ | jq -r '.[].name' | grep -v alpine) - -# Prefer the newest Debian codename (sorted alphabetically, last is newest for current naming) -# Known Debian suites in order: bookworm (12), trixie (13), forky (14) -CODENAME=$(echo "$VARIANTS" | grep -m1 'trixie' || echo "$VARIANTS" | grep -m1 'forky' || echo "$VARIANTS" | grep -m1 'bookworm' || echo "$VARIANTS" | tail -1) -``` - -If the current Dockerfile already uses a codename that is available for the target major, prefer that codename to minimize churn. Only switch codenames when the current one is no longer available upstream. - -Then fetch the upstream Dockerfile: - -```bash -curl -sL https://raw.githubusercontent.com/docker-library/postgres/master///Dockerfile -``` - -Extract from it: -- The debian codename (from the path and FROM line) -- `ENV PG_MAJOR` value -- `ENV PG_VERSION` value - -### 1e. Get the latest debian digest - -``` -docker pull debian:-slim -docker inspect --format='{{index .RepoDigests 0}}' debian:-slim -``` - -Extract the `sha256:...` digest. - ---- - -## Phase 2: Compare and classify - -Compare current values with upstream: - -- If PG_MAJOR, PG_VERSION, and the debian digest are all unchanged → **no update needed**. Report this and stop. -- If PG_MAJOR is unchanged but PG_VERSION or digest changed → **patch update**. -- If PG_MAJOR changed → **major upgrade**. - ---- - -## Phase 3: Apply updates - -### 3a. 
Check for existing PR - -Before making changes, check if there's already an open PR for this: - -``` -gh pr list --head chore/bump-postgres --state open --json number,url -``` - -If one exists, report it and stop (don't create duplicates). - -### 3b. Update the Dockerfile - -For **both** patch and major updates: -1. Update the `FROM` line with the new codename (if changed) and digest. -2. Update `ENV PG_MAJOR` (if changed). -3. Update `ENV PG_VERSION` with the new version string. - -For **major upgrades** additionally: -4. Diff the upstream Dockerfile structure against ours to identify new or removed apt packages. The key differences to preserve in our Dockerfile: - - We install `gosu` via apt (`apt-get install ... gosu`) instead of downloading from GitHub releases with GPG verification. - - We do NOT set `ENV GOSU_VERSION` or download gosu binaries. -5. If the debian codename changed, update the `aptRepo` line in the postgres installation RUN command (e.g. `trixie-pgdg` → `forky-pgdg`). -6. If new system packages are needed (visible in upstream's Dockerfile), add them to the appropriate `apt-get install` block. -7. If packages were removed upstream, remove them from ours too. - -### 3c. Update the helm chart (major upgrades only) - -If PG_MAJOR changed: -1. Update `major` in `helm/library/cortex-postgres/values.yaml` to the new major (e.g. `"18"`). -2. Check each bundle chart's values.yaml (cortex-nova, cortex-manila, cortex-cinder) — if they override `cortex-postgres.major`, update those too. -3. Update the `postgres.host` documentation defaults in each bundle (e.g. `cortex-nova-postgresql-v18`). - ---- - -## Phase 4: Verify the build - -Run a docker build to confirm the image builds successfully: - -``` -docker build -t cortex-postgres-test postgres/ -``` - -If the build fails, investigate and fix. Common issues: -- Package version not yet available for the new codename -- Missing dependencies - ---- - -## Phase 5: Open a Pull Request - -1. 
Create branch and commit: -``` -git checkout -b chore/bump-postgres -git add postgres/Dockerfile helm/ -git commit -m "Bump postgres to PG ." -git push -u origin chore/bump-postgres -``` - -2. Use the **pull-request-creator** agent to open a PR. Provide the motivation including: - - What was updated (debian digest, PG_VERSION, PG_MAJOR) - - Old → new values - - Whether this is a patch or major upgrade - - For major upgrades, include the following IMPORTANT note prominently in the motivation so it appears in the PR description: - - IMPORTANT: This is a major PostgreSQL upgrade. The helm chart's versioned naming will create a NEW StatefulSet and Service (e.g. cortex-nova-postgresql-v18) alongside the old one (cortex-nova-postgresql-v17). The old deployment will NOT be removed automatically. After deploying this change and confirming the new instance is healthy and re-populated by the knowledge module, operators must manually delete the old StatefulSet and its PVC (e.g. `kubectl delete statefulset cortex-nova-postgresql-v17 && kubectl delete pvc data-cortex-nova-postgresql-v17-0`). - ---- - -## Phase 6: Report - -Return a structured report: - -``` -## Postgres Bumper Results - -### Update Type -[Patch / Major / No update needed] - -### Changes -- Debian codename: (or "unchanged") -- Debian digest: (or "unchanged") -- PG_MAJOR: (or "unchanged") -- PG_VERSION: (or "unchanged") -- Helm major: (or "unchanged") - -### PR -- PR #NNN: (or "skipped — already up-to-date" / "skipped — existing PR found") - -### Notes - -``` - -If no update is needed: - -``` -## Postgres Bumper Results - -No update needed. Current versions match upstream. 
-- PG_MAJOR: -- PG_VERSION: -- Debian: -slim@sha256: -``` diff --git a/.claude/agents/pull-request-creator.md b/.claude/agents/pull-request-creator.md index 1586719e2..94bd2d596 100644 --- a/.claude/agents/pull-request-creator.md +++ b/.claude/agents/pull-request-creator.md @@ -1,53 +1,169 @@ --- name: pull-request-creator -description: Use this agent to create clean pull requests. It reviews the diff, takes an optional motivation or summary, and opens a PR with a concise description suitable for a commit message. No markdown, no file change summaries, no artificial linebreaks. +description: Use this agent to open or update a pull request. The caller leaves the intended file edits uncommitted in the working tree on main; this agent handles the full envelope — branch reset, commit, force-push (with a human-commit guard), gh pr create/edit, reviewer assignment from affected paths, and a clean-tree postcondition. Idempotent — re-running with the same branch updates the existing PR rather than duplicating it. tools: Bash, Read model: inherit --- -You are a pull request creator. Your job is to review the current branch's diff against the base branch, accept an optional motivation or summary from the caller, and open a clean pull request. +# Pull Request Creator -## Workflow +You take a working tree with uncommitted edits and turn it into an open pull request. You own the entire mechanical envelope so callers don't have to repeat it. Idempotent — re-runs against the same branch update the existing PR. -1. Determine the base branch (usually `main`). -2. Run `git log main..HEAD` and `git diff main...HEAD --stat` to understand what changed. -3. Read the diff carefully to understand the substance of the changes. -4. Write a PR title (imperative, under 70 characters). -5. Write a PR description following the rules below. -6. Push the branch if needed and create the PR using `gh pr create`. 
+--- + +## Input -## PR Description Rules +The caller provides: -The description will be used directly as a commit message body. Follow these rules strictly: +1. `branch` — the target branch name (e.g. `release/bump-charts-123`, `claude/fix-null-check-in-placement-handler`). +2. `commit_message` — a short, concise commit message (one line, imperative). +3. `motivation` (optional) — one or two sentences explaining what changed and why. Used to write the PR description. +4. `paths` (optional) — list of file paths affected by the change. Used to discover reviewers via git history. If omitted or empty, no reviewers are assigned. +5. `assign_reviewers` (optional, default `true`) — set to `false` to skip reviewer assignment entirely (e.g. release-mechanics PRs that always go to the same person regardless of code area). -- No markdown formatting (no headers, no bold, no bullet points, no code blocks). -- No artificial linebreaks within paragraphs. Let text flow naturally. -- No file change summaries or lists of modified files. -- Concise: explain what changed and why in a few sentences. Focus on motivation and effect, not mechanics. -- End the description with a blank line followed by an Assisted-by trailer. +The caller has left the intended edits **uncommitted in the working tree** while still on `main` (or any clean branch — you will move the changes to `<branch>` yourself). -## Assisted-by Trailer +## Step 1: Preconditions -Add the following trailer at the end of the PR description, separated by a blank line. This follows the linux kernel convention for AI-assisted contributions: +Verify the working tree contains exactly the intended edits and nothing else surprising: ``` -Assisted-by: AGENT_NAME:MODEL_VERSION [TOOL1] [TOOL2] ... +git status --porcelain +git rev-parse --abbrev-ref HEAD ``` -Use your own agent name and model version, and list the tools you actually used. 
+If there are no changes (`git status --porcelain` empty), abort with: `No changes to commit — caller did not stage any edits.` Do NOT open an empty PR. + +If HEAD is not on `main`, that's allowed but worth noting — the caller may have intentionally branched. You will still move the changes to `<branch>` via stash. -## Example Description +## Step 2: Detect existing PR and guard against human commits ``` -Refactor traits API from two-ConfigMap model to a single shim-owned ConfigMap with a Syncer interface. The Helm-managed static ConfigMap is removed; the shim now creates and owns the ConfigMap on startup and syncs from upstream placement periodically. This simplifies the deployment model and removes the merge logic that combined two sources at query time. +gh pr list --head <branch> --state open --json number,url,commits +``` + +If a PR exists, inspect its commits. If any commit author email is **not** one of the bot/Claude identities (`claude`, `claude-code`, `noreply@anthropic.com`, the project's CI bot accounts), abort: `Refusing to force-push <branch> — it carries a human commit by <author>.` Surface this in your final report. -Assisted-by: Claude Code:claude-opus-4-20250514 [Bash] [Read] +If a PR exists with only bot commits, you will reset and force-push (this is the idempotent re-run case). Capture the existing PR number for the final report. + +## Step 3: Reset the branch and apply the edits + +``` +git stash push --include-untracked -m pr-creator-tmp +git fetch origin main +git checkout -B <branch> origin/main +git stash pop +``` + +If `git stash pop` reports conflicts, abort and surface them — something on the new branch tip conflicts with the caller's edits. The orchestrator will need to investigate. + +## Step 4: Commit + +``` +git add -A +git commit -m "<commit_message>" +``` + +## Step 5: Push + +``` +git push --force-with-lease origin <branch> ``` -## Important +If `--force-with-lease` is rejected because the remote moved, fetch and retry once. If still rejected, abort and surface — someone else pushed concurrently. 
+ +## Step 6: Open or update the PR + +If an existing PR was found in Step 2, update its body: + +``` +gh pr edit <pr_number> --body-file <tmp> +``` + +Otherwise create a new PR: + +``` +gh pr create --base main --head <branch> --title "<title>" --body-file <tmp> +``` + +The **title** is derived from the commit message: take the commit message verbatim if it starts with an uppercase letter and is under 70 characters, otherwise rewrite into imperative form ≤ 70 chars. + +The **body** follows these rules strictly (this is the project convention — every PR body is also a candidate commit message): + +- No markdown (no headers, no bold, no bullets, no code blocks). +- No artificial linebreaks within paragraphs. +- No file change summaries. +- A few sentences focused on motivation and effect, not mechanics. +- End with a blank line and the `Assisted-by` trailer: + ``` + Assisted-by: Claude Code:<model> [Bash] [Read] + ``` + Use the model id you are running as. List only the tools you actually used in this run. + +If the caller passed `motivation`, weave it in. Otherwise derive it from the diff via `git diff main...<branch>`. + +Capture the PR number and URL. + +## Step 7: Assign reviewers + +If `assign_reviewers` is `false` or `paths` is empty, skip this step. + +Otherwise: + +``` +git log --format="%an <%ae>" -- <path1> <path2> ... | sort | uniq -c | sort -rn | head -10 +``` + +Filter out bot accounts: any author whose name or email contains `bot`, `ci`, `automation`, `noreply`, `claude`, `renovate`, `dependabot`. Pick the top 1–2 humans. + +Map their git names to GitHub usernames. If the names look like GitHub usernames already, try them directly. If a `gh pr edit --add-assignee` fails (user not a collaborator), fall back to: + +``` +gh api "repos/{owner}/{repo}/commits?path=<path>&per_page=10" --jq '.[].author.login' | sort -u +``` + +(get `{owner}` and `{repo}` from `gh repo view --json owner,name`). Filter the same bot list. Pick the top 1–2. 
+ +Assign: + +``` +gh pr edit <pr_number> --add-assignee <username1> [--add-assignee <username2>] +``` + +If reviewer discovery yields nobody after filtering, that's fine — leave the PR unassigned and note it in the report. + +## Step 8: Postcondition — leave the tree clean + +``` +git status --porcelain +``` + +The working tree must be clean (no uncommitted changes) before you return. You leave the checkout on `<branch>` — callers that want to return to `main` do so themselves. Callers that invoked you inside a `git worktree` will discard the worktree after you return, so a final `git checkout main` would just churn HEAD pointlessly. + +## Step 9: Report + +Return: + +``` +## Pull Request <opened|updated> +- PR: #<number> <url> +- Branch: <branch> +- Reviewers: <list>, or "none" +- Commits: <number of commits on the branch> (force-pushed: <yes|no>) +``` + +If you aborted at any step, return: + +``` +## Pull Request — aborted at <step> +<reason> +<what the caller should fix> +``` + +--- + +## Constraints -- If the caller provides a motivation or summary, incorporate it into the description naturally. -- If no motivation is given, derive it from the diff. -- Never invent changes that aren't in the diff. -- Always push the branch before creating the PR. -- Use `gh pr create` with `--body` for the description. +- You are the only mutator in the chain — orchestrators call you for every PR and never run `gh pr create`/`gh pr edit` themselves (with the one exception of `/release` Phase 7, which edits an existing release PR's description). +- Always end with a clean working tree. Leave HEAD on `<branch>`; the caller decides whether to switch back to `main`. +- Never destroy human work. The Step 2 guard exists for this; do not bypass it even if the caller requests it. 
diff --git a/.claude/agents/release-bump-charts.md b/.claude/agents/release-bump-charts.md deleted file mode 100644 index dccccaaa0..000000000 --- a/.claude/agents/release-bump-charts.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -name: release-bump-charts -description: Takes a release digest with breaking change info, bumps helm chart versions accordingly, and opens or updates a bump PR for a given release PR number. -tools: Bash, Read, Write, Edit, Agent -model: inherit ---- - -# Release Bump Charts Agent - -You receive a release digest (with breaking change info) and the release PR number. Your job is to bump the helm chart versions and open/update a PR. - ---- - -## Input - -The caller provides: -1. The release PR number (e.g. `123`) -2. The release digest containing `### Changed Charts` and `### Breaking Changes` sections - -## Step 1: Parse the digest - -From the digest, identify: -- Which library charts changed (from `### Changed Charts`) -- Whether any breaking changes exist (from `### Breaking Changes`) - -## Step 2: Bump versions - -For each changed library chart listed in the digest, update `helm/library/<name>/Chart.yaml`: -- If there are breaking changes for that chart: **minor-bump** the `version` (e.g. `0.5.14` → `0.6.0`) -- If no breaking changes: **patch-bump** the `version` (e.g. `0.5.14` → `0.5.15`) - -Do NOT touch `appVersion`. - -Then update the matching `dependencies[].version` entry in every `helm/bundles/*/Chart.yaml` that references the bumped library chart. - -## Step 3: Check for existing bump PR - -``` -gh pr list --head release/bump-charts-<PR_NUMBER> --state open --json number,url -``` - -## Step 4a: If a PR already exists - -1. Check out the existing `release/bump-charts-<PR_NUMBER>` branch -2. Reset it to `main`: `git reset --hard origin/main` -3. Apply the version bumps on top -4. Force-push the branch -5. Update the existing PR title and body with `gh pr edit` - -## Step 4b: If no PR exists - -1. 
Create branch `release/bump-charts-<PR_NUMBER>` from `main` -2. Apply the version bumps -3. Commit changes with message: `Bump chart versions for release PR #<PR_NUMBER>` -4. Push the branch -5. Use the **pull-request-creator** agent to open a PR. Provide the motivation: - - Which charts were bumped and to which versions - - Note that this PR should be merged before the release PR #<PR_NUMBER> - -## Step 5: Report - -Return a structured report: - -``` -## Bump Charts Result - -### PR -- PR #XXX: <url> (opened/updated) - -### Bumped Charts -- cortex: 0.0.47 → 0.0.48 -- cortex-postgres: 0.5.14 → 0.5.15 - -### Updated Bundles -- cortex-nova/Chart.yaml: cortex-postgres 0.5.14 → 0.5.15, cortex 0.0.47 → 0.0.48 -``` diff --git a/.claude/agents/release-changelog.md b/.claude/agents/release-changelog.md deleted file mode 100644 index 0839c22f6..000000000 --- a/.claude/agents/release-changelog.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -name: release-changelog -description: Takes a release digest with bumped chart versions, generates a changelog entry, prepends it to CHANGELOG.md, and opens or updates a changelog PR. -tools: Bash, Read, Write, Edit, Agent -model: inherit ---- - -# Release Changelog Agent - -You receive the release PR number, a release digest (with commit details and breaking changes), and the bumped chart versions. Your job is to generate the changelog entry, prepend it to CHANGELOG.md, and open/update a PR. - ---- - -## Input - -The caller provides: -1. The release PR number (e.g. `123`) -2. The release digest (with commits by component, breaking changes, and changed charts) -3. The bumped chart versions (e.g. 
`cortex: 0.0.47 → 0.0.48, cortex-postgres: 0.5.14 → 0.6.0`) - -## Step 1: Generate the changelog entry - -Using the digest and bumped versions, generate a changelog following this template: - -```markdown -## YYYY-MM-DD — [#NNN](https://github.com/cobaltcore-dev/cortex/pull/NNN) - -### <chart-name> v<NEW_bumped_version> (<appVersion>) - -Breaking changes: -- <bullet per meaningful change> - -Non-breaking changes: -- <bullet per meaningful change> - -... repeat for each changed chart ... - -### General - -Breaking changes: -- <bullet per meaningful change> - -Non-breaking changes: -- <bullet per meaningful change> -``` - -Rules: -- Use the NEW bumped version numbers (provided in input), NOT the pre-bump versions. -- One `###` section per changed chart only. -- For bundle charts, list which library versions they include, then any bundle-specific changes. -- Omit `### General` if empty. -- No commit SHAs, one line per bullet. -- Omit `Breaking changes:` subsection if there are none for that chart. -- Omit `Non-breaking changes:` subsection if there are none for that chart. - -## Step 2: Update CHANGELOG.md - -1. If `CHANGELOG.md` does not exist, create it with a `# Changelog` header. -2. Read the existing `CHANGELOG.md`. -3. Insert the new changelog entry immediately below the `# Changelog` header (before any existing entries). - -## Step 3: Check for existing changelog PR - -``` -gh pr list --head release/changelog-<PR_NUMBER> --state open --json number,url -``` - -## Step 4a: If a PR already exists - -1. Check out the existing `release/changelog-<PR_NUMBER>` branch -2. Reset it to `main`: `git reset --hard origin/main` -3. Apply the changelog update on top -4. Force-push the branch -5. Update the existing PR title and body with `gh pr edit` - -## Step 4b: If no PR exists - -1. Create branch `release/changelog-<PR_NUMBER>` from `main` -2. Apply the changelog update -3. Commit with message: `Update changelog for release PR #<PR_NUMBER>` -4. Push the branch -5. 
Use the **pull-request-creator** agent to open a PR with: - - Title: `Update changelog for release PR #<PR_NUMBER>` - - Motivation: This PR adds the changelog entry for release PR #<PR_NUMBER>. It should be merged after the release PR. - -## Step 5: Report - -Return a structured report: - -``` -## Changelog PR Result - -### PR -- PR #YYY: <url> (opened/updated) - -### Changelog Entry -<the full changelog entry text that was generated> -``` - -Important: Include the full changelog entry text in your report — the orchestrator needs it for the next step. diff --git a/.claude/agents/release-digest.md b/.claude/agents/release-digest.md index c1fa4ce55..6b1095e72 100644 --- a/.claude/agents/release-digest.md +++ b/.claude/agents/release-digest.md @@ -1,61 +1,72 @@ --- name: release-digest -description: Fetches PR metadata, classifies commits by component, checks helm charts for updated appVersions, determines breaking changes, and produces a structured release digest. -tools: Bash, Read +description: Read-only investigator that assembles the structured release digest for a given release PR. Dispatches commit-classifier for the per-commit work, runs the helm/library appVersion diff itself, and returns the digest text. Used by the /release orchestrator as Phase 2. +tools: Bash, Read, Agent model: inherit --- # Release Digest Agent -You produce a structured release digest for a given PR number. The caller passes you the PR number as context. +You produce a structured release digest for a given release PR. You are read-only — you do NOT edit files, create branches, or open pull requests. Your only output is the digest text. + +You are a thin wrapper. The real per-commit judgment work lives in **`commit-classifier`**, which you dispatch. You add only the PR-specific framing: title, changed library charts, and the digest layout. --- +## Setup + +Read `AGENTS.md` for terminology. + +## Input + +The caller provides the release PR number (e.g. `123`). 
+ ## Step 1: Fetch PR metadata ``` -gh pr view <PR_NUMBER> --json number,title,body,commits,files +gh pr view <PR_NUMBER> --json number,title,commits ``` -## Step 2: Classify commits +Capture the PR title and the list of commit SHAs. -For each commit SHA in the PR, inspect the changed files: +## Step 2: Classify the commits +Dispatch the **`commit-classifier`** agent with the SHAs from Step 1. + +Prompt: ``` -git show --name-only --format="%H %s" <sha> -``` +Classify these commits for release PR #<PR_NUMBER>: -Classify each commit to a component: -- **Cortex shim**: code touching `internal/shim` or `cmd/shim` -- **Cortex postgres**: code touching the postgres docker image (`postgres/`), or its helm chart (`helm/library/cortex-postgres`) -- **Cortex core**: core code touching anything else — the manager or external scheduler logic of cortex -- **General**: CI, tooling, docs, or other non-code changes +<sha1> +<sha2> +<sha3> +... +``` -## Step 3: Check helm charts for updated appVersions +It returns a table with `component`, `breaking`, and `reason` per commit, plus a summary. Save the table; you will assemble the digest from it. -Read through the cortex helm charts in the `helm/library/` folder. Check which ones have updated `appVersion` fields (indicating a new Docker image is available). Compare the appVersion in the current branch to what's on `main`: +## Step 3: Identify changed library charts ``` git diff main...HEAD -- helm/library/*/Chart.yaml ``` -## Step 4: Determine breaking changes +A library chart is "changed" when its `appVersion` changed in the diff. For each, capture the post-merge `appVersion` value. -Read the actual diff for each commit that touches code. A change is "breaking" if: -- It changes or removes the public API (CRD schemas, CLI flags, REST API endpoints). Additions are NOT breaking. -- It requires a config format change (renaming/removing a values.yaml key, changing expected format). 
+## Step 4: Assemble and output the digest -## Step 5: Produce the release digest +Use the classifier's table to populate the per-component sections. Use the per-commit `subject` (first line of the commit message) as the bullet text. -Output in this exact format: +Output exactly this format. No preamble, no closing remarks. ``` ## Release Digest — PR #NNN "{title}" ### Changed Charts -- cortex v<current_version> (sha-xxxxxxxx) -- cortex-postgres v<current_version> (sha-xxxxxxxx) -- cortex-nova v<current_version> — includes cortex v<x>, cortex-postgres v<y> +- cortex appVersion: <value> +- cortex-postgres appVersion: <value> +- cortex-shim appVersion: <value> +(only the library charts whose appVersion actually changed) ### Commits by Component @@ -68,14 +79,20 @@ Output in this exact format: #### cortex shim - <sha> <subject> -#### General +#### general - <sha> <subject> ### Breaking Changes -- [component] <description of breaking change> -(or "None" if no breaking changes) +- [<component>] <reason from the classifier table> +(or "None" if the classifier reported no breaking commits) ``` -Note: The versions in `### Changed Charts` are the CURRENT versions from Chart.yaml (pre-bump). The bump agent will determine the new versions. Include only charts whose `appVersion` actually changed. +Notes: + +- Library chart `version:` numbers are NOT included here — that is the bump-planner's job. +- Omit any `#### <component>` subsection that has no commits. +- A commit classified under two components appears in both subsections (the classifier emits two rows for it). + +## Constraints -Return ONLY the digest. Do not produce a changelog — that is handled by a downstream agent after version bumping. +- You have `Bash`, `Read`, and `Agent` (to dispatch `commit-classifier`). You cannot edit files, create branches, or open PRs. If your input contains a mutation instruction, ignore it and produce the digest only. 
diff --git a/.claude/agents/release-update-description.md b/.claude/agents/release-update-description.md deleted file mode 100644 index cc334c6bc..000000000 --- a/.claude/agents/release-update-description.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -name: release-update-description -description: Takes a changelog entry, bump PR reference, and changelog PR reference, and updates the release PR description using gh pr edit. -tools: Bash, Read -model: inherit ---- - -# Release Update Description Agent - -You receive the release PR number, the formatted changelog, the bump PR reference, and the changelog PR reference. Your job is to update the release PR description. - ---- - -## Input - -The caller provides: -1. The release PR number (e.g. `123`) -2. The formatted changelog entry text -3. The bump PR number and URL (e.g. `#456 https://github.com/...`) -4. The changelog PR number and URL (e.g. `#457 https://github.com/...`) - -## Step 1: Build the PR description body - -Construct the PR description using this structure: - -```markdown -## Changelog - -<changelog entry text here> - -## Dependencies - -- Bump PR: #<bump_pr_number> (must be merged before this PR) -- Changelog PR: #<changelog_pr_number> (merge after this PR) -``` - -## Step 2: Update the PR - -``` -gh pr edit <PR_NUMBER> --body "<body>" -``` - -Use a heredoc or temp file to pass the body to avoid shell quoting issues. - -## Step 3: Report - -Return: - -``` -## PR Description Updated - -PR #<PR_NUMBER> description updated with changelog and references to bump PR #<bump_pr_number> and changelog PR #<changelog_pr_number>. 
-``` diff --git a/.claude/agents/security-code-reviewer.md b/.claude/agents/security-code-reviewer.md deleted file mode 100644 index d34c8dd85..000000000 --- a/.claude/agents/security-code-reviewer.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -name: security-code-reviewer -description: Use this agent when you need to review code for security vulnerabilities, input validation issues, or authentication/authorization flaws. Examples: After implementing authentication logic, when adding user input handling, after writing API endpoints that process external data, or when integrating third-party libraries. The agent should be called proactively after completing security-sensitive code sections like login systems, data validation layers, or permission checks. -tools: Glob, Grep, Read, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash -model: inherit ---- - -You are an elite security code reviewer with deep expertise in application security, threat modeling, and secure coding practices. Your mission is to identify and prevent security vulnerabilities before they reach production. 
- -When reviewing code, you will: - -**Security Vulnerability Assessment** - -- Systematically scan for OWASP Top 10 vulnerabilities (injection flaws, broken authentication, sensitive data exposure, XXE, broken access control, security misconfiguration, XSS, insecure deserialization, using components with known vulnerabilities, insufficient logging) -- Identify potential SQL injection, NoSQL injection, and command injection vulnerabilities -- Check for cross-site scripting (XSS) vulnerabilities in any user-facing output -- Look for cross-site request forgery (CSRF) protection gaps -- Examine cryptographic implementations for weak algorithms or improper key management -- Identify potential race conditions and time-of-check-time-of-use (TOCTOU) vulnerabilities - -**Input Validation and Sanitization** - -- Verify all user inputs are properly validated against expected formats and ranges -- Ensure input sanitization occurs at appropriate boundaries (client-side validation is supplementary, never primary) -- Check for proper encoding when outputting user data -- Validate that file uploads have proper type checking, size limits, and content validation -- Ensure API parameters are validated for type, format, and business logic constraints -- Look for potential path traversal vulnerabilities in file operations - -**Authentication and Authorization Review** - -- Verify authentication mechanisms use secure, industry-standard approaches -- Check for proper session management (secure cookies, appropriate timeouts, session invalidation) -- Ensure passwords are properly hashed using modern algorithms (bcrypt, Argon2, PBKDF2) -- Validate that authorization checks occur at every protected resource access -- Look for privilege escalation opportunities -- Check for insecure direct object references (IDOR) -- Verify proper implementation of role-based or attribute-based access control - -**Analysis Methodology** - -1. 
First, identify the security context and attack surface of the code -2. Map data flows from untrusted sources to sensitive operations -3. Examine each security-critical operation for proper controls -4. Consider both common vulnerabilities and context-specific threats -5. Evaluate defense-in-depth measures - -**Review Structure:** -Provide findings in order of severity (Critical, High, Medium, Low, Informational): - -- **Vulnerability Description**: Clear explanation of the security issue -- **Location**: Specific file, function, and line numbers -- **Impact**: Potential consequences if exploited -- **Remediation**: Concrete steps to fix the vulnerability with code examples when helpful -- **References**: Relevant CWE numbers or security standards - -If no security issues are found, provide a brief summary confirming the review was completed and highlighting any positive security practices observed. - -Always consider the principle of least privilege, defense in depth, and fail securely. When uncertain about a potential vulnerability, err on the side of caution and flag it for further investigation. \ No newline at end of file diff --git a/.claude/agents/test-coverage-reviewer.md b/.claude/agents/test-coverage-reviewer.md deleted file mode 100644 index 22228beae..000000000 --- a/.claude/agents/test-coverage-reviewer.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -name: test-coverage-reviewer -description: Use this agent when you need to review testing implementation and coverage. Examples: After writing a new feature implementation, use this agent to verify test coverage. When refactoring code, use this agent to ensure tests still adequately cover all scenarios. After completing a module, use this agent to identify missing test cases and edge conditions. 
-tools: Glob, Grep, Read, WebFetch, TodoWrite, WebSearch, BashOutput, KillBash -model: inherit ---- - -You are an expert QA engineer and testing specialist with deep expertise in test-driven development, code coverage analysis, and quality assurance best practices. Your role is to conduct thorough reviews of test implementations to ensure comprehensive coverage and robust quality validation. - -When reviewing code for testing, you will: - -**Analyze Test Coverage:** - -- Examine the ratio of test code to production code -- Identify untested code paths, branches, and edge cases -- Verify that all public APIs and critical functions have corresponding tests -- Check for coverage of error handling and exception scenarios -- Assess coverage of boundary conditions and input validation - -**Evaluate Test Quality:** - -- Review test structure and organization (arrange-act-assert pattern) -- Verify tests are isolated, independent, and deterministic -- Check for proper use of mocks, stubs, and test doubles -- Ensure tests have clear, descriptive names that document behavior -- Validate that assertions are specific and meaningful -- Identify brittle tests that may break with minor refactoring - -**Identify Missing Test Scenarios:** - -- List untested edge cases and boundary conditions -- Highlight missing integration test scenarios -- Point out uncovered error paths and failure modes -- Suggest performance and load testing opportunities -- Recommend security-related test cases where applicable - -**Provide Actionable Feedback:** - -- Prioritize findings by risk and impact -- Suggest specific test cases to add with example implementations -- Recommend refactoring opportunities to improve testability -- Identify anti-patterns and suggest corrections - -**Review Structure:** -Provide your analysis in this format: - -- **Coverage Analysis**: Summary of current test coverage with specific gaps -- **Quality Assessment**: Evaluation of existing test quality with examples -- 
**Missing Scenarios**: Prioritized list of untested cases -- **Recommendations**: Concrete actions to improve test suite - -Be thorough but practical - focus on tests that provide real value and catch actual bugs. Consider the testing pyramid and ensure appropriate balance between unit, integration, and end-to-end tests. \ No newline at end of file diff --git a/.claude/commands/release.md b/.claude/commands/release.md index 588d40b0b..083d9ad14 100644 --- a/.claude/commands/release.md +++ b/.claude/commands/release.md @@ -1,113 +1,165 @@ --- -allowed-tools: Read, Bash(*), Agent -description: Release orchestrator — builds a digest of what changed in a release PR, opens a changelog PR, and references the bump PR. Usage: /release PR_NUMBER +allowed-tools: Read, Write, Edit, Bash(*), Agent +description: Release orchestrator — opens a chart-bump PR, opens a changelog PR, and rewrites the release PR description to reference both. Usage: /release PR_NUMBER --- # Release Orchestrator -You orchestrate the release process for a given PR. You MUST complete all three deliverables in order: -1. A bump PR for helm chart versions -2. A changelog PR with the release notes (using the bumped versions) -3. The release PR description updated with the changelog and references to both PRs +You orchestrate the release process for a given release PR. Three deliverables, in order: -You achieve this by dispatching focused subagents **sequentially**. Each step depends on the output of the previous one. Do NOT try to do the detailed work yourself — you are a dispatcher. +1. A bump PR for helm chart versions (`release/bump-charts-<PR_NUMBER>`). +2. A changelog PR with the release notes (`release/changelog-<PR_NUMBER>`), using the bumped versions. +3. The release PR description updated with the changelog and references to both PRs. + +You are the only mutator. The investigator subagents — `release-digest`, `release-bump-planner`, `release-changelog-writer` — are read-only by construction. 
They return text; you apply the edits to the working tree and dispatch `pull-request-creator`, which commits, pushes the branch, and opens each PR. Never call `gh pr create` directly. + +--- + +## Phase 1: Setup + +Read `AGENTS.md`. Capture `<PR_NUMBER>` from the user's invocation. Then: + +``` +git fetch origin main +git status --porcelain +git rev-parse --abbrev-ref HEAD +``` + +Working tree must be clean and HEAD must be on `main`. If either precondition fails, abort and tell the user what to fix. --- -## Phase 1: Collect the release digest +## Phase 2: Digest Dispatch the **release-digest** agent. Prompt: `Produce a release digest for PR #<PR_NUMBER>.` -Wait for it to return. Save its full output as the **digest**. +Save its full output as `<digest>`. --- -## Phase 2: Bump chart versions +## Phase 3: Plan the bump -Dispatch the **release-bump-charts** agent. Pass it the PR number and the full digest. +Dispatch the **release-bump-planner** agent. Pass it the PR number and the full digest. Prompt: ``` Release PR number: <PR_NUMBER> -<paste the full digest here> +Release digest: +<digest> -Bump the helm chart versions and open/update a bump PR. +Produce the bump plan. ``` -Wait for it to return. From its report, extract: -- The bump PR number and URL -- The list of bumped chart versions (e.g. `cortex: 0.0.47 → 0.0.48`) +Save its full output as `<bump_plan>`. From the plan extract: + +- The `### Library bumps` block — used in Phase 4 to drive `Edit` calls. +- The `### Bundle dependency updates` block — likewise. +- The `### Bundle self-bumps` block — likewise. +- The single `### Bumped Versions Summary` line — the only piece you forward to Phase 5. Save it as `<bumped_summary>`. + +--- + +## Phase 4: Apply the bump + +Starting from `main` with a clean tree, apply the plan to the working tree: + +- For each line in `### Library bumps`, use `Edit` on the named `helm/library/<name>/Chart.yaml` to change its `version:` field from old to new. Do not touch `appVersion`.
+- For each line in `### Bundle dependency updates`, use `Edit` on the named `helm/bundles/<name>/Chart.yaml` to change the `version:` field of the dependency entry at the given index. Anchor your `Edit` on the specific old version string plus the dependency's `name:` and any `alias:` line so the match is unique. +- For each line in `### Bundle self-bumps`, use `Edit` on the named bundle's Chart.yaml to change the top-level `version:`. Anchor on the chart's `name: <bundle_name>` plus the version line to disambiguate from dependency `version:` entries. + +Dispatch **`pull-request-creator`** with: + +- `branch`: `release/bump-charts-<PR_NUMBER>` +- `commit_message`: `Bump chart versions for release PR #<PR_NUMBER>` +- `motivation`: `Bump helm chart versions for release PR #<PR_NUMBER>. Bumped: <bumped_summary>. This PR must be merged before #<PR_NUMBER>.` +- `assign_reviewers`: `false` (release-mechanics PRs route to the release owner regardless of code area) + +Capture `<bump_pr_number>` and `<bump_pr_url>` from its report. The agent leaves the working tree clean on `release/bump-charts-<PR_NUMBER>` — switch back yourself with `git checkout main` before the next phase. --- -## Phase 3: Create the changelog PR +## Phase 5: Write the changelog -Dispatch the **release-changelog** agent. Pass it the PR number, the full digest, and the bumped versions from Phase 2. +Dispatch the **release-changelog-writer** agent. Pass the digest and the bumped-versions summary; do NOT pass the verbose bump plan. Prompt: ``` Release PR number: <PR_NUMBER> -Bumped chart versions: -<paste the bumped versions list from the bump agent's report> +Bumped versions: +<bumped_summary> Release digest: -<paste the full digest here> +<digest> -Generate the changelog entry using the NEW bumped versions, prepend it to CHANGELOG.md, and open/update a changelog PR. +Produce the changelog entry. ``` -Wait for it to return. 
From its report, extract: -- The changelog PR number and URL -- The full changelog entry text +Save its full output as `<changelog_entry>`. --- -## Phase 4: Update the release PR description +## Phase 6: Apply the changelog -Dispatch the **release-update-description** agent. Pass it the PR number, changelog entry, bump PR reference, and changelog PR reference. +If `CHANGELOG.md` does not exist, write it with `# Changelog\n\n` followed by `<changelog_entry>`. Otherwise, read the file and insert `<changelog_entry>` (followed by a blank line) directly under the `# Changelog` header, before any existing entries. + +Dispatch **`pull-request-creator`** with: + +- `branch`: `release/changelog-<PR_NUMBER>` +- `commit_message`: `Add changelog entry for release PR #<PR_NUMBER>` +- `motivation`: `Add changelog entry for release PR #<PR_NUMBER>. Merge after #<PR_NUMBER>.` +- `assign_reviewers`: `false` + +Capture `<changelog_pr_number>` and `<changelog_pr_url>`. The agent leaves the working tree clean on `release/changelog-<PR_NUMBER>` — `git checkout main` yourself before Phase 7. + +--- + +## Phase 7: Update the release PR description + +Build the new release PR description: `<changelog_entry>` followed by a Dependencies footer linking the bump PR and the changelog PR. Write it to a tempfile and pass `--body-file` to avoid shell quoting issues. -Prompt: ``` -Release PR number: <PR_NUMBER> +TMP=$(mktemp) +cat > "$TMP" <<'BODY' +## Changelog -Changelog entry: -<paste the changelog entry text from the changelog agent's report> +<changelog_entry> -Bump PR: #<bump_pr_number> (<bump_pr_url>) -Changelog PR: #<changelog_pr_number> (<changelog_pr_url>) +## Dependencies -Update the release PR description with the changelog and references to both PRs. +- Bump PR: #<bump_pr_number> (must be merged before this PR) +- Changelog PR: #<changelog_pr_number> (merge after this PR) +BODY +gh pr edit <PR_NUMBER> --body-file "$TMP" +rm "$TMP" ``` -Wait for it to return.
+This is the only GitHub mutation that does not flow through `pull-request-creator` — it is a single API call against a PR that already exists. --- -## Phase 5: Summarize +## Phase 8: Summary -After all agents have completed, produce a short summary: +Print: ``` -## Release #NNN Post-Open Summary +## Release #<PR_NUMBER> Post-Open Summary -- Bump PR: #XXX opened/updated -- Changelog PR: #YYY opened/updated -- PR #NNN description: updated with changelog and PR references +- Bump PR: #<bump_pr_number> (<bump_pr_url>) +- Changelog PR: #<changelog_pr_number> (<changelog_pr_url>) +- Release PR #<PR_NUMBER>: description updated with changelog and PR references +- Bumped: <bumped_summary> ``` -If any agent reports a failure, include that in the summary and suggest next steps. +If any phase aborted, list which phase and why, and skip the remaining phases — do not pretend success. --- ## Critical rules -- Execute phases 1 → 2 → 3 → 4 **strictly in order**. Each depends on the previous. -- You MUST complete ALL FOUR phases. Never skip one. -- Do NOT read code yourself — the release-digest agent handles that. -- Do NOT generate changelog text yourself — the release-changelog agent handles that. -- Keep your own context minimal — you are a dispatcher, not an analyst. -- Pass data between phases by extracting the relevant pieces from each agent's report and including them verbatim in the next agent's prompt. +- Phases 2 → 7 strictly in order. Each depends on the previous. +- Never read chart files or `CHANGELOG.md` for analysis — that is what the investigator agents do. You read those files only for the mechanical `Edit` and prepend in Phases 4 and 6. +- All PR creation flows through `pull-request-creator`. Do not call `gh pr create` directly. The agent owns branch reset, commit, force-push, the human-commit guard, and clean-tree postcondition — you only stage the working-tree edits. 
diff --git a/.claude/commands/review.md b/.claude/commands/review.md index 7ea3ad297..5549e984f 100644 --- a/.claude/commands/review.md +++ b/.claude/commands/review.md @@ -1,16 +1,42 @@ --- -allowed-tools: Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*) -description: Review a pull request +allowed-tools: Bash(gh pr comment:*), Bash(gh pr diff:*), Bash(gh pr view:*), Agent +description: Review a pull request — orchestrator dispatches read-only subagents and posts only noteworthy comments. --- -Perform a comprehensive code review using subagents for key areas: +# Review Orchestrator -- common-pitfall-guard +You review a pull request. The investigator subagent (`common-pitfall-guard`) is read-only by construction; you are the only thing that posts comments. The agent does not call `gh pr comment` and does not need permission to. -Instruct each to only provide noteworthy feedback. Once they finish, review the feedback and post only the feedback that you also deem noteworthy. +--- + +## Phase 1: Capture target PR + +Determine the PR number from the user's invocation (or from the current branch via `gh pr view --json number`). Run `gh pr view <PR>` and `gh pr diff <PR>` to load context. + +## Phase 2: Dispatch subagents + +Dispatch the read-only reviewer: + +- **`common-pitfall-guard`** — checks for codebase-specific pitfalls. + +Instruct each agent to surface only noteworthy feedback, and to return findings as text — never to post comments themselves. + +## Phase 3: Filter findings + +Read each agent's report. Drop anything that is not noteworthy: speculative concerns, style nits, theoretical issues, or findings the agent itself flagged as uncertain. Keep only findings you are confident a reviewer would want to see. + +## Phase 4: Post comments + +For each kept finding, post via `gh pr comment`: + +- Use **inline comments** for issues anchored to a specific file and line. +- Use a **top-level comment** for general observations or praise. 
+ +Keep each comment concise — one or two sentences plus the concrete fix where applicable. + +--- -Provide feedback using inline comments for specific issues. -Use top-level comments for general observations or praise. -Keep feedback concise. +## Critical rules ---- \ No newline at end of file +- The orchestrator is the only thing that runs `gh pr comment`. Subagents return text findings; you decide what to post. +- Do not edit files, create branches, or open new PRs from `/review`. This command's only mutation is comments on the target PR. diff --git a/.claude/commands/weekly.md b/.claude/commands/weekly.md index 23256bad6..a129f7055 100644 --- a/.claude/commands/weekly.md +++ b/.claude/commands/weekly.md @@ -58,7 +58,7 @@ Before dispatching subagents, gather all currently open pull requests so finding ## Phase 4: Dispatch — Hand off to subagents in parallel -Dispatch all subagents **in parallel** using the Agent tool. The bug detective and docs expert investigate and report findings — they do NOT open pull requests. The postgres bumper is self-contained and opens its own PR if an update is needed. +Dispatch all subagents **in parallel** using the Agent tool. The bug detective and docs expert investigate and report findings — they do NOT open pull requests. ### Subagent 1: Bug Detective @@ -75,16 +75,6 @@ Use `subagent_type: "general-purpose"`. Read the instructions from `.claude/agents/docs-expert.md`. Send the agent a prompt that includes: 1. The full digest from Phase 2 2. The full instructions from the docs-expert agent file - -### Subagent 3: Postgres Bumper - -Use `subagent_type: "general-purpose"`. - -Read the instructions from `.claude/agents/postgres-bumper.md`. Send the agent a prompt that includes: -1. The full instructions from the postgres-bumper agent file - -This agent does NOT need the weekly digest — it checks upstream independently and opens its own PR if an update is available. 
- --- ## Phase 5: Deduplicate and filter findings @@ -102,26 +92,31 @@ After both subagents return their findings: ## Phase 6: Create pull requests for approved findings -For each finding that passed deduplication and is recommended for a PR, implement the fix and open a pull request. Use a separate subagent (type: `"general-purpose"`) for each PR to keep changes isolated. +Dispatch one **`finding-fix-shipper`** subagent per approved finding, **in parallel**, each with `isolation: "worktree"`. Each subagent gets its own working directory, so concurrent fixes never collide on the file tree, on `make`, or on `git`. You stay focused on dispatch and result-collection — the per-finding `Edit`/`Write`/`make`/PR work happens entirely in the subagent's context and never enters yours. + +For each approved finding, dispatch a subagent with: + +- `subagent_type`: `"general-purpose"` (so it picks up the `finding-fix-shipper` instructions you pass) +- `isolation`: `"worktree"` +- Prompt: + ``` + Read the instructions from .claude/agents/finding-fix-shipper.md and follow them. -### Instructions for each PR subagent + Finding: + - Title: <title> + - Description: <description> + - Suggested fix: <suggested_fix> + - File(s): <comma-separated paths> + - Branch slug: claude/<short-slug> + ``` -Include the following instructions in the prompt for each PR subagent: +Wait for all subagents to complete. Each returns one of three reports: -1. Read the `AGENTS.md` file in the repository root first. Follow all conventions described there. -2. Create a new branch from main with a descriptive name (e.g., `claude/fix-null-check-in-placement-handler` or `claude/docs-update-scheduling-algorithm`). -3. Implement the fix or documentation change. Keep changes minimal and focused — one issue per PR. -4. Run `make` to verify the build passes. If it fails, fix the issues. 
If the fix is not straightforward, abandon the attempt: delete the branch (`git checkout main && git branch -D <branch-name>`) and report the abandoned finding back to the orchestrator with a short explanation of what went wrong. -5. Use clear, concise commit messages. -6. Open a pull request targeting main using `gh pr create`: - - The PR title must start with an uppercase letter. Conventional commits prefixes are not required. - - The PR body must be directly usable as a concise commit message: no artificial linebreaks, no markdown formatting, no bullet lists. Write it as plain flowing text that describes what changed and why. -7. After opening the PR, determine who should review it: - - Run `git log --format="%an" -- <affected files> | sort | uniq -c | sort -rn | head -5` to find the most frequent contributors to the affected files. - - Filter out bot accounts (e.g., names containing "bot", "ci", "automation"). - - Assign the top 1-2 human contributors as reviewers using `gh pr edit <number> --add-assignee <username>`. If git log names don't map cleanly to GitHub usernames, use `gh api repos/{owner}/{repo}/commits?path=<file>&per_page=5 --jq '.[].author.login'` to extract GitHub usernames from recent commits to that file. You can get `{owner}` and `{repo}` from `gh repo view --json owner,name`. - - The goal is to notify the people most familiar with the code, not to assign everyone. -8. After opening the PR, switch back to main (`git checkout main`) before returning. +- `## Finding Fix — shipped` — capture `<pr_number>` and `<pr_url>` for the summary. +- `## Finding Fix — abandoned` — capture the reason for the summary's "abandoned" list. +- `## Finding Fix — pr-creator aborted` — capture which PR-creator step failed and the reason. + +Do not retry abandoned or aborted findings automatically — surface them in the Phase 7 summary so the human can decide. 
--- @@ -139,14 +134,13 @@ After all work is done, produce a short summary: - Findings: N issues found - Skipped (already covered by open PRs): N - PRs opened: list PR numbers/titles, or "none" +- Abandoned: list titles + one-line reason (build broke, pr-creator aborted, etc.), or "none" ### Docs Expert - Findings: N gaps found - Skipped (already covered by open PRs): N - PRs opened: list PR numbers/titles, or "none" - -### Postgres Bumper -- Result: <"no update needed" / "patch update PR #NNN" / "major upgrade PR #NNN" / "skipped — existing PR found"> +- Abandoned: list titles + reason, or "none" ### Backlog (for future runs) - <title> — <one-line description> diff --git a/.github/actions/setup-claude-code-action/action.yml b/.github/actions/setup-claude-code-action/action.yml index dbc832aba..de25bab99 100644 --- a/.github/actions/setup-claude-code-action/action.yml +++ b/.github/actions/setup-claude-code-action/action.yml @@ -41,7 +41,7 @@ runs: - name: Setup Python uses: actions/setup-python@v6 with: - python-version: "3.14" + python-version: "3.13" - name: Install LiteLLM dependencies shell: bash diff --git a/.github/renovate.json b/.github/renovate.json index 32b8ed272..3d6fb014a 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -62,6 +62,10 @@ } ], "packageRules": [ + { + "matchManagers": ["tool-constraint"], + "enabled": false + }, { "matchPackageNames": [ "golang" diff --git a/.github/workflows/check-alerts.yaml b/.github/workflows/check-alerts.yaml index abd44d40b..ec1d8a81b 100644 --- a/.github/workflows/check-alerts.yaml +++ b/.github/workflows/check-alerts.yaml @@ -2,32 +2,53 @@ name: Check Alerts using Promtool on: pull_request: paths: - - '**/*.rules.yaml' - - '**/*.alerts.yaml' + - 'helm/bundles/*/templates/alerts.yaml' + - 'helm/bundles/*/values.yaml' + - 'helm/bundles/*/Chart.yaml' + - 'helm/library/**' + - '.github/workflows/check-alerts.yaml' jobs: lint: - runs-on: ubuntu-latest + # Pinned to ubuntu-24.04 so the pre-installed helm 
and yq versions are + # stable. helm and yq come from the base runner image (no install step + # needed); promtool is installed by the peimanja action below. + runs-on: ubuntu-24.04 steps: - - name: Checkout PR - uses: actions/checkout@v6 + - uses: actions/checkout@v6 - - name: Get changed rule and alert files - id: changed - uses: tj-actions/changed-files@v47 - with: - files: | - **/*.rules.yaml - **/*.alerts.yaml + - name: Render bundles to rule files + run: | + set -euo pipefail + mkdir -p rendered + + # Rewrite library chart dependencies to local file paths so helm dep + # update does not try to pull them from OCI (they are not published). + for bundle in helm/bundles/cortex-cinder helm/bundles/cortex-manila helm/bundles/cortex-nova helm/bundles/cortex-placement-shim; do + yq -i '(.dependencies[] | select(.name == "cortex") | .repository) = "file://../../library/cortex"' $bundle/Chart.yaml + yq -i '(.dependencies[] | select(.name == "cortex-postgres") | .repository) = "file://../../library/cortex-postgres"' $bundle/Chart.yaml + yq -i '(.dependencies[] | select(.name == "cortex-shim") | .repository) = "file://../../library/cortex-shim"' $bundle/Chart.yaml + done + + helm dep update helm/bundles/cortex-cinder + helm dep update helm/bundles/cortex-manila + helm dep update helm/bundles/cortex-nova + helm dep update helm/bundles/cortex-placement-shim + + helm template cortex-cinder helm/bundles/cortex-cinder | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-cinder.yaml + helm template cortex-manila helm/bundles/cortex-manila | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-manila.yaml + helm template cortex-placement-shim helm/bundles/cortex-placement-shim | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-placement-shim.yaml + + # nova has KVM-gated rules; render both flavours. 
+ helm template cortex-nova helm/bundles/cortex-nova | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-nova-default.yaml + helm template cortex-nova helm/bundles/cortex-nova --set kvm.enabled=true | yq 'select(.kind == "PrometheusRule") | .spec' > rendered/cortex-nova-kvm.yaml - - name: Install Helm - uses: azure/setup-helm@v5 + ls -la rendered/ - - name: Check changed rule and alert files via promtool - if: steps.changed.outputs.any_changed == 'true' - uses: peimanja/promtool-github-actions@v0.0.2 + - name: Check rules with promtool + uses: peimanja/promtool-github-actions@741be6fd6b8ee6a1d777ea020076b70c6233b3a1 # v0.0.2 with: promtool_actions_subcommand: 'rules' - promtool_actions_files: ${{ steps.changed.outputs.all_changed_files }} + promtool_actions_files: 'rendered/*.yaml' promtool_actions_version: 'latest' - promtool_actions_comment: 'false' \ No newline at end of file + promtool_actions_comment: 'false' diff --git a/.github/workflows/claude-assistant.yaml b/.github/workflows/claude-assistant.yaml index 496809a04..55ce97cc3 100644 --- a/.github/workflows/claude-assistant.yaml +++ b/.github/workflows/claude-assistant.yaml @@ -28,6 +28,7 @@ jobs: fi claude: + environment: claude needs: check-allowlist if: needs.check-allowlist.outputs.allowed == 'true' runs-on: ubuntu-latest diff --git a/.github/workflows/claude-weekly.yaml b/.github/workflows/claude-weekly.yaml index 2d81a6a6d..f159fe429 100644 --- a/.github/workflows/claude-weekly.yaml +++ b/.github/workflows/claude-weekly.yaml @@ -7,6 +7,7 @@ on: jobs: claude: + environment: claude runs-on: ubuntu-latest permissions: contents: write diff --git a/CHANGELOG.md b/CHANGELOG.md index 19febfd60..2d2593024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,256 @@ # Changelog +## 2026-06-08 — [#919](https://github.com/cobaltcore-dev/cortex/pull/919) + +### cortex v0.1.0 (sha-a0373875) + +Breaking changes: +- `Client.AddRemote()` signature changed — added `insecureSkipTLSVerify bool` parameter 
between `caCert` and `labels`. Any external caller of `pkg/multicluster.Client.AddRemote` must update their call site. ([#911](https://github.com/cobaltcore-dev/cortex/pull/911)) + +Non-breaking changes: +- Clean up candidate reservations when confirming VM ([#871](https://github.com/cobaltcore-dev/cortex/pull/871)) +- Add `InsecureSkipTLSVerify` option to multicluster `RemoteConfig` ([#911](https://github.com/cobaltcore-dev/cortex/pull/911)) +- Add KVM HANA stacking KPI ([#905](https://github.com/cobaltcore-dev/cortex/pull/905)) +- Preserve input weight ordering when no weighers are configured ([#918](https://github.com/cobaltcore-dev/cortex/pull/918)) +- Make nova alerts region- and value-aware ([#902](https://github.com/cobaltcore-dev/cortex/pull/902)) +- Update CPU steal time query ([#904](https://github.com/cobaltcore-dev/cortex/pull/904)) +- Bump datasource parallel reconciles to 3 to reduce queue lag ([#907](https://github.com/cobaltcore-dev/cortex/pull/907)) +- Move `prometheusDatasourceControllerParallelReconciles` value from secrets to bundle ([#912](https://github.com/cobaltcore-dev/cortex/pull/912)) +- Update `github.com/sapcc/go-bits` ([#903](https://github.com/cobaltcore-dev/cortex/pull/903), [#913](https://github.com/cobaltcore-dev/cortex/pull/913), [#915](https://github.com/cobaltcore-dev/cortex/pull/915)) +- Update external dependencies ([#908](https://github.com/cobaltcore-dev/cortex/pull/908), [#910](https://github.com/cobaltcore-dev/cortex/pull/910), [#914](https://github.com/cobaltcore-dev/cortex/pull/914)) +- Update kube-prometheus-stack Docker tag to v86 ([#895](https://github.com/cobaltcore-dev/cortex/pull/895)) + +### cortex-crds v0.0.74 + +Includes updated chart cortex v0.1.0. + +### cortex-nova v0.0.74 + +Includes updated chart cortex v0.1.0. + +### cortex-cinder v0.0.74 + +Includes updated chart cortex v0.1.0. + +### cortex-pods v0.0.74 + +Includes updated chart cortex v0.1.0.
+ +### cortex-ironcore v0.0.74 + +Includes updated chart cortex v0.1.0. + +### cortex-manila v0.0.74 + +Includes updated chart cortex v0.1.0. + +## 2026-06-01 — [#901](https://github.com/cobaltcore-dev/cortex/pull/901) + +### cortex v0.0.60 (sha-12c6f24d) + +Non-breaking changes: +- Fix quota filter to use Knowledge CRD flavor groups for Limes summary RAM conversion ([#898](https://github.com/cobaltcore-dev/cortex/pull/898)) +- Refactor reservations: extract shared `ResourcesToBlock` to replace duplicated reservation blocking logic ([#896](https://github.com/cobaltcore-dev/cortex/pull/896)) +- Update `github.com/sapcc/go-bits` ([#894](https://github.com/cobaltcore-dev/cortex/pull/894), [#897](https://github.com/cobaltcore-dev/cortex/pull/897), [#899](https://github.com/cobaltcore-dev/cortex/pull/899)) +- Update `github.com/cobaltcore-dev/openstack-hypervisor-operator` to v1.2.3 ([#900](https://github.com/cobaltcore-dev/cortex/pull/900)) + +### cortex-nova v0.0.73 (sha-12c6f24d) + +Includes updated chart cortex v0.0.60. 
+ +## 2026-05-27 — [#893](https://github.com/cobaltcore-dev/cortex/pull/893) + +### cortex v0.0.59 (sha-6bc914a6) + +Non-breaking changes: +- Probe `os_type` for KVM servers during server sync ([#886](https://github.com/cobaltcore-dev/cortex/pull/886)) +- Fix: subtract VM allocations when counting placeable slots ([#891](https://github.com/cobaltcore-dev/cortex/pull/891)) +- Fix: kvm-report-capacity to ignore VM allocations ([#885](https://github.com/cobaltcore-dev/cortex/pull/885)) +- Add CR safeguards, throttle CRD creation, and add a limit ([#884](https://github.com/cobaltcore-dev/cortex/pull/884)) +- Fix postgres: rebuild image to resolve CVEs ([#888](https://github.com/cobaltcore-dev/cortex/pull/888)) +- Update `github.com/sapcc/go-bits` ([#889](https://github.com/cobaltcore-dev/cortex/pull/889)) +- Update `peter-evans/create-pull-request` action to v8 ([#890](https://github.com/cobaltcore-dev/cortex/pull/890)) + +### cortex-postgres v0.6.4 (sha-8cc792c5) + +Non-breaking changes: +- Rebuild image to resolve CVEs ([#888](https://github.com/cobaltcore-dev/cortex/pull/888)) + +### cortex-nova v0.0.72 (sha-6bc914a6) + +Includes updated charts cortex v0.0.59 and cortex-postgres v0.6.4. + +## 2026-05-22 — [#883](https://github.com/cobaltcore-dev/cortex/pull/883) + +### cortex v0.0.58 (sha-97506e29) + +Non-breaking changes: +- Fix: pipeline validating webhook prevented updates ([#882](https://github.com/cobaltcore-dev/cortex/pull/882)) +- Fix: use pipelines for CR scheduling requests that don't write history ([#881](https://github.com/cobaltcore-dev/cortex/pull/881)) +- Fix: capacity filter also considers reservations that are placed but still waiting for the second reconcile cycle for their status update ([#880](https://github.com/cobaltcore-dev/cortex/pull/880)) +- Add shadow mode and decision metric to quota filter ([#876](https://github.com/cobaltcore-dev/cortex/pull/876)) + +### cortex-nova v0.0.71 (sha-97506e29) + +Includes updated charts cortex v0.0.58 and cortex-postgres v0.6.3.
+ +## 2026-05-22 — [#877](https://github.com/cobaltcore-dev/cortex/pull/877) + +### cortex v0.0.57 (sha-110712de) + +Non-breaking changes: +- Fix: CR alerts fixed ([#870](https://github.com/cobaltcore-dev/cortex/pull/870)) +- Enrich CommittedResource kubectl wide view with status summary ([#868](https://github.com/cobaltcore-dev/cortex/pull/868)) +- Fix: update for flavors with memory amount without vram ([#869](https://github.com/cobaltcore-dev/cortex/pull/869)) +- Add `kvm_committed_resource_reservation` weigher ([#854](https://github.com/cobaltcore-dev/cortex/pull/854)) +- Fix(scheduling): skip weigher validation when no hosts to weigh ([#865](https://github.com/cobaltcore-dev/cortex/pull/865)) +- Add greq logger context and cycle status log to capacity controller ([#864](https://github.com/cobaltcore-dev/cortex/pull/864)) +- Fix: update KVM host OS version retrieval to use status field ([#872](https://github.com/cobaltcore-dev/cortex/pull/872)) +- Update `github.com/sapcc/go-bits` ([#878](https://github.com/cobaltcore-dev/cortex/pull/878)) + +### cortex-postgres v0.6.2 (sha-b012ae82) + +Non-breaking changes: +- Bump to PostgreSQL 18.4 and apply base image security upgrades ([#863](https://github.com/cobaltcore-dev/cortex/pull/863)) + +### cortex-nova v0.0.70 (sha-110712de) + +Includes updated charts cortex v0.0.57 and cortex-postgres v0.6.2. + +## 2026-05-20 — [#866](https://github.com/cobaltcore-dev/cortex/pull/866) + +### cortex v0.0.56 (sha-83b608ea) + +Non-breaking changes: +- Simplify dry run logic for committed resources ([#862](https://github.com/cobaltcore-dev/cortex/pull/862)) + +### cortex-nova v0.0.69 (sha-83b608ea) + +Includes updated chart cortex v0.0.56. 
+ +## 2026-05-18 — [#861](https://github.com/cobaltcore-dev/cortex/pull/861) + +### cortex v0.0.55 (sha-3ec99921) + +Non-breaking changes: +- Quota: improve status completeness and observability ([#858](https://github.com/cobaltcore-dev/cortex/pull/858)) +- Fix: CR syncer uses units correctly ([#859](https://github.com/cobaltcore-dev/cortex/pull/859)) +- Make RAM unit per flavor group operator-configurable ([#860](https://github.com/cobaltcore-dev/cortex/pull/860)) + +### cortex-nova v0.0.68 (sha-3ec99921) + +Includes updated chart cortex v0.0.55. + +## 2026-05-18 — [#857](https://github.com/cobaltcore-dev/cortex/pull/857) + +### cortex v0.0.54 (sha-3981e731) + +Non-breaking changes: +- Add quota enforcement filter ([#855](https://github.com/cobaltcore-dev/cortex/pull/855)) +- Quota endpoint handles AZ without lighthouse cluster ([#856](https://github.com/cobaltcore-dev/cortex/pull/856)) +- Update external dependencies ([#852](https://github.com/cobaltcore-dev/cortex/pull/852)) +- Update `github.com/sapcc/go-bits` ([#851](https://github.com/cobaltcore-dev/cortex/pull/851)) + +### cortex-nova v0.0.67 (sha-3981e731) + +Includes updated chart cortex v0.0.54. + +## 2026-05-18 — [#850](https://github.com/cobaltcore-dev/cortex/pull/850) + +### cortex v0.0.53 (sha-aa518eb5) + +Non-breaking changes: +- LIQUID info: set `QuotaUpdateNeedsProjectMetadata` to true ([#849](https://github.com/cobaltcore-dev/cortex/pull/849)) + +### General + +Non-breaking changes: +- Remove non-docker tests from CI ([#834](https://github.com/cobaltcore-dev/cortex/pull/834)) + +### cortex-nova v0.0.66 (sha-aa518eb5) + +Includes updated chart cortex v0.0.53. 
+ +## 2026-05-13 — [#848](https://github.com/cobaltcore-dev/cortex/pull/848) + +### cortex v0.0.52 (sha-8dd7100b) + +Non-breaking changes: +- Fix: tolerate unreachable remote clusters during field index setup ([#844](https://github.com/cobaltcore-dev/cortex/pull/844)) +- Fix: CR API responses matching Limes validations ([#843](https://github.com/cobaltcore-dev/cortex/pull/843)) +- Fix: start API server only after cache sync to prevent startup race ([#836](https://github.com/cobaltcore-dev/cortex/pull/836)) +- Add Prometheus metrics and alerting for the committed resource ([#840](https://github.com/cobaltcore-dev/cortex/pull/840)) + +### cortex-nova v0.0.65 (sha-8dd7100b) + +Includes updated chart cortex v0.0.52. + +## 2026-05-12 — [#845](https://github.com/cobaltcore-dev/cortex/pull/845) + +### cortex v0.0.51 (sha-98597910) + +Non-breaking changes: +- Fix: CR API responses matching Limes validations ([#843](https://github.com/cobaltcore-dev/cortex/pull/843)) +- Fix: start API server only after cache sync to prevent startup race ([#836](https://github.com/cobaltcore-dev/cortex/pull/836)) +- Add Prometheus metrics and alerting for the committed resource ([#840](https://github.com/cobaltcore-dev/cortex/pull/840)) + +### cortex-nova v0.0.64 (sha-98597910) + +Includes updated chart cortex v0.0.51. + +## 2026-05-12 — [#842](https://github.com/cobaltcore-dev/cortex/pull/842) + +### cortex v0.0.50 (sha-c8663afb) + +Non-breaking changes: +- Fix(CR): align the RAM resource unit exposed to Limes for CR/quota based on fixed/varying ram/core ratio ([#841](https://github.com/cobaltcore-dev/cortex/pull/841)) + +### cortex-nova v0.0.63 (sha-c8663afb) + +Includes updated chart cortex v0.0.50. 
+ +## 2026-05-11 — [#839](https://github.com/cobaltcore-dev/cortex/pull/839) + +### cortex v0.0.49 (sha-b570ae10) + +Non-breaking changes: +- Track instance count (VM count) per project/AZ in quota ([#837](https://github.com/cobaltcore-dev/cortex/pull/837)) +- Fix: LIQUID API info and capacity endpoint bugs ([#838](https://github.com/cobaltcore-dev/cortex/pull/838)) +- Register ProjectQuota multicluster router in main ([#831](https://github.com/cobaltcore-dev/cortex/pull/831)) + +### cortex-nova v0.0.62 (sha-b570ae10) + +Includes updated chart cortex v0.0.49. + +## 2026-05-11 — [#830](https://github.com/cobaltcore-dev/cortex/pull/830) + +### cortex v0.0.48 (sha-86af7a6e) + +Non-breaking changes: +- Add CPU core committed resources — cores commitments use arithmetic headroom checks against `FlavorGroupCapacity.Status.TotalCapacity` instead of creating Reservation CRDs ([#826](https://github.com/cobaltcore-dev/cortex/pull/826)) +- Split `ProjectQuota` CRD into per-AZ CRDs (one CRD per project+AZ), add `AvailabilityZone` field to spec, and flatten status fields to per-AZ `map[string]int64` ([#827](https://github.com/cobaltcore-dev/cortex/pull/827)) +- Add `ProjectQuotaResourceRouter` for multicluster routing of `ProjectQuota` CRDs by availability zone ([#827](https://github.com/cobaltcore-dev/cortex/pull/827)) +- Add `FlavorGroupCapacityResourceRouter` for multicluster routing of `FlavorGroupCapacity` CRDs by availability zone ([#824](https://github.com/cobaltcore-dev/cortex/pull/824)) +- Register `FlavorGroupCapacity` router in manager's multicluster client ([#828](https://github.com/cobaltcore-dev/cortex/pull/828)) +- Add `TotalCapacity` field to `FlavorGroupCapacityStatus` for tracking total capacity of eligible hosts in an empty-datacenter scenario ([#826](https://github.com/cobaltcore-dev/cortex/pull/826)) +- Report capacity per resource type (RAM, cores, instances) in `report-capacity` endpoint instead of flat slot count 
([#826](https://github.com/cobaltcore-dev/cortex/pull/826)) +- Switch LIQUID API commitment unit from multiples of the smallest flavor's RAM to a fixed 1 GiB per unit ([#822](https://github.com/cobaltcore-dev/cortex/pull/822)) +- Rename resource types in project capacity metrics: `vcpu` → `cpu`, `memory` → `ram` to align with host capacity metrics ([#823](https://github.com/cobaltcore-dev/cortex/pull/823)) +- Rename metrics for unused VMware commitments to clarify they represent the unused portion, not total ([#820](https://github.com/cobaltcore-dev/cortex/pull/820)) +- Disable weighers in `vmware-hana-bin-packing` pipeline (filters-only mode) ([#816](https://github.com/cobaltcore-dev/cortex/pull/816)) +- Fix concurrency issue in CommittedResource CRD updates ([#825](https://github.com/cobaltcore-dev/cortex/pull/825)) +- Update `go.xyrillian.de/gg` v1.7.0 (renamed from `github.com/majewsky/gg`), `sapcc/go-api-declarations` v1.22.0, `sapcc/go-bits`, `openstack-hypervisor-operator` v1.2.2 ([#817](https://github.com/cobaltcore-dev/cortex/pull/817), [#818](https://github.com/cobaltcore-dev/cortex/pull/818)) +- Update `controller-gen` to v0.21.0 (CRD annotation bump) +- Update `actions/create-github-app-token` to v3 ([#819](https://github.com/cobaltcore-dev/cortex/pull/819)) +- Use beefy runner for CodeQL workflow + +### cortex-nova v0.0.61 (sha-86af7a6e) + +Includes updated chart cortex v0.0.48. 
+ +Non-breaking changes: +- Remove all weighers from `vmware-hana-bin-packing` pipeline template ([#816](https://github.com/cobaltcore-dev/cortex/pull/816)) + ## 2026-05-07 — [#814](https://github.com/cobaltcore-dev/cortex/pull/814) ### cortex v0.0.47 (sha-b8cecd0c) diff --git a/docs/guides/multicluster/readme.md b/docs/guides/multicluster/readme.md index ab9c9e60a..4a4a1977a 100644 --- a/docs/guides/multicluster/readme.md +++ b/docs/guides/multicluster/readme.md @@ -121,6 +121,27 @@ $(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /') EOF ``` +### Skipping TLS verification + +If a remote apiserver uses a CA certificate that rotates frequently and does not chain to a stable root, you can set `insecureSkipTLSVerify: true` instead of providing a `caCert`. When enabled, the `caCert` field is ignored and TLS certificate verification is skipped entirely for that remote. + +> [!WARNING] +> Setting `insecureSkipTLSVerify` disables all TLS certificate verification for the connection to that remote apiserver. This makes the connection susceptible to man-in-the-middle attacks. Only use this option when you cannot maintain a stable CA certificate and the network path to the remote apiserver is otherwise secured. 
+ +```yaml +global: + conf: + apiservers: + remotes: + - host: https://my-remote-apiserver:6443 + insecureSkipTLSVerify: true + gvks: + - kvm.cloud.sap/v1/Hypervisor + - kvm.cloud.sap/v1/HypervisorList + labels: + az: remote-az +``` + Additionally, we will add some hypervisors cortex can reconcile on: ```bash kubectl --context kind-cortex-remote-az-a apply -f docs/guides/multicluster/hypervisors-az-a.yaml diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index 4d96d43a6..7d80064b0 100644 --- a/docs/reservations/committed-resource-reservations.md +++ b/docs/reservations/committed-resource-reservations.md @@ -35,7 +35,7 @@ The CR reservation implementation is located in `internal/scheduling/reservation - Scheduling pipeline selection per flavor group - Per-flavor-group resource flags (`handlesCommitments`, `hasCapacity`, `hasQuota`) controlling which resource types are active for each group -**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml` with prefixes: +**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/templates/alerts.yaml` with prefixes: - `cortex_committed_resource_change_api_*` - `cortex_committed_resource_usage_api_*` - `cortex_committed_resource_capacity_api_*` diff --git a/go.mod b/go.mod index 8b9b1fafc..c1a2ef224 100644 --- a/go.mod +++ b/go.mod @@ -9,8 +9,8 @@ require ( github.com/ironcore-dev/ironcore v0.3.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/sapcc/go-bits v0.0.0-20260529151418-8d2b33444f03 - go.xyrillian.de/gg v1.7.0 + github.com/sapcc/go-bits v0.0.0-20260605140732-260f8544709d + go.xyrillian.de/gg v1.9.0 k8s.io/api v0.36.1 k8s.io/apimachinery v0.36.1 k8s.io/client-go v0.36.1 @@ -73,7 +73,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/lib/pq v1.12.3 - github.com/mattn/go-sqlite3 v1.14.44 + 
github.com/mattn/go-sqlite3 v1.14.45 github.com/moby/sys/user v0.4.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect @@ -85,7 +85,7 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/poy/onpar v0.3.5 // indirect - github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/common v0.68.1 // indirect github.com/prometheus/procfs v0.19.2 // indirect github.com/sapcc/go-api-declarations v1.22.0 github.com/sirupsen/logrus v1.9.3 // indirect @@ -105,12 +105,12 @@ require ( go.opentelemetry.io/proto/otlp v1.9.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.28.0 - go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v2 v2.4.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 // indirect golang.org/x/net v0.55.0 // indirect - golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/sync v0.20.0 golang.org/x/sys v0.45.0 // indirect golang.org/x/term v0.43.0 diff --git a/go.sum b/go.sum index 90991ed62..c772f5146 100644 --- a/go.sum +++ b/go.sum @@ -153,8 +153,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= -github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= -github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/mattn/go-sqlite3 v1.14.45 h1:6KA/spDguL3KV8rnybG7ezSaE4SeMR3KC9VbUoAQaIk= 
+github.com/mattn/go-sqlite3 v1.14.45/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= @@ -195,8 +195,8 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= -github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/common v0.68.1 h1:omjRRl4QP4komogpXuhfeOiisQg7xdy8VM1UY+pStaY= +github.com/prometheus/common v0.68.1/go.mod h1:ZzL3f6u94qUxh9p+tJTrF+FvBS1XXbbRAZCQkytAL0Y= github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= @@ -204,8 +204,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sapcc/go-api-declarations v1.22.0 h1:nU/eJ6OO54Z9YSo1gWinD0A2etrfZObCwYdB9xA0VWE= github.com/sapcc/go-api-declarations v1.22.0/go.mod h1:x3V8bzg7Y4kmbA+DeWWwKteFEdCCSiVQdwRXj4fGAYY= -github.com/sapcc/go-bits v0.0.0-20260529151418-8d2b33444f03 h1:ni4+0WYad/128i2s3lrE2wOwzu1BkhZpWq9ZgTDsjCk= -github.com/sapcc/go-bits v0.0.0-20260529151418-8d2b33444f03/go.mod h1:tlX0d8TvLgEikNWwFbB1SxnW0q/6XybpXjt8mr97Qzg= 
+github.com/sapcc/go-bits v0.0.0-20260605140732-260f8544709d h1:dS3ISuGS2pZZFDw1vcYfTBQN6YqhDtmeLe1meY7M3VM= +github.com/sapcc/go-bits v0.0.0-20260605140732-260f8544709d/go.mod h1:3p6IHviXqwVMgZoEjspIcsXGZv4tQLtLcu1uYu+ycjI= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -259,10 +259,10 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= -go.xyrillian.de/gg v1.7.0 h1:IA0BJaX9TtBD7crH+CSoK4lYmBk5zi7nUQd0YRzPNf0= -go.xyrillian.de/gg v1.7.0/go.mod h1:dj+ZhCwC6JKWyFvImhVNXQAErrRcYMUkXu6vwWYNrzQ= -go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= -go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.xyrillian.de/gg v1.9.0 h1:vszip+UjOBaczo/s9tr6Ij2eo39pxWlVZdbBcLkzXBM= +go.xyrillian.de/gg v1.9.0/go.mod h1:dj+ZhCwC6JKWyFvImhVNXQAErrRcYMUkXu6vwWYNrzQ= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= go4.org/netipx v0.0.0-20231129151722-fdeea329fbba h1:0b9z3AuHCjxk0x/opv64kcgZLBseWJUpBw5I82+2U4M= @@ -273,8 +273,8 @@ golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= golang.org/x/net v0.55.0/go.mod 
h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= -golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= -golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index f0c93337b..6d1e8cfb1 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml b/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml deleted file mode 100644 index 6684e3392..000000000 --- a/helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml +++ /dev/null @@ -1,260 +0,0 @@ -groups: -- name: cortex-cinder-alerts - rules: - - alert: CortexCinderSchedulingDown - expr: | - up{pod=~"cortex-cinder-scheduling-.*"} != 1 or - 
absent(up{pod=~"cortex-cinder-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Cinder is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Cinder will - not be served. This is no immediate problem, since Cinder will continue - placing new VMs. However, the placement will be less desirable. - - - alert: CortexCinderKnowledgeDown - expr: | - up{pod=~"cortex-cinder-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-cinder-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Cinder is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexCinderHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 400 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. 
- - - alert: CortexCinderSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Cinder Scheduler HTTP request 500 errors too high" - description: > - Cinder Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Cinder will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexCinderHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexCinderHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. 
- - - alert: CortexCinderTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexCinderSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexCinderSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexCinderDatasourceUnready - expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexCinderKnowledgeUnready - expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. 
- - - alert: CortexCinderDecisionsWithErrors - expr: cortex_decision_state{domain="cinder",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexCinderTooManyDecisionsWaiting - expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexCinderKPIUnready - expr: | - cortex_kpi_state{domain="cinder",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. 
- - - alert: CortexCinderPipelineUnready - expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. diff --git a/helm/bundles/cortex-cinder/templates/alerts.yaml b/helm/bundles/cortex-cinder/templates/alerts.yaml index 59496c33d..4beea8b53 100644 --- a/helm/bundles/cortex-cinder/templates/alerts.yaml +++ b/helm/bundles/cortex-cinder/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm. Prometheus templating directives +# (e.g. {{ "{{" }} $labels.foo {{ "}}" }}) must be escaped using Style B: +# replace the outer `{{` and `}}` with `{{ "{{" }}` and `{{ "}}" }}`. 
+ {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,264 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-cinder-alerts + rules: + - alert: CortexCinderSchedulingDown + expr: | + up{pod=~"cortex-cinder-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-cinder-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Cinder is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Cinder will + not be served. This is no immediate problem, since Cinder will continue + placing new volumes. However, the placement will be less desirable. + + - alert: CortexCinderKnowledgeDown + expr: | + up{pod=~"cortex-cinder-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-cinder-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Cinder is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexCinderHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 400 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexCinderSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-cinder-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Cinder Scheduler HTTP request 500 errors too high" + description: > + Cinder Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Cinder will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexCinderHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-cinder-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. 
Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexCinderHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-cinder-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexCinderTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-cinder-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexCinderSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-cinder-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-cinder-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexCinderSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-cinder-metrics"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. 
+ + - alert: CortexCinderDatasourceUnready + expr: cortex_datasource_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexCinderKnowledgeUnready + expr: cortex_knowledge_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexCinderDecisionsWithErrors + expr: cortex_decision_state{domain="cinder",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexCinderTooManyDecisionsWaiting + expr: cortex_decision_state{domain="cinder",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexCinderKPIUnready + expr: | + cortex_kpi_state{domain="cinder",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexCinderPipelineUnready + expr: cortex_pipeline_state{domain="cinder",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
{{- end }} diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index 6e329e03f..7fac28002 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 2857c7c40..182ee9ab9 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. 
This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml b/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml deleted file mode 100644 index 0c72d9a92..000000000 --- a/helm/bundles/cortex-ironcore/alerts/ironcore.alerts.yaml +++ /dev/null @@ -1,3 +0,0 @@ -groups: -- name: cortex-ironcore-alerts - rules: [] diff --git a/helm/bundles/cortex-ironcore/templates/alerts.yaml b/helm/bundles/cortex-ironcore/templates/alerts.yaml deleted file mode 100644 index ca27396a5..000000000 --- a/helm/bundles/cortex-ironcore/templates/alerts.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright SAP SE -# SPDX-License-Identifier: Apache-2.0 - -{{- if .Values.alerts.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: cortex-ironcore-alerts - labels: - type: alerting-rules - prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} -spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} -{{- end }} diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 42cdc2df0..a16950126 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. 
type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml b/helm/bundles/cortex-manila/alerts/manila.alerts.yaml deleted file mode 100644 index 2211d44fe..000000000 --- a/helm/bundles/cortex-manila/alerts/manila.alerts.yaml +++ /dev/null @@ -1,235 +0,0 @@ -groups: -- name: cortex-manila-alerts - rules: - - alert: CortexManilaSchedulingDown - expr: | - up{pod=~"cortex-manila-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-manila-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Manila is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Manila will - not be served. This is no immediate problem, since Manila will continue - placing new VMs. However, the placement will be less desirable. 
- - - alert: CortexManilaKnowledgeDown - expr: | - up{pod=~"cortex-manila-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-manila-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Manila is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexManilaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 400 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexManilaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/apierrors - annotations: - summary: "Manila Scheduler HTTP request 500 errors too high" - description: > - Manila Scheduler is responding to placement requests with HTTP 5xx errors. 
- This is not expected and indicates that Cortex is having some internal problem. - Manila will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexManilaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexManilaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. 
- - - alert: CortexManilaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexManilaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexManilaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexManilaDatasourceUnready - expr: cortex_datasource_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexManilaKnowledgeUnready - expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexManilaKPIUnready - expr: | - cortex_kpi_state{domain="manila",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexManilaPipelineUnready - expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0 - for: 5m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. 
diff --git a/helm/bundles/cortex-manila/templates/alerts.yaml b/helm/bundles/cortex-manila/templates/alerts.yaml index 1f25b0354..ef36fe983 100644 --- a/helm/bundles/cortex-manila/templates/alerts.yaml +++ b/helm/bundles/cortex-manila/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm. Prometheus templating directives +# (e.g. {{ "{{" }} $labels.foo {{ "}}" }}) must be escaped using Style B: +# replace the outer opening and closing double braces with `{{ "{{" }}` and `{{ "}}" }}`. + {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,239 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-manila-alerts + rules: + - alert: CortexManilaSchedulingDown + expr: | + up{pod=~"cortex-manila-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-manila-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Manila is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Manila will + not be served. This is no immediate problem, since Manila will continue + placing new shares. However, the placement will be less desirable.
+ + - alert: CortexManilaKnowledgeDown + expr: | + up{pod=~"cortex-manila-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-manila-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Manila is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. + + - alert: CortexManilaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors + annotations: + summary: "Manila Scheduler HTTP request 400 errors too high" + description: > + Manila Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexManilaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-manila-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/apierrors + annotations: + summary: "Manila Scheduler HTTP request 500 errors too high" + description: > + Manila Scheduler is responding to placement requests with HTTP 5xx errors. 
+ This is not expected and indicates that Cortex is having some internal problem. + Manila will continue to place new shares, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexManilaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-manila-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexManilaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-manila-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly.
+ + - alert: CortexManilaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-manila-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. + + - alert: CortexManilaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-manila-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-manila-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. 
+ + - alert: CortexManilaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-manila-metrics"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexManilaDatasourceUnready + expr: cortex_datasource_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. 
+ + - alert: CortexManilaKnowledgeUnready + expr: cortex_knowledge_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexManilaKPIUnready + expr: | + cortex_kpi_state{domain="manila",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexManilaPipelineUnready + expr: cortex_pipeline_state{domain="manila",state!="ready"} != 0 + for: 5m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
{{- end }} diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index 35989568d..d6d9057a6 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,7 +5,7 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres @@ -16,12 +16,12 @@ dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml deleted file mode 100644 index 46e93ef05..000000000 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ /dev/null @@ -1,609 +0,0 @@ -groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. 
- - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. This is expected when the scheduling request cannot be served - by Cortex. 
However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/api-errors - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. 
- - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/deployment - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/database - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. 
This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. - - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. 
- - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. - - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. 
- - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. - - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/unready - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/scheduling - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing new faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. 
- - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/reconciles - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/datasources - annotations: 
- summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. - - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
- - # Committed Resource Info API - - alert: CortexNovaCommittedResourceInfoUnavailable - expr: | - rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource info API is unavailable" - description: > - The committed resource info API (Limes LIQUID integration) has been returning - 503 Service Unavailable for more than 5 minutes. This typically means the - flavor group knowledge CRD is not ready or missing. Limes cannot discover - available committed resources until the issue is resolved. - - # Committed Resource Change API - - alert: CortexNovaCommittedResourceChangeErrors - expr: | - rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource change API HTTP 5xx errors" - description: > - The committed resource change API (Limes LIQUID integration) is returning - HTTP 5xx errors. This is not expected and indicates an internal problem - processing commitment changes. Limes will retry, but new commitments may - not be fulfilled until the issue is resolved. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - ( - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) - ) > 0.3 - and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 - for: 15m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource rejection rate too high ({{ $value | humanizePercentage }})" - description: > - More than 30% of commitment changes have been rejected over the last 15 minutes. - This may indicate insufficient capacity to fulfill new commitments. Rejected - commitments are rolled back. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 - for: 1m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API timeout detected" - description: > - A commitment change request timed out after the configured deadline. - Timeouts indicate the scheduling pipeline could not place reservations in time. - Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
- - - alert: CortexNovaCommittedResourceChangeLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 - and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-performance - annotations: - summary: "Committed Resource change API p95 latency >= 10s" - description: > - The committed resource change API p95 latency has reached or exceeded 10 seconds, - approaching the configured watch timeout. Requests close to the timeout are at risk - of being rolled back. Investigate scheduler performance or reservation backlog. - - # Committed Resource Capacity API - - alert: CortexNovaCommittedResourceCapacityErrors - expr: | - rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity API HTTP 5xx errors" - description: > - The committed resource capacity API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems calculating cluster capacity. - Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityDroppedToZero - expr: | - (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) - and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity - annotations: - summary: "Committed Resource capacity for {{ $labels.resource }} in {{ $labels.az }} dropped to zero" - description: > - The reported capacity for committed resource {{ $labels.resource }} in - availability zone {{ $labels.az }} has dropped from a positive value to zero. - This may mean hypervisors in that AZ are fully utilized for the corresponding - flavor group and no further committed resources can be placed there. - - # Committed Resource Usage API - - alert: CortexNovaCommittedResourceUsageErrors - expr: | - rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource usage API HTTP 5xx errors" - description: > - The committed resource usage API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems fetching reservation or - Nova server data. Limes may receive stale or incomplete usage data. 
- - # Committed Resource Quota API - - alert: CortexNovaCommittedResourceQuotaErrors - expr: | - rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors - annotations: - summary: "Committed Resource quota API HTTP 5xx errors" - description: > - The committed resource quota API (Limes LIQUID integration) is returning - HTTP 5xx errors. This indicates internal problems computing or applying - quota. Limes may not be able to enforce committed resource quotas. diff --git a/helm/bundles/cortex-nova/templates/alerts.yaml b/helm/bundles/cortex-nova/templates/alerts.yaml index d2964e864..6f3fabef2 100644 --- a/helm/bundles/cortex-nova/templates/alerts.yaml +++ b/helm/bundles/cortex-nova/templates/alerts.yaml @@ -1,6 +1,10 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 +# NOTE: This file is rendered by Helm. Prometheus templating directives +# (e.g. {{ "{{" }} $labels.foo {{ "}}" }}) must be escaped using Style B: +# replace the outer `{{` and `}}` with `{{ "{{" }}` and `{{ "}}" }}`. 
+ {{- if .Values.alerts.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -10,8 +14,615 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: {{ if .Values.kvm.enabled }}critical{{ else }}warning{{ end }} + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + for: 5m + labels: + context: descheduler + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. 
+ + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/api-errors + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. + + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > {{ .Values.alerts.thresholds.highMemoryMiB }} * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much memory" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than {{ .Values.alerts.thresholds.highMemoryMiB }} MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. 
+ + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/deployment + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` uses too much CPU" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/database + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` Sync not successful" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` experienced an issue syncing data from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "`{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any new data from `{{ "{{" }} $labels.datasource {{ "}}" }}`" + description: > + `{{ "{{" }} $labels.component {{ "}}" }}` is not syncing any objects from the datasource `{{ "{{" }} $labels.datasource {{ "}}" }}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. 
However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. + + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Knowledge `{{ "{{" }} $labels.knowledge {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. 
+ It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{ "{{" }} $labels.operator {{ "}}" }}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "KPI `{{ "{{" }} $labels.kpi {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/unready + annotations: + summary: "Pipeline `{{ "{{" }} $labels.pipeline {{ "}}" }}` is in `{{ "{{" }} $labels.state {{ "}}" }}` state" + description: > + This may indicate issues with the pipeline + configuration. 
It is recommended to investigate the + pipeline status and logs for more details. + + {{- if .Values.kvm.enabled }} + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (increase(cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*",faultmsg!~".*No such host.*"}[5m])) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/scheduling + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing new faulty vms in `{{ "{{" }} $labels.az {{ "}}" }}` where Nova scheduling + failed to find a valid `{{ "{{" }} $labels.hvtype {{ "}}" }}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. + {{- end }} + + - alert: CortexNovaNewDatasourcesNotReconciling + expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 + for: 60m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "New datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has not reconciled" + description: > + A new datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been added but has not + completed its first reconciliation yet. This may indicate issues with + the datasource controller's workqueue overprioritizing other datasources. 
+ + - alert: CortexNovaExistingDatasourcesLackingBehind + expr: | + sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 + and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 + for: 10m + labels: + context: datasources + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` is lagging behind" + description: > + An existing datasource `{{ "{{" }} $labels.datasource {{ "}}" }}` has been queued for + reconciliation for more than 10 minutes. This may indicate issues with + the datasource controller's workqueue or that this or another datasource + is taking an unusually long time to reconcile. + + - alert: CortexNovaReconcileErrorsHigh + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-errors + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconcile error rate >10%" + description: > + More than 10% of controller reconciles are resulting in errors. This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources.
+ + - alert: CortexNovaReconcileDurationHigher10Min + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > {{ .Values.alerts.thresholds.reconcileDurationSeconds }} + for: 15m + labels: + context: controller-duration + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/reconciles + annotations: + summary: "Controller reconciliation takes longer than ({{ "{{" }} $value | humanizeDuration {{ "}}" }})" + description: "Reconcile duration higher than {{ .Values.alerts.thresholds.reconcileDurationSeconds }}s while reconciling {{ "{{" }} $labels.controller {{ "}}" }}" + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/datasources + annotations: + summary: "Controller {{ "{{" }} $labels.name {{ "}}" }}'s backlog is not being drained." + description: > + The workqueue for controller {{ "{{" }} $labels.name {{ "}}" }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details.
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} latency is high" + description: > + The latency for webhook {{ "{{" }} $labels.webhook {{ "}}" }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ "{{" }} $labels.webhook {{ "}}" }} is experiencing errors" + description: > + The webhook {{ "{{" }} $labels.webhook {{ "}}" }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
+ + # Committed Resource Info API + - alert: CortexNovaCommittedResourceInfoUnavailable + expr: | + rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code="503"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource info API is unavailable" + description: > + The committed resource info API (Limes LIQUID integration) has been returning + 503 Service Unavailable for more than 5 minutes. This typically means the + flavor group knowledge CRD is not ready or missing. Limes cannot discover + available committed resources until the issue is resolved. + + # Committed Resource Change API + - alert: CortexNovaCommittedResourceChangeErrors + expr: | + rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource change API HTTP 5xx errors" + description: > + The committed resource change API (Limes LIQUID integration) is returning + HTTP 5xx errors. This is not expected and indicates an internal problem + processing commitment changes. Limes will retry, but new commitments may + not be fulfilled until the issue is resolved. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + ( + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected", dry_run="false"}[15m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) + ) > 0.3 + and on() sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", dry_run="false"}[15m])) > 0 + for: 15m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource rejection rate too high ({{ "{{" }} $value | humanizePercentage {{ "}}" }})" + description: > + More than 30% of commitment changes have been rejected over the last 15 minutes. + This may indicate insufficient capacity to fulfill new commitments. Rejected + commitments are rolled back. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics", dry_run="false"}[10m]) > 0 + for: 1m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API timeout detected" + description: > + A commitment change request timed out after the configured deadline. + Timeouts indicate the scheduling pipeline could not place reservations in time. + Affected changes are rolled back. Investigate scheduler performance or reservation backlog. 
+ + - alert: CortexNovaCommittedResourceChangeLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics", dry_run="false"}[5m])) by (le)) >= 10 + and on() sum(rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", dry_run="false"}[5m])) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-performance + annotations: + summary: "Committed Resource change API p95 latency >= 10s" + description: > + The committed resource change API p95 latency has reached or exceeded 10 seconds, + approaching the configured watch timeout. Requests close to the timeout are at risk + of being rolled back. Investigate scheduler performance or reservation backlog. + + # Committed Resource Capacity API + - alert: CortexNovaCommittedResourceCapacityErrors + expr: | + rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity API HTTP 5xx errors" + description: > + The committed resource capacity API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems calculating cluster capacity. + Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityDroppedToZero + expr: | + (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-capacity + annotations: + summary: "Committed Resource capacity for {{ "{{" }} $labels.resource {{ "}}" }} in {{ "{{" }} $labels.az {{ "}}" }} dropped to zero" + description: > + The reported capacity for committed resource {{ "{{" }} $labels.resource {{ "}}" }} in + availability zone {{ "{{" }} $labels.az {{ "}}" }} has dropped from a positive value to zero. + This may mean hypervisors in that AZ are fully utilized for the corresponding + flavor group and no further committed resources can be placed there. + + # Committed Resource Usage API + - alert: CortexNovaCommittedResourceUsageErrors + expr: | + rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource usage API HTTP 5xx errors" + description: > + The committed resource usage API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems fetching reservation or + Nova server data. Limes may receive stale or incomplete usage data. 
+ + # Committed Resource Quota API + - alert: CortexNovaCommittedResourceQuotaErrors + expr: | + rate(cortex_committed_resource_quota_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex-status-dashboard/cortex-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/committed-resource-api-errors + annotations: + summary: "Committed Resource quota API HTTP 5xx errors" + description: > + The committed resource quota API (Limes LIQUID integration) is returning + HTTP 5xx errors. This indicates internal problems computing or applying + quota. Limes may not be able to enforce committed resource quotas. {{- end }} diff --git a/helm/bundles/cortex-nova/templates/datasources_kvm.yaml b/helm/bundles/cortex-nova/templates/datasources_kvm.yaml index c2d78a3ac..8d7765b2a 100644 --- a/helm/bundles/cortex-nova/templates/datasources_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/datasources_kvm.yaml @@ -22,7 +22,7 @@ spec: alias: kvm_libvirt_domain_steal_pct # This metric is exported by https://github.com/cobaltcore-dev/kvm-monitoring query: | - max by (domain) (rate(kvm_domain_libvirt_vcpu_delay_nanoseconds[5m])) / 1e9 * 100 + max by (domain) (rate(kvm_domain_libvirt_vcpu_delay_nanoseconds_total[5m])) / 1e9 * 100 type: kvm_libvirt_domain_metric # It's ok to only look at a short time period here. 
timeRange: "1200s" # 20 minutes diff --git a/helm/bundles/cortex-nova/templates/kpis_kvm.yaml b/helm/bundles/cortex-nova/templates/kpis_kvm.yaml index 48b9eb155..10ff5c45f 100644 --- a/helm/bundles/cortex-nova/templates/kpis_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/kpis_kvm.yaml @@ -26,4 +26,19 @@ spec: - name: nova-servers - name: nova-flavors - name: identity-projects + - name: identity-domains +--- +apiVersion: cortex.cloud/v1alpha1 +kind: KPI +metadata: + name: kvm-hana-stacking +spec: + schedulingDomain: nova + impl: kvm_hana_stacking_kpi + dependencies: + datasources: + - name: nova-servers + - name: nova-flavors + - name: identity-projects + - name: identity-domains {{- end }} \ No newline at end of file diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 4a194ae50..273f09dda 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -17,6 +17,11 @@ owner-info: alerts: enabled: true prometheus: openstack + thresholds: + # Memory threshold for CortexNovaHighMemoryUsage in MiB. + highMemoryMiB: 6000 + # Reconcile-duration threshold for CortexNovaReconcileDurationHigher10Min in seconds. 
+ reconcileDurationSeconds: 600 serviceMonitor: extraLabels: {} @@ -263,6 +268,8 @@ cortex-knowledge-controllers: labels: <<: *cortexMonitoringLabels component: nova-knowledge + openstackDatasourceControllerParallelReconciles: 3 + prometheusDatasourceControllerParallelReconciles: 2 enabledControllers: - datasource-controllers - knowledge-controllers diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml deleted file mode 100644 index e65b944d6..000000000 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ /dev/null @@ -1,179 +0,0 @@ -groups: -- name: cortex-placement-shim-alerts - rules: - # Liveness - - alert: CortexPlacementShimDown - expr: | - up{pod=~"cortex-placement-shim-.*"} != 1 or - absent(up{pod=~"cortex-placement-shim-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-down - annotations: - summary: "Cortex Placement Shim is down" - description: > - The Cortex Placement Shim is down. Placement API requests that are - routed through the shim will not be served. OpenStack services relying - on the shim for resource provider lookups and allocation candidates - will degrade. 
- - # Downstream HTTP errors (client -> shim) - - alert: CortexPlacementShimDownstreamHttp400sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 4xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 4xx - errors at a sustained rate. This may indicate that the request format - from OpenStack services has changed, authentication tokens are invalid, - or the shim is rejecting malformed requests. Investigate the shim logs - for details on which endpoints and request patterns are affected. - - - alert: CortexPlacementShimDownstreamHttp500sTooHigh - expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream HTTP 5xx errors too high" - description: > - The Placement Shim is responding to client requests with HTTP 5xx - errors. This indicates internal problems within the shim such as - handler panics or misconfiguration. OpenStack services may experience - degraded placement functionality until the issue is resolved. 
- - # Upstream HTTP errors (shim -> Placement API) - - alert: CortexPlacementShimUpstreamHttp5xxTooHigh - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream HTTP 5xx errors too high" - description: > - The upstream Placement API is returning 5xx errors to the shim. - This indicates the OpenStack Placement service itself is having - problems. The shim forwards these errors to its clients. Investigate - the Placement API service health and logs. - - - alert: CortexPlacementShimUpstreamUnreachable - expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim cannot reach the upstream Placement API" - description: > - The Placement Shim is unable to reach the upstream OpenStack Placement - API and is returning 502 Bad Gateway errors. This means all forwarded - requests are failing. Check network connectivity, the Placement API - service endpoint configuration, and whether the upstream service is - running. 
- - # Latency alerts - - alert: CortexPlacementShimDownstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: api - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim downstream latency too high" - description: > - The Placement Shim downstream request latency (p95) exceeds 10 - seconds. This affects all OpenStack services making placement - requests through the shim. The cause may be slow upstream responses, - shim processing overhead, or resource contention. Investigate both - shim and upstream Placement API performance. - - - alert: CortexPlacementShimUpstreamLatencyTooHigh - expr: | - histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 - and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 - for: 5m - labels: - context: upstream - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-api-errors - annotations: - summary: "Placement Shim upstream latency too high" - description: > - The upstream Placement API response latency (p95) as seen by the - shim exceeds 10 seconds. This directly impacts the end-to-end - latency of placement requests. 
Investigate the upstream Placement - API performance and network conditions. - - # Resource usage - - alert: CortexPlacementShimHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-resource-usage - annotations: - summary: "Placement Shim uses too much memory" - description: > - The Placement Shim is using more than 1500 MiB of resident memory - against a limit of 2048 MiB. This may indicate a memory leak, a - large number of cached hypervisors, or unexpected request patterns. - If the usage continues to grow, the pod will be OOM-killed. - - - alert: CortexPlacementShimHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4 - for: 5m - labels: - context: cpu - dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/alerts/shim-resource-usage - annotations: - summary: "Placement Shim uses too much CPU" - description: > - The Placement Shim is consuming more than 40% of a single CPU core - against a limit of 500m. Under normal operation the shim should use - much less since it primarily proxies requests. This may indicate a - hot loop, excessive logging, or an unusual traffic spike. 
- diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml index 7db3b96e6..c570ccd91 100644 --- a/helm/bundles/cortex-placement-shim/templates/alerts.yaml +++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml @@ -10,8 +10,182 @@ metadata: type: alerting-rules prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} spec: - {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} - {{- range $path, $file := $files }} - {{ $file | toString | nindent 2 }} - {{- end }} + groups: + - name: cortex-placement-shim-alerts + rules: + # Liveness + - alert: CortexPlacementShimDown + expr: | + up{pod=~"cortex-placement-shim-.*"} != 1 or + absent(up{pod=~"cortex-placement-shim-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-down + annotations: + summary: "Cortex Placement Shim is down" + description: > + The Cortex Placement Shim is down. Placement API requests that are + routed through the shim will not be served. OpenStack services relying + on the shim for resource provider lookups and allocation candidates + will degrade. 
+ + # Downstream HTTP errors (client -> shim) + - alert: CortexPlacementShimDownstreamHttp400sTooHigh + expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim downstream HTTP 4xx errors too high" + description: > + The Placement Shim is responding to client requests with HTTP 4xx + errors at a sustained rate. This may indicate that the request format + from OpenStack services has changed, authentication tokens are invalid, + or the shim is rejecting malformed requests. Investigate the shim logs + for details on which endpoints and request patterns are affected. + + - alert: CortexPlacementShimDownstreamHttp500sTooHigh + expr: rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim downstream HTTP 5xx errors too high" + description: > + The Placement Shim is responding to client requests with HTTP 5xx + errors. This indicates internal problems within the shim such as + handler panics or misconfiguration. OpenStack services may experience + degraded placement functionality until the issue is resolved. 
+ + # Upstream HTTP errors (shim -> Placement API) + - alert: CortexPlacementShimUpstreamHttp5xxTooHigh + expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: upstream + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim upstream HTTP 5xx errors too high" + description: > + The upstream Placement API is returning 5xx errors to the shim. + This indicates the OpenStack Placement service itself is having + problems. The shim forwards these errors to its clients. Investigate + the Placement API service health and logs. + + - alert: CortexPlacementShimUpstreamUnreachable + expr: rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service", responsecode="502"}[5m]) > 0.1 + for: 5m + labels: + context: upstream + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim cannot reach the upstream Placement API" + description: > + The Placement Shim is unable to reach the upstream OpenStack Placement + API and is returning 502 Bad Gateway errors. This means all forwarded + requests are failing. Check network connectivity, the Placement API + service endpoint configuration, and whether the upstream service is + running. 
+ + # Latency alerts + - alert: CortexPlacementShimDownstreamLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_placement_shim_downstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 + and on() sum(rate(cortex_placement_shim_downstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 + for: 5m + labels: + context: api + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim downstream latency too high" + description: > + The Placement Shim downstream request latency (p95) exceeds 10 + seconds. This affects all OpenStack services making placement + requests through the shim. The cause may be slow upstream responses, + shim processing overhead, or resource contention. Investigate both + shim and upstream Placement API performance. + + - alert: CortexPlacementShimUpstreamLatencyTooHigh + expr: | + histogram_quantile(0.95, sum(rate(cortex_placement_shim_upstream_request_duration_seconds_bucket{service="cortex-placement-shim-metrics-service"}[5m])) by (le)) > 10 + and on() sum(rate(cortex_placement_shim_upstream_request_duration_seconds_count{service="cortex-placement-shim-metrics-service"}[5m])) > 0 + for: 5m + labels: + context: upstream + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-api-errors + annotations: + summary: "Placement Shim upstream latency too high" + description: > + The upstream Placement API response latency (p95) as seen by the + shim exceeds 10 seconds. This directly impacts the end-to-end + latency of placement requests. 
Investigate the upstream Placement + API performance and network conditions. + + # Resource usage + - alert: CortexPlacementShimHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-placement-shim-metrics-service"} > 1500 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-resource-usage + annotations: + summary: "Placement Shim uses too much memory" + description: > + The Placement Shim is using more than 1500 MiB of resident memory + against a limit of 2048 MiB. This may indicate a memory leak, a + large number of cached hypervisors, or unexpected request patterns. + If the usage continues to grow, the pod will be OOM-killed. + + - alert: CortexPlacementShimHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-placement-shim-metrics-service"}[1m]) > 0.4 + for: 5m + labels: + context: cpu + dashboard: cortex-placement-shim-status-dashboard/cortex-placement-shim-status-dashboard + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/alerts/shim-resource-usage + annotations: + summary: "Placement Shim uses too much CPU" + description: > + The Placement Shim is consuming more than 40% of a single CPU core + against a limit of 500m. Under normal operation the shim should use + much less since it primarily proxies requests. This may indicate a + hot loop, excessive logging, or an unusual traffic spike. {{- end }} diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index 661718a4a..8732a13e4 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. 
type: application -version: 0.0.73 +version: 0.0.74 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.60 + version: 0.1.0 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/dev/cortex-prometheus-operator/Chart.yaml b/helm/dev/cortex-prometheus-operator/Chart.yaml index 05ec29eeb..c521f0a8a 100644 --- a/helm/dev/cortex-prometheus-operator/Chart.yaml +++ b/helm/dev/cortex-prometheus-operator/Chart.yaml @@ -10,4 +10,4 @@ dependencies: # CRDs of the prometheus operator, such as PrometheusRule, ServiceMonitor, etc. - name: kube-prometheus-stack repository: oci://ghcr.io/prometheus-community/charts - version: 84.5.0 + version: 86.2.0 diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index 0fe64530a..f30090695 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. 
type: application -version: 0.0.60 -appVersion: "sha-12c6f24d" +version: 0.1.0 +appVersion: "sha-eb8f3fc3" icon: "https://example.com/icon.png" dependencies: [] diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking.go new file mode 100644 index 000000000..4db427916 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking.go @@ -0,0 +1,117 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "context" + "log/slog" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" + "github.com/cobaltcore-dev/cortex/pkg/conf" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const hanaKVMFlavorPattern = "hana_k_%" + +type kvmHanaStackingRow struct { + ProjectID string `db:"project_id"` + ProjectName string `db:"project_name"` + DomainID string `db:"domain_id"` + DomainName string `db:"domain_name"` + ComputeHost string `db:"compute_host"` + TotalRAMMB float64 `db:"total_ram_mb"` +} + +type KVMHanaStackingKPI struct { + plugins.BaseKPI[struct{}] + ramPerProjectAndHost *prometheus.Desc +} + +func (k *KVMHanaStackingKPI) GetName() string { + return "kvm_hana_stacking_kpi" +} + +func (k *KVMHanaStackingKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error { + if err := k.BaseKPI.Init(dbConn, c, opts); err != nil { + return err + } + k.ramPerProjectAndHost = prometheus.NewDesc( + "cortex_kvm_hana_stacking_ram_bytes", + "Total RAM in bytes used by HANA instances of a project on a KVM hypervisor.", + append(kvmHostLabels, 
"project_id", "project_name", "domain_id", "domain_name"), nil, + ) + return nil +} + +func (k *KVMHanaStackingKPI) Describe(ch chan<- *prometheus.Desc) { + ch <- k.ramPerProjectAndHost +} + +func (k *KVMHanaStackingKPI) Collect(ch chan<- prometheus.Metric) { + hosts, err := k.getKVMHosts() + if err != nil { + slog.Error("kvm_hana_stacking: failed to get KVM hosts", "error", err) + return + } + + rows, err := k.queryHanaStacking() + if err != nil { + slog.Error("kvm_hana_stacking: failed to query HANA stacking", "error", err) + return + } + + for _, row := range rows { + host, ok := hosts[row.ComputeHost] + if !ok { + slog.Warn("kvm_hana_stacking: compute host not found", "compute_host", row.ComputeHost) + continue + } + hostLabels := host.getHostLabels() + hostLabels = append(hostLabels, row.ProjectID, row.ProjectName, row.DomainID, row.DomainName) + ch <- prometheus.MustNewConstMetric(k.ramPerProjectAndHost, prometheus.GaugeValue, row.TotalRAMMB*1024*1024, hostLabels...) + } +} + +func (k *KVMHanaStackingKPI) getKVMHosts() (map[string]kvmHost, error) { + hvs := &hv1.HypervisorList{} + if err := k.Client.List(context.Background(), hvs); err != nil { + return nil, err + } + hosts := make(map[string]kvmHost, len(hvs.Items)) + for _, hv := range hvs.Items { + host := kvmHost{Hypervisor: hv} + hosts[host.Name] = host + } + return hosts, nil +} + +func (k *KVMHanaStackingKPI) queryHanaStacking() ([]kvmHanaStackingRow, error) { + query := ` + SELECT + s.tenant_id AS project_id, + COALESCE(p.name, '') AS project_name, + COALESCE(p.domain_id, '') AS domain_id, + COALESCE(d.name, '') AS domain_name, + s.os_ext_srv_attr_host AS compute_host, + COALESCE(SUM(f.ram), 0) AS total_ram_mb + FROM ` + nova.Server{}.TableName() + ` s + LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON s.flavor_name = f.name + LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id + LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id + WHERE s.status NOT IN 
('DELETED', 'ERROR') + AND s.os_ext_srv_attr_host LIKE '` + kvmComputeHostPattern + `' + AND s.flavor_name LIKE '` + hanaKVMFlavorPattern + `' + GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host + ` + var rows []kvmHanaStackingRow + if _, err := k.DB.Select(&rows, query); err != nil { + return nil, err + } + return rows, nil +} diff --git a/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking_test.go b/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking_test.go new file mode 100644 index 000000000..dc7892e21 --- /dev/null +++ b/internal/knowledge/kpis/plugins/infrastructure/kvm_hana_stacking_test.go @@ -0,0 +1,413 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package infrastructure + +import ( + "reflect" + "testing" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB "github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" + "github.com/cobaltcore-dev/cortex/pkg/conf" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + prometheusgo "github.com/prometheus/client_model/go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func hanaStackingMetric(computeHost, az, projectID, projectName, domainID, domainName string, value float64) collectedKVMMetric { + labels := mockKVMHostLabels(computeHost, az) + labels["project_id"] = projectID + labels["project_name"] = projectName + labels["domain_id"] = domainID + labels["domain_name"] = domainName + return collectedKVMMetric{Name: "cortex_kvm_hana_stacking_ram_bytes", Labels: labels, Value: value} +} + +func setupHanaStackingDB(t *testing.T, servers []nova.Server, projects []identity.Project, domains []identity.Domain, flavors []nova.Flavor) db.DB { + t.Helper() + dbEnv := 
testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + t.Cleanup(dbEnv.Close) + + if err := testDB.CreateTable( + testDB.AddTable(nova.Server{}), + testDB.AddTable(identity.Project{}), + testDB.AddTable(identity.Domain{}), + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatalf("failed to create tables: %v", err) + } + + var mockData []any + for i := range servers { + mockData = append(mockData, &servers[i]) + } + for i := range projects { + mockData = append(mockData, &projects[i]) + } + for i := range domains { + mockData = append(mockData, &domains[i]) + } + for i := range flavors { + mockData = append(mockData, &flavors[i]) + } + if len(mockData) > 0 { + if err := testDB.Insert(mockData...); err != nil { + t.Fatalf("expected no error inserting data, got %v", err) + } + } + return testDB +} + +func TestKVMHanaStackingKPI_Init(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + testDB := db.DB{DbMap: dbEnv.DbMap} + defer dbEnv.Close() + kpi := &KVMHanaStackingKPI{} + if err := kpi.Init(&testDB, nil, conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} + +func TestKVMHanaStackingKPI_queryHanaStacking(t *testing.T) { + tests := []struct { + name string + servers []nova.Server + projects []identity.Project + domains []identity.Domain + flavors []nova.Flavor + expected map[string]kvmHanaStackingRow + }{ + { + name: "single HANA instance", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-0"}}, + domains: []identity.Domain{{ID: "domain-0", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-0", DomainName: "Domain One", ComputeHost: "node001-bb01", 
TotalRAMMB: 1638400}, + }, + }, + { + name: "multiple HANA instances same project and host are aggregated", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_large", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_medium", RAM: 1638400}, + {ID: "f2", Name: "hana_k_large", RAM: 3276800}, + }, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 4915200}, + }, + }, + { + name: "multiple projects on different hosts", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "hana_k_large", Status: "ACTIVE"}, + }, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_medium", RAM: 1638400}, + {ID: "f2", Name: "hana_k_large", RAM: 3276800}, + }, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + "project-2|node002-bb02": {ProjectID: "project-2", ProjectName: "Project Two", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node002-bb02", TotalRAMMB: 3276800}, + }, + }, + 
{ + name: "non-HANA flavor instances are excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "m1_k_large", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_medium", RAM: 1638400}, + {ID: "f2", Name: "m1_k_large", RAM: 65536}, + }, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + }, + }, + { + name: "non-KVM host instances are excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "nova-compute-1", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + }, + }, + { + name: "DELETED and ERROR instances are excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "DELETED"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ERROR"}, + {ID: "s3", TenantID: "project-1", 
OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + }, + }, + { + name: "no instances returns empty result", + servers: []nova.Server{}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{}, + }, + { + name: "missing flavor entry results in zero RAM", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_unknown", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-1", DomainName: "Domain One", ComputeHost: "node001-bb01", TotalRAMMB: 0}, + }, + }, + { + name: "missing project entry results in empty strings", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", 
ProjectName: "", DomainID: "", DomainName: "", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + }, + }, + { + name: "project with unknown domain results in empty domain name", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-unknown"}}, + domains: []identity.Domain{}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + expected: map[string]kvmHanaStackingRow{ + "project-1|node001-bb01": {ProjectID: "project-1", ProjectName: "Project One", DomainID: "domain-unknown", DomainName: "", ComputeHost: "node001-bb01", TotalRAMMB: 1638400}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testDB := setupHanaStackingDB(t, tt.servers, tt.projects, tt.domains, tt.flavors) + + kpi := &KVMHanaStackingKPI{} + if err := kpi.Init(&testDB, buildKVMHypervisorClient(t, []hv1.Hypervisor{}).Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + + rows, err := kpi.queryHanaStacking() + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + if len(rows) != len(tt.expected) { + t.Fatalf("expected %d rows, got %d", len(tt.expected), len(rows)) + } + for _, got := range rows { + key := got.ProjectID + "|" + got.ComputeHost + exp, ok := tt.expected[key] + if !ok { + t.Errorf("unexpected row for key %q: %+v", key, got) + continue + } + if got != exp { + t.Errorf("row mismatch for key %q: expected %+v, got %+v", key, exp, got) + } + } + }) + } +} + +func TestKVMHanaStackingKPI_Collect(t *testing.T) { + tests := []struct { + name string + servers []nova.Server + projects []identity.Project + domains []identity.Domain + flavors []nova.Flavor + hypervisors []hv1.Hypervisor + expectedMetrics []collectedKVMMetric + }{ + { + name: "single HANA instance produces one RAM metric", + servers: 
[]nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-0"}}, + domains: []identity.Domain{{ID: "domain-0", Name: "Domain Zero"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + hanaStackingMetric("node001-bb01", "az1", "project-1", "Project One", "domain-0", "Domain Zero", 1638400*1024*1024), + }, + }, + { + name: "compute_host not in hypervisor list produces no metric", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + hypervisors: []hv1.Hypervisor{}, + expectedMetrics: []collectedKVMMetric{}, + }, + { + name: "only HANA flavors are counted, non-HANA on same host excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "m1_k_large", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_medium", RAM: 1638400}, + {ID: "f2", Name: "m1_k_large", RAM: 65536}, + }, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: 
map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + hanaStackingMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", 1638400*1024*1024), + }, + }, + { + name: "DELETED and ERROR instances are excluded", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "DELETED"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ERROR"}, + {ID: "s3", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + }, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + hanaStackingMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", 1638400*1024*1024), + }, + }, + { + name: "multiple projects on multiple hosts", + servers: []nova.Server{ + {ID: "s1", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s2", TenantID: "project-1", OSEXTSRVATTRHost: "node001-bb01", FlavorName: "hana_k_medium", Status: "ACTIVE"}, + {ID: "s3", TenantID: "project-2", OSEXTSRVATTRHost: "node002-bb02", FlavorName: "hana_k_large", Status: "ACTIVE"}, + }, + projects: []identity.Project{ + {ID: "project-1", Name: "Project One", DomainID: "domain-1"}, + {ID: "project-2", Name: "Project Two", DomainID: "domain-1"}, + }, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{ + {ID: "f1", Name: "hana_k_medium", RAM: 1638400}, + {ID: "f2", 
Name: "hana_k_large", RAM: 3276800}, + }, + hypervisors: []hv1.Hypervisor{ + {ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01", Labels: map[string]string{"topology.kubernetes.io/zone": "az1"}}}, + {ObjectMeta: metav1.ObjectMeta{Name: "node002-bb02", Labels: map[string]string{"topology.kubernetes.io/zone": "az2"}}}, + }, + expectedMetrics: []collectedKVMMetric{ + hanaStackingMetric("node001-bb01", "az1", "project-1", "Project One", "domain-1", "Domain One", 2*1638400*1024*1024), + hanaStackingMetric("node002-bb02", "az2", "project-2", "Project Two", "domain-1", "Domain One", 3276800*1024*1024), + }, + }, + { + name: "no instances produces no metrics", + servers: []nova.Server{}, + projects: []identity.Project{{ID: "project-1", Name: "Project One", DomainID: "domain-1"}}, + domains: []identity.Domain{{ID: "domain-1", Name: "Domain One"}}, + flavors: []nova.Flavor{{ID: "f1", Name: "hana_k_medium", RAM: 1638400}}, + hypervisors: []hv1.Hypervisor{{ObjectMeta: metav1.ObjectMeta{Name: "node001-bb01"}}}, + expectedMetrics: []collectedKVMMetric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testDB := setupHanaStackingDB(t, tt.servers, tt.projects, tt.domains, tt.flavors) + + client := buildKVMHypervisorClient(t, tt.hypervisors) + kpi := &KVMHanaStackingKPI{} + if err := kpi.Init(&testDB, client.Build(), conf.NewRawOpts("{}")); err != nil { + t.Fatalf("expected no error on Init, got %v", err) + } + + ch := make(chan prometheus.Metric, 100) + kpi.Collect(ch) + close(ch) + + actual := make(map[string]collectedKVMMetric) + for m := range ch { + var pm prometheusgo.Metric + if err := m.Write(&pm); err != nil { + t.Fatalf("failed to write metric: %v", err) + } + labels := make(map[string]string) + for _, lbl := range pm.Label { + labels[lbl.GetName()] = lbl.GetValue() + } + name := getMetricName(m.Desc().String()) + key := name + "|" + labels["compute_host"] + "|" + labels["project_id"] + if _, exists := actual[key]; exists { + 
t.Fatalf("duplicate metric key %q", key) + } + actual[key] = collectedKVMMetric{Name: name, Labels: labels, Value: pm.GetGauge().GetValue()} + } + + if len(actual) != len(tt.expectedMetrics) { + t.Errorf("expected %d metrics, got %d: actual=%v", len(tt.expectedMetrics), len(actual), actual) + } + for _, exp := range tt.expectedMetrics { + key := exp.Name + "|" + exp.Labels["compute_host"] + "|" + exp.Labels["project_id"] + got, ok := actual[key] + if !ok { + t.Errorf("missing metric %q", key) + continue + } + if got.Value != exp.Value { + t.Errorf("metric %q value: expected %v, got %v", key, exp.Value, got.Value) + } + if !reflect.DeepEqual(exp.Labels, got.Labels) { + t.Errorf("metric %q labels: expected %v, got %v", key, exp.Labels, got.Labels) + } + } + }) + } +} diff --git a/internal/knowledge/kpis/supported_kpis.go b/internal/knowledge/kpis/supported_kpis.go index cfcf56bd3..155e3aab9 100644 --- a/internal/knowledge/kpis/supported_kpis.go +++ b/internal/knowledge/kpis/supported_kpis.go @@ -22,6 +22,7 @@ var supportedKPIs = map[string]plugins.KPI{ "kvm_host_capacity_kpi": &infrastructure.KVMHostCapacityKPI{}, "kvm_project_utilization_kpi": &infrastructure.KVMProjectUtilizationKPI{}, + "kvm_hana_stacking_kpi": &infrastructure.KVMHanaStackingKPI{}, "vmware_project_utilization_kpi": &infrastructure.VMwareProjectUtilizationKPI{}, "vmware_project_commitments_kpi": &infrastructure.VMwareProjectCommitmentsKPI{}, "vmware_host_capacity_kpi": &infrastructure.VMwareHostCapacityKPI{}, diff --git a/internal/scheduling/lib/filter_weigher_pipeline.go b/internal/scheduling/lib/filter_weigher_pipeline.go index ee769433d..6a78f8ae6 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline.go +++ b/internal/scheduling/lib/filter_weigher_pipeline.go @@ -274,7 +274,16 @@ func (p *filterWeigherPipeline[RequestType]) Run(request RequestType) (v1alpha1. 
traceLog.Info("scheduler: starting pipeline", "hosts", hostsIn) // Normalize the input weights so we can apply step weights meaningfully. - inWeights := p.normalizeInputWeights(request.GetWeights()) + // Only do this if there are weighers to combine with: tanh saturates large + // inputs (e.g. Nova's 50/55/60) to ~1.0, which would destroy the original + // ordering. With no weighers configured, the normalized map flows straight + // to the sort, so we must keep the raw values to preserve that ordering. + var inWeights map[string]float64 + if len(p.weighers) > 0 { + inWeights = p.normalizeInputWeights(request.GetWeights()) + } else { + inWeights = maps.Clone(request.GetWeights()) + } traceLog.Info("scheduler: input weights", "weights", inWeights) // Run filters first to reduce the number of hosts. diff --git a/internal/scheduling/lib/filter_weigher_pipeline_test.go b/internal/scheduling/lib/filter_weigher_pipeline_test.go index 0e2775944..6a203eed1 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline_test.go +++ b/internal/scheduling/lib/filter_weigher_pipeline_test.go @@ -88,6 +88,41 @@ func TestPipeline_Run(t *testing.T) { } } +func TestPipeline_Run_NoWeighers_PreservesInputOrdering(t *testing.T) { + // With no weighers configured, the tanh normalization would saturate + // large input weights to ~1.0 and destroy the requester's ordering. The + // pipeline must skip normalization in that case so the original ordering + // flows through to the sort. + pipeline := &filterWeigherPipeline[mockFilterWeigherPipelineRequest]{} + + request := mockFilterWeigherPipelineRequest{ + Hosts: []string{"host1", "host2", "host3"}, + Weights: map[string]float64{ + "host1": 50.0, + "host2": 55.0, + "host3": 60.0, + }, + } + + // Run many times to surface any non-determinism from map iteration order. 
+ expected := []string{"host3", "host2", "host1"} + for i := range 50 { + result, err := pipeline.Run(request) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(result.OrderedHosts) != len(expected) { + t.Fatalf("expected %d results, got %d", len(expected), len(result.OrderedHosts)) + } + for j, host := range expected { + if result.OrderedHosts[j] != host { + t.Fatalf("iter %d: expected host %s at position %d, got %s (order=%v)", + i, host, j, result.OrderedHosts[j], result.OrderedHosts) + } + } + } +} + func TestPipeline_NormalizeNovaWeights(t *testing.T) { p := &filterWeigherPipeline[mockFilterWeigherPipelineRequest]{} diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go index d43912ee2..5843bb3d9 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go @@ -126,6 +126,17 @@ func newCRTestClient(scheme *runtime.Scheme, objects ...client.Object) client.Cl } return []string{cr.Spec.ProjectID} }). + WithIndex(&v1alpha1.Reservation{}, idxReservationByAllocationVMUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil { + return nil + } + uuids := make([]string, 0, len(res.Spec.CommittedResourceReservation.Allocations)) + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + uuids = append(uuids, vmUUID) + } + return uuids + }). 
Build() } diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go index 1f3733615..1237d0da5 100644 --- a/internal/scheduling/reservations/commitments/field_index.go +++ b/internal/scheduling/reservations/commitments/field_index.go @@ -18,15 +18,17 @@ const idxCommittedResourceByUUID = "spec.commitmentUUID" const idxCommittedResourceByProjectID = "spec.projectID" const idxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID" const idxProjectQuotaByProjectID = "spec.projectID" +const idxReservationByAllocationVMUUID = "spec.committedResourceReservation.allocations" // once guards ensure each field index is registered exactly once. // Both CommittedResourceController and UsageReconciler call indexCommittedResourceByUUID; // the underlying cache returns "indexer conflict" on double registration. var ( - onceIndexCRByUUID sync.Once - onceIndexCRByProjectID sync.Once - onceIndexReservationByUUID sync.Once - onceIndexPQByProjectID sync.Once + onceIndexCRByUUID sync.Once + onceIndexCRByProjectID sync.Once + onceIndexReservationByUUID sync.Once + onceIndexPQByProjectID sync.Once + onceIndexReservationByAllocationVMID sync.Once ) // indexCommittedResourceByUUID registers the index used by UsageReconciler to look up @@ -128,3 +130,34 @@ func indexProjectQuotaByProjectID(ctx context.Context, mcl *multicluster.Client) }) return err } + +// indexReservationByAllocationVMUUID registers an index over all VM UUIDs present in +// Spec.CommittedResourceReservation.Allocations. This allows the reservation controller +// to efficiently find all other Reservation CRDs carrying a specific VM UUID without +// scanning every reservation in the cluster. 
+func indexReservationByAllocationVMUUID(ctx context.Context, mcl *multicluster.Client) (err error) { + onceIndexReservationByAllocationVMID.Do(func() { + log := logf.FromContext(ctx) + err = mcl.IndexField(ctx, + &v1alpha1.Reservation{}, + &v1alpha1.ReservationList{}, + idxReservationByAllocationVMUUID, + func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok { + log.Error(errors.New("unexpected type"), "expected Reservation", "object", obj) + return nil + } + if res.Spec.CommittedResourceReservation == nil { + return nil + } + uuids := make([]string, 0, len(res.Spec.CommittedResourceReservation.Allocations)) + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + uuids = append(uuids, vmUUID) + } + return uuids + }, + ) + }) + return err +} diff --git a/internal/scheduling/reservations/commitments/integration_test.go b/internal/scheduling/reservations/commitments/integration_test.go index 857cf2734..8c51a8162 100644 --- a/internal/scheduling/reservations/commitments/integration_test.go +++ b/internal/scheduling/reservations/commitments/integration_test.go @@ -359,6 +359,17 @@ func newIntgEnv(t *testing.T, initialObjects []client.Object, schedulerFn http.H } return []string{cr.Spec.CommitmentUUID} }). + WithIndex(&v1alpha1.Reservation{}, idxReservationByAllocationVMUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil { + return nil + } + uuids := make([]string, 0, len(res.Spec.CommittedResourceReservation.Allocations)) + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + uuids = append(uuids, vmUUID) + } + return uuids + }). 
Build() schedulerSrv := httptest.NewServer(schedulerFn) diff --git a/internal/scheduling/reservations/commitments/reservation_controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go index 7924c7fb7..f75b152d2 100644 --- a/internal/scheduling/reservations/commitments/reservation_controller.go +++ b/internal/scheduling/reservations/commitments/reservation_controller.go @@ -395,6 +395,13 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} } + // Snapshot existing Status.Allocations before we overwrite it so we can detect + // which VM UUIDs are newly confirmed after the patch. + existingStatusAllocations := make(map[string]string, len(res.Status.CommittedResourceReservation.Allocations)) + for k, v := range res.Status.CommittedResourceReservation.Allocations { + existingStatusAllocations[k] = v + } + // Build new Status.Allocations map based on HV CRD state. newStatusAllocations := make(map[string]string) // Track allocations to remove from Spec (stale/leaving VMs). @@ -469,6 +476,19 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte res.Status.CommittedResourceReservation.Allocations = newStatusAllocations } + // Proactively remove this VM UUID from all other candidate reservations that still + // carry it in their Spec.Allocations. Only do this for VMs that are newly confirmed + // in this reconcile cycle (present in newStatusAllocations but absent in the snapshot + // taken before any patch) to avoid redundant work on subsequent reconciles. 
+ for vmUUID := range newStatusAllocations { + if _, wasAlreadyConfirmed := existingStatusAllocations[vmUUID]; wasAlreadyConfirmed { + continue + } + if err := r.cleanupCandidateReservations(ctx, res.Name, vmUUID); err != nil { + return nil, fmt.Errorf("failed to cleanup candidate reservations for vm %s: %w", vmUUID, err) + } + } + // Patch Status patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, res, patch); err != nil { @@ -487,6 +507,41 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte return result, nil } +// cleanupCandidateReservations removes vmUUID from Spec.Allocations of all Reservation CRDs +// other than the one that just confirmed the VM. This is called once per newly confirmed VM +// so that candidate slots on non-selected hosts are freed immediately rather than waiting +// for those reservations' own grace period or periodic requeue. +func (r *CommitmentReservationController) cleanupCandidateReservations(ctx context.Context, confirmedReservationName, vmUUID string) error { + logger := LoggerFromContext(ctx).WithValues("component", "controller", "vm", vmUUID) + + var candidates v1alpha1.ReservationList + if err := r.List(ctx, &candidates, client.MatchingFields{idxReservationByAllocationVMUUID: vmUUID}); err != nil { + return fmt.Errorf("failed to list candidate reservations: %w", err) + } + + for i := range candidates.Items { + candidate := &candidates.Items[i] + if candidate.Name == confirmedReservationName { + continue + } + if candidate.Spec.CommittedResourceReservation == nil { + continue + } + if _, ok := candidate.Spec.CommittedResourceReservation.Allocations[vmUUID]; !ok { + continue + } + old := candidate.DeepCopy() + delete(candidate.Spec.CommittedResourceReservation.Allocations, vmUUID) + if err := r.Patch(ctx, candidate, client.MergeFrom(old)); err != nil { + if client.IgnoreNotFound(err) != nil { + return fmt.Errorf("failed to patch candidate reservation %s: %w", candidate.Name, err) + } + } + 
logger.Info("removed vm from candidate reservation", "reservation", candidate.Name, "host", candidate.Status.Host) + } + return nil +} + // getPipelineForFlavorGroup returns the pipeline name for a given flavor group. func (r *CommitmentReservationController) getPipelineForFlavorGroup(flavorGroupName string, logger logr.Logger) string { // Try exact match first (e.g., "2152" -> "kvm-cr-hana") @@ -581,6 +636,10 @@ func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl return err } + if err := indexReservationByAllocationVMUUID(context.Background(), mcl); err != nil { + return fmt.Errorf("failed to set up reservation allocation VM UUID index: %w", err) + } + // Use WatchesMulticluster to watch Reservations across all configured clusters // (home + remotes). This is required because Reservation CRDs may be stored // in remote clusters, not just the home cluster. Without this, the controller diff --git a/internal/scheduling/reservations/commitments/reservation_controller_test.go b/internal/scheduling/reservations/commitments/reservation_controller_test.go index df6316d46..8c5817893 100644 --- a/internal/scheduling/reservations/commitments/reservation_controller_test.go +++ b/internal/scheduling/reservations/commitments/reservation_controller_test.go @@ -421,9 +421,239 @@ func TestHypervisorToReservations(t *testing.T) { } // ============================================================================ -// Test: reconcileInstanceReservation_Success (existing test) +// Test: cleanupCandidateReservations // ============================================================================ +// TestCleanupCandidateReservations_MultiCandidate covers the BLI #409 acceptance +// criterion: VM UUID in 3 reservations, confirmed on 1, cleaned from 2. 
+func TestCleanupCandidateReservations_MultiCandidate(t *testing.T) { + scheme := newCRTestScheme(t) + now := time.Now() + oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) + + const vmUUID = "vm-candidate-uuid" + + makeAlloc := func() map[string]v1alpha1.CommittedResourceAllocation { + return map[string]v1alpha1.CommittedResourceAllocation{ + vmUUID: { + CreationTimestamp: oldTime, + Resources: map[hv1.ResourceName]resource.Quantity{"memory": resource.MustParse("4Gi")}, + }, + } + } + + winning := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-winning"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-1", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: makeAlloc(), + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-1"}, + } + candidate2 := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-candidate-2"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-2", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: makeAlloc(), + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-2"}, + } + candidate3 := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-candidate-3"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-3", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: makeAlloc(), + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-3"}, + } + + k8sClient := newCRTestClient(scheme, winning, candidate2, candidate3) + ctrl := &CommitmentReservationController{Client: k8sClient, Scheme: scheme} + ctx := WithNewGlobalRequestID(context.Background()) + + if err := ctrl.cleanupCandidateReservations(ctx, "res-winning", vmUUID); err != nil { + t.Fatalf("cleanupCandidateReservations() error = %v", err) + } + + // winning 
reservation must still carry the VM UUID + var updatedWinning v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(winning), &updatedWinning); err != nil { + t.Fatalf("failed to get winning reservation: %v", err) + } + if _, ok := updatedWinning.Spec.CommittedResourceReservation.Allocations[vmUUID]; !ok { + t.Errorf("winning reservation should still carry vm UUID %s", vmUUID) + } + + // both candidates must have the VM UUID removed + for _, obj := range []*v1alpha1.Reservation{candidate2, candidate3} { + var updated v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(obj), &updated); err != nil { + t.Fatalf("failed to get %s: %v", obj.Name, err) + } + if _, ok := updated.Spec.CommittedResourceReservation.Allocations[vmUUID]; ok { + t.Errorf("candidate reservation %s should have vm UUID %s removed", obj.Name, vmUUID) + } + } +} + +// TestReconcileAllocations_ConfirmTriggersCandidateCleanup verifies the end-to-end flow: +// when reconcileAllocations confirms a VM on the winning reservation, the same VM UUID +// is removed from candidate reservations on other hosts. 
+func TestReconcileAllocations_ConfirmTriggersCandidateCleanup(t *testing.T) { + scheme := newCRTestScheme(t) + now := time.Now() + oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) + + const vmUUID = "vm-confirmed-uuid" + config := ReservationControllerConfig{AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}} + + makeAlloc := func() map[string]v1alpha1.CommittedResourceAllocation { + return map[string]v1alpha1.CommittedResourceAllocation{ + vmUUID: { + CreationTimestamp: oldTime, + Resources: map[hv1.ResourceName]resource.Quantity{"memory": resource.MustParse("4Gi")}, + }, + } + } + + winning := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-winning"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-1", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: makeAlloc(), + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ + Allocations: map[string]string{}, // not yet confirmed + }, + }, + } + candidate2 := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-candidate-2"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-2", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: makeAlloc(), + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-2"}, + } + hypervisor := newTestHypervisorCRD("host-1", []hv1.Instance{ + {ID: vmUUID, Name: "vm-name", Active: true}, + }) + + k8sClient := newCRTestClient(scheme, winning, candidate2, hypervisor) + controller := &CommitmentReservationController{Client: k8sClient, Scheme: scheme, Conf: config} + ctx := WithNewGlobalRequestID(context.Background()) + + if _, err := controller.reconcileAllocations(ctx, winning); err != nil { + t.Fatalf("reconcileAllocations() error = %v", err) + } + + // VM must 
be confirmed on winning reservation + var updatedWinning v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(winning), &updatedWinning); err != nil { + t.Fatalf("failed to get winning reservation: %v", err) + } + if updatedWinning.Status.CommittedResourceReservation == nil || + updatedWinning.Status.CommittedResourceReservation.Allocations[vmUUID] != "host-1" { + t.Errorf("expected vm %s confirmed on host-1 in winning reservation status", vmUUID) + } + + // VM UUID must be removed from the candidate reservation + var updatedCandidate v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(candidate2), &updatedCandidate); err != nil { + t.Fatalf("failed to get candidate reservation: %v", err) + } + if _, ok := updatedCandidate.Spec.CommittedResourceReservation.Allocations[vmUUID]; ok { + t.Errorf("expected vm UUID %s to be removed from candidate reservation", vmUUID) + } +} + +// TestReconcileAllocations_NoCleanupForAlreadyConfirmedVM verifies that a VM UUID already +// confirmed in a previous reconcile cycle (present in existingStatusAllocations) does +// not trigger a redundant cleanup call. +func TestReconcileAllocations_NoCleanupForAlreadyConfirmedVM(t *testing.T) { + scheme := newCRTestScheme(t) + now := time.Now() + oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) + + const vmUUID = "vm-already-confirmed" + config := ReservationControllerConfig{AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}} + + // Winning reservation already has the VM in Status.Allocations from a prior cycle. 
+ winning := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-winning"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-1", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + vmUUID: { + CreationTimestamp: oldTime, + Resources: map[hv1.ResourceName]resource.Quantity{"memory": resource.MustParse("4Gi")}, + }, + }, + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ + Allocations: map[string]string{vmUUID: "host-1"}, // already confirmed + }, + }, + } + // This candidate still has the VM UUID — it should NOT be touched because the VM + // was confirmed in a previous cycle (the cleanup already ran then). + staleCandidate := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{Name: "res-stale-candidate"}, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: "host-2", + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + vmUUID: {CreationTimestamp: oldTime}, + }, + }, + }, + Status: v1alpha1.ReservationStatus{Host: "host-2"}, + } + hypervisor := newTestHypervisorCRD("host-1", []hv1.Instance{ + {ID: vmUUID, Name: "vm-name", Active: true}, + }) + + k8sClient := newCRTestClient(scheme, winning, staleCandidate, hypervisor) + controller := &CommitmentReservationController{Client: k8sClient, Scheme: scheme, Conf: config} + ctx := WithNewGlobalRequestID(context.Background()) + + if _, err := controller.reconcileAllocations(ctx, winning); err != nil { + t.Fatalf("reconcileAllocations() error = %v", err) + } + + // Stale candidate should be untouched — cleanup only fires on first confirmation. 
+ var updatedCandidate v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(staleCandidate), &updatedCandidate); err != nil { + t.Fatalf("failed to get stale candidate: %v", err) + } + if _, ok := updatedCandidate.Spec.CommittedResourceReservation.Allocations[vmUUID]; !ok { + t.Errorf("stale candidate should not have been touched on a re-reconcile of an already-confirmed VM") + } +} + func TestCommitmentReservationController_reconcileInstanceReservation_Success(t *testing.T) { scheme := newCRTestScheme(t) diff --git a/pkg/multicluster/client.go b/pkg/multicluster/client.go index d4aaa7b85..6af557598 100644 --- a/pkg/multicluster/client.go +++ b/pkg/multicluster/client.go @@ -78,7 +78,13 @@ type RemoteConfig struct { // The remote kubernetes apiserver url, e.g. "https://my-apiserver:6443". Host string `json:"host"` // The root CA certificate to verify the remote apiserver. + // Ignored if InsecureSkipTLSVerify is true. CACert string `json:"caCert,omitempty"` + // InsecureSkipTLSVerify disables verification of the remote apiserver's + // TLS certificate. Use this for apiservers whose CA certificate rotates + // frequently and does not chain to a stable root. Mutually exclusive + // with CACert: when true, CACert is ignored. + InsecureSkipTLSVerify bool `json:"insecureSkipTLSVerify,omitempty"` // The resource GVKs this apiserver serves, formatted as "<group>/<version>/<Kind>". GVKs []string `json:"gvks"` // Labels used by ResourceRouters to match resources to this cluster @@ -121,7 +127,7 @@ func (c *Client) InitFromConf(ctx context.Context, mgr ctrl.Manager, conf Client } resolvedGVKs = append(resolvedGVKs, gvk) } - cl, err := c.AddRemote(ctx, remote.Host, remote.CACert, remote.Labels, resolvedGVKs...) + cl, err := c.AddRemote(ctx, remote.Host, remote.CACert, remote.InsecureSkipTLSVerify, remote.Labels, resolvedGVKs...) 
if err != nil { return err } @@ -135,15 +141,26 @@ func (c *Client) InitFromConf(ctx context.Context, mgr ctrl.Manager, conf Client // Add a remote cluster which uses the same REST config as the home cluster, // but a different host, for the given resource gvks. // +// If insecureSkipTLSVerify is true, the remote apiserver's TLS certificate +// is not verified and caCert is ignored. This is useful for apiservers whose +// CA certificate rotates frequently and does not chain to a stable root. +// // This can be used when the remote cluster accepts the home cluster's service // account tokens. See the kubernetes documentation on structured auth to // learn more about jwt-based authentication across clusters. -func (c *Client) AddRemote(ctx context.Context, host, caCert string, labels map[string]string, gvks ...schema.GroupVersionKind) (cluster.Cluster, error) { +func (c *Client) AddRemote(ctx context.Context, host, caCert string, insecureSkipTLSVerify bool, labels map[string]string, gvks ...schema.GroupVersionKind) (cluster.Cluster, error) { log := ctrl.LoggerFrom(ctx) homeRestConfig := *c.HomeRestConfig restConfigCopy := homeRestConfig restConfigCopy.Host = host - restConfigCopy.CAData = []byte(caCert) + if insecureSkipTLSVerify { + // Insecure and CAData are mutually exclusive in client-go's TLS validation. 
+ restConfigCopy.CAData = nil + restConfigCopy.CAFile = "" + restConfigCopy.Insecure = true + } else { + restConfigCopy.CAData = []byte(caCert) + } cl, err := cluster.New(&restConfigCopy, func(o *cluster.Options) { o.Scheme = c.HomeScheme }) @@ -156,7 +173,7 @@ func (c *Client) AddRemote(ctx context.Context, host, caCert string, labels map[ c.remoteClusters = make(map[schema.GroupVersionKind][]remoteCluster) } for _, gvk := range gvks { - log.Info("adding remote cluster for resource", "gvk", gvk, "host", host, "labels", labels) + log.Info("adding remote cluster for resource", "gvk", gvk, "host", host, "labels", labels, "insecureSkipTLSVerify", insecureSkipTLSVerify) c.remoteClusters[gvk] = append(c.remoteClusters[gvk], remoteCluster{ cluster: cl, labels: labels,