diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 7c1cd7265df..2a64178cb85 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4363,6 +4363,17 @@ query_rejection: # external labels for alerting rules [ruler_external_labels: | default = []] +# Per-tenant external URL for the ruler. If set, it overrides the global +# -ruler.external.url for this tenant's alert notifications. +[ruler_external_url: | default = ""] + +# Go text/template for alert generator URLs. Available variables: .ExternalURL +# (resolved external URL) and .Expression (PromQL expression). Built-in +# functions like urlquery are available. If empty, uses default Prometheus +# /graph format. Example for a custom explore link: +# "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}" +[ruler_alert_generator_url_template: | default = ""] + # Enable to allow rules to be evaluated with data from a single zone, if other # zones are not available. [rules_partial_data: | default = false] diff --git a/docs/getting-started/.env b/docs/getting-started/.env index 52b62bd990b..81b6cc44d5b 100644 --- a/docs/getting-started/.env +++ b/docs/getting-started/.env @@ -2,4 +2,4 @@ CORTEX_VERSION=v1.20.1 GRAFANA_VERSION=10.4.2 PROMETHEUS_VERSION=v3.2.1 SEAWEEDFS_VERSION=3.67 -PERSES_VERSION=v0.49-distroless-debug +PERSES_VERSION=v0.53.1-distroless-debug diff --git a/docs/getting-started/cortex-config.yaml b/docs/getting-started/cortex-config.yaml index 1b24084ad3f..9351b788f18 100644 --- a/docs/getting-started/cortex-config.yaml +++ b/docs/getting-started/cortex-config.yaml @@ -82,6 +82,14 @@ frontend_worker: # https://cortexmetrics.io/docs/configuration/configuration-file/#ruler_config ruler: enable_api: true + external_url: http://localhost:9009 + alertmanager_url: http://localhost:9009/alertmanager + +# Per-tenant runtime configuration (hot-reloaded without restart). 
+# This file configures per-tenant overrides such as custom alert generator +# URL templates for Grafana, Perses, or any metrics explorer. +runtime_config: + file: /config/runtime-config.yaml # https://cortexmetrics.io/docs/configuration/configuration-file/#ruler_storage_config ruler_storage: diff --git a/docs/getting-started/docker-compose.yaml b/docs/getting-started/docker-compose.yaml index 1c48394b16c..47ac1d7e2e3 100644 --- a/docs/getting-started/docker-compose.yaml +++ b/docs/getting-started/docker-compose.yaml @@ -17,6 +17,7 @@ services: - -config.file=/config/cortex-config.yaml volumes: - ./cortex-config.yaml:/config/cortex-config.yaml:ro + - ./runtime-config.yaml:/config/runtime-config.yaml:ro ports: - "9009:9009" healthcheck: @@ -47,6 +48,8 @@ services: volumes: - ./perses/config.yaml:/etc/perses/config/config.yaml:ro - ./perses/datasource.yaml:/etc/perses/resources/datasource.yaml:ro + - ./perses/datasource-tenant-a.yaml:/etc/perses/resources/datasource-tenant-a.yaml:ro + - ./perses/datasource-tenant-b.yaml:/etc/perses/resources/datasource-tenant-b.yaml:ro - ./perses/project.yaml:/etc/perses/resources/project.yaml:ro - ./perses/dashboards/cortex-writes.yaml:/etc/perses/resources/cortex-writes.yaml:ro prometheus: diff --git a/docs/getting-started/grafana-datasource-docker.yaml b/docs/getting-started/grafana-datasource-docker.yaml index a40cce5e65f..2087d9f237d 100644 --- a/docs/getting-started/grafana-datasource-docker.yaml +++ b/docs/getting-started/grafana-datasource-docker.yaml @@ -5,6 +5,7 @@ apiVersion: 1 datasources: - name: Cortex type: prometheus + uid: cortex access: proxy orgId: 1 url: http://cortex:9009/api/prom @@ -71,3 +72,25 @@ datasources: secureJsonData: httpHeaderValue1: cortex version: 1 + - orgId: 1 + name: Tenant A Alertmanager + type: alertmanager + access: proxy + url: http://cortex:9009/ + jsonData: + httpHeaderName1: X-Scope-OrgID + implementation: cortex + secureJsonData: + httpHeaderValue1: tenant-a + version: 1 + - orgId: 1 + 
name: Tenant B Alertmanager + type: alertmanager + access: proxy + url: http://cortex:9009/ + jsonData: + httpHeaderName1: X-Scope-OrgID + implementation: cortex + secureJsonData: + httpHeaderValue1: tenant-b + version: 1 diff --git a/docs/getting-started/perses/config.yaml b/docs/getting-started/perses/config.yaml index b87f81bc0f6..ba04acce34e 100644 --- a/docs/getting-started/perses/config.yaml +++ b/docs/getting-started/perses/config.yaml @@ -8,7 +8,7 @@ security: database: file: extension: yaml - folder: /perses + folder: /tmp/perses-data schemas: datasources_path: /etc/perses/cue/schemas/datasources @@ -16,6 +16,11 @@ schemas: panels_path: /etc/perses/cue/schemas/panels queries_path: /etc/perses/cue/schemas/queries variables_path: /etc/perses/cue/schemas/variables + +frontend: + explorer: + enable: true + provisioning: folders: - /etc/perses/resources \ No newline at end of file diff --git a/docs/getting-started/perses/dashboards/cortex-writes.yaml b/docs/getting-started/perses/dashboards/cortex-writes.yaml index 8705ad5f556..a7de3b2795b 100644 --- a/docs/getting-started/perses/dashboards/cortex-writes.yaml +++ b/docs/getting-started/perses/dashboards/cortex-writes.yaml @@ -4,7 +4,7 @@ metadata: createdAt: 2025-03-24T19:15:47.468680767Z updatedAt: 2025-03-24T19:43:53.000136362Z version: 12 - project: default + project: cortex spec: display: name: Cortex / Writes diff --git a/docs/getting-started/perses/datasource-tenant-a.yaml b/docs/getting-started/perses/datasource-tenant-a.yaml new file mode 100644 index 00000000000..78d67370828 --- /dev/null +++ b/docs/getting-started/perses/datasource-tenant-a.yaml @@ -0,0 +1,14 @@ +kind: GlobalDatasource +metadata: + name: TenantA +spec: + default: false + plugin: + kind: PrometheusDatasource + spec: + proxy: + kind: HTTPProxy + spec: + url: http://cortex:9009/api/prom + headers: + X-Scope-OrgID: tenant-a diff --git a/docs/getting-started/perses/datasource-tenant-b.yaml 
b/docs/getting-started/perses/datasource-tenant-b.yaml new file mode 100644 index 00000000000..40f80a67492 --- /dev/null +++ b/docs/getting-started/perses/datasource-tenant-b.yaml @@ -0,0 +1,14 @@ +kind: GlobalDatasource +metadata: + name: TenantB +spec: + default: false + plugin: + kind: PrometheusDatasource + spec: + proxy: + kind: HTTPProxy + spec: + url: http://cortex:9009/api/prom + headers: + X-Scope-OrgID: tenant-b diff --git a/docs/getting-started/perses/project.yaml b/docs/getting-started/perses/project.yaml index a39681c7841..3b1a1ad9835 100644 --- a/docs/getting-started/perses/project.yaml +++ b/docs/getting-started/perses/project.yaml @@ -1,6 +1,6 @@ kind: Project metadata: - name: default + name: cortex spec: display: - name: "default" \ No newline at end of file + name: "Cortex" \ No newline at end of file diff --git a/docs/getting-started/runtime-config.yaml b/docs/getting-started/runtime-config.yaml new file mode 100644 index 00000000000..487cc9864b1 --- /dev/null +++ b/docs/getting-started/runtime-config.yaml @@ -0,0 +1,25 @@ +# Runtime configuration with per-tenant overrides. +# This file is hot-reloaded by Cortex without requiring a restart. +# +# The examples below demonstrate per-tenant alert generator URL templates. +# Each tenant can have a different URL format for alert "Source" links. + +overrides: + # Tenant using Grafana Explore for alert generator URLs. + # Clicking "Source" on an alert in Alertmanager opens Grafana Explore + # with the PromQL expression pre-filled. + tenant-a: + ruler_external_url: "http://localhost:3000" + ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?schemaVersion=1&panes=%7B%22default%22:%7B%22datasource%22:%22cortex%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22{{ urlquery .Expression }}%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&orgId=1 + + # Tenant using Perses for alert generator URLs. 
+ # Clicking "Source" on an alert opens Perses explore view with + # the PromQL expression pre-filled and the TenantB datasource selected. + tenant-b: + ruler_external_url: http://localhost:8080 + ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?explorer=Prometheus-PrometheusExplorer&data=%7B%22tab%22%3A%22graph%22%2C%22queries%22%3A%5B%7B%22kind%22%3A%22TimeSeriesQuery%22%2C%22spec%22%3A%7B%22plugin%22%3A%7B%22kind%22%3A%22PrometheusTimeSeriesQuery%22%2C%22spec%22%3A%7B%22datasource%22%3A%7B%22kind%22%3A%22PrometheusDatasource%22%2C%22name%22%3A%22tenantb%22%7D%2C%22query%22%3A%22{{ urlquery .Expression }}%22%7D%7D%7D%7D%5D%7D + + # Tenants without overrides use the global ruler.external.url + # and the default Prometheus /graph format. diff --git a/docs/getting-started/single-binary.md b/docs/getting-started/single-binary.md index 6321a1c238e..4b7c93ceb14 100644 --- a/docs/getting-started/single-binary.md +++ b/docs/getting-started/single-binary.md @@ -214,6 +214,133 @@ docker run --network cortex-docs-getting-started_default \ Configure Alertmanager notification policies in Grafana: [Alerting → Notification policies](http://localhost:3000/alerting/notifications?search=&alertmanager=Cortex%20Alertmanager) +## Step 7: Per-Tenant Alert Generator URLs (Optional) + +Cortex supports customizing the "Source" link on alerts per-tenant using Go `text/template` strings. This lets each tenant's alerts link back to their preferred metrics explorer — Grafana Explore, Perses, or any other tool. 
+ +The getting-started example includes a `runtime-config.yaml` with two tenant configurations: +- **tenant-a**: Alert source links point to **Grafana Explore** +- **tenant-b**: Alert source links point to **Perses** + +### How It Works + +The `ruler_alert_generator_url_template` field accepts a Go template with two variables: +- `{{ .ExternalURL }}` — the resolved external URL for this tenant (set via `ruler_external_url`) +- `{{ .Expression }}` — the PromQL expression that triggered the alert + +Built-in Go template functions like `urlquery` are available for URL encoding. + +Example for Grafana Explore: +```yaml +ruler_external_url: "http://localhost:3000" +ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?expr={{ urlquery .Expression }} +``` + +### Try It Out + +1. **Load alertmanager configs** for tenant-a and tenant-b: + +```sh +# Upload alertmanager config for tenant-a +curl -X POST http://localhost:9009/api/v1/alerts \ + -H "X-Scope-OrgID: tenant-a" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +alertmanager_config: | + receivers: + - name: default-receiver + route: + receiver: default-receiver + group_wait: 5s + group_interval: 10s +EOF + +# Upload alertmanager config for tenant-b +curl -X POST http://localhost:9009/api/v1/alerts \ + -H "X-Scope-OrgID: tenant-b" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +alertmanager_config: | + receivers: + - name: default-receiver + route: + receiver: default-receiver + group_wait: 5s + group_interval: 10s +EOF +``` + +2. 
**Load demo alert rules** that fire immediately: + +```sh +# Alert rules for tenant-a +curl -X POST http://localhost:9009/api/v1/rules/demo \ + -H "X-Scope-OrgID: tenant-a" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +name: demo_alerts +interval: 10s +rules: + - alert: HighMemoryUsage + expr: vector(85) > 80 + for: 0m + labels: + severity: warning + annotations: + summary: "Memory usage is above 80%" + - alert: HighErrorRate + expr: vector(5.2) > 5 + for: 0m + labels: + severity: critical + annotations: + summary: "Error rate exceeds 5%" +EOF + +# Alert rules for tenant-b +curl -X POST http://localhost:9009/api/v1/rules/demo \ + -H "X-Scope-OrgID: tenant-b" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +name: demo_alerts +interval: 10s +rules: + - alert: DiskSpaceLow + expr: vector(92) > 90 + for: 0m + labels: + severity: critical + annotations: + summary: "Disk space usage above 90%" + - alert: HighLatency + expr: vector(3.5) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: "P99 latency exceeds 2s" +EOF +``` + +3. **Wait ~30 seconds** for the ruler to evaluate rules and send alerts to the alertmanager. + +4. **View alerts in Grafana** at [Alerting → Alert groups](http://localhost:3000/alerting/groups?groupBy=alertname): + - Select the **Tenant A Alertmanager** datasource — click "See source" to open Grafana Explore + - Select the **Tenant B Alertmanager** datasource — click "See source" to open Perses + +5. 
**Verify generator URLs** via the API: + +```sh +# Tenant A: Grafana Explore URLs +curl -s "http://localhost:9009/alertmanager/api/v2/alerts" \ + -H "X-Scope-OrgID: tenant-a" | jq '.[].generatorURL' + +# Tenant B: Perses URLs +curl -s "http://localhost:9009/alertmanager/api/v2/alerts" \ + -H "X-Scope-OrgID: tenant-b" | jq '.[].generatorURL' +``` + ## Explore and Experiment Now that everything is running, try these experiments to learn how Cortex works: @@ -306,6 +433,7 @@ This setup uses several configuration files. Here's what each does: |----------------------------------|---------------------------------------------------------------| | `docker-compose.yaml` | Defines all services (Cortex, Prometheus, Grafana, SeaweedFS) | | `cortex-config.yaml` | Cortex configuration (storage, limits, components) | +| `runtime-config.yaml` | Per-tenant runtime overrides (alert generator URL templates) | | `prometheus-config.yaml` | Prometheus configuration with remote_write to Cortex | | `grafana-datasource-docker.yaml` | Grafana datasource pointing to Cortex | | `rules.yaml` | Example recording rules | diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 3a13151b4c6..2b46ae2b3df 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -19,6 +19,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/util/strutil" "github.com/weaveworks/common/httpgrpc" "github.com/weaveworks/common/user" @@ -173,6 +174,8 @@ type RulesLimits interface { RulerQueryOffset(userID string) time.Duration DisabledRuleGroups(userID string) validation.DisabledRuleGroups RulerExternalLabels(userID string) labels.Labels + RulerExternalURL(userID string) string + RulerAlertGeneratorURLTemplate(userID string) string } type QueryExecutor func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) @@ -378,11 +381,26 @@ func DefaultTenantManagerFactory(cfg Config, p 
Pusher, q storage.Queryable, engi Appendable: NewPusherAppendable(p, userID, overrides, evalMetrics.TotalWritesVec.WithLabelValues(userID), evalMetrics.FailedWritesVec.WithLabelValues(userID)), - Queryable: q, - QueryFunc: queryFunc, - Context: prometheusContext, - ExternalURL: cfg.ExternalURL.URL, - NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), + Queryable: q, + QueryFunc: queryFunc, + Context: prometheusContext, + ExternalURL: cfg.ExternalURL.URL, + NotifyFunc: SendAlerts(notifier, func(expr string) string { + externalURL := cfg.ExternalURL.String() + if tenantURL := overrides.RulerExternalURL(userID); tenantURL != "" { + externalURL = tenantURL + } + tmplStr := overrides.RulerAlertGeneratorURLTemplate(userID) + if tmplStr == "" { + return externalURL + strutil.TableLinkForExpression(expr) + } + result, err := executeGeneratorURLTemplate(tmplStr, externalURL, expr) + if err != nil { + level.Warn(logger).Log("msg", "failed to execute generator URL template, falling back to prometheus format", "err", err) + return externalURL + strutil.TableLinkForExpression(expr) + } + return result + }), Logger: util_log.GoKitLogToSlog(log.With(logger, "user", userID)), Registerer: reg, OutageTolerance: cfg.OutageTolerance, diff --git a/pkg/ruler/external_url.go b/pkg/ruler/external_url.go new file mode 100644 index 00000000000..0928413a889 --- /dev/null +++ b/pkg/ruler/external_url.go @@ -0,0 +1,56 @@ +package ruler + +import ( + "sync" +) + +// userExternalURL tracks per-user resolved external URLs and detects changes. 
+type userExternalURL struct { + global string + limits RulesLimits + + mtx sync.Mutex + users map[string]string +} + +func newUserExternalURL(global string, limits RulesLimits) *userExternalURL { + return &userExternalURL{ + global: global, + limits: limits, + + mtx: sync.Mutex{}, + users: map[string]string{}, + } +} + +func (e *userExternalURL) update(userID string) (string, bool) { + tenantURL := e.limits.RulerExternalURL(userID) + resolved := e.global + if tenantURL != "" { + resolved = tenantURL + } + + e.mtx.Lock() + defer e.mtx.Unlock() + + if prev, ok := e.users[userID]; ok && prev == resolved { + return resolved, false + } + + e.users[userID] = resolved + return resolved, true +} + +func (e *userExternalURL) remove(user string) { + e.mtx.Lock() + defer e.mtx.Unlock() + delete(e.users, user) +} + +func (e *userExternalURL) cleanup() { + e.mtx.Lock() + defer e.mtx.Unlock() + for user := range e.users { + delete(e.users, user) + } +} diff --git a/pkg/ruler/external_url_test.go b/pkg/ruler/external_url_test.go new file mode 100644 index 00000000000..50b88563e8e --- /dev/null +++ b/pkg/ruler/external_url_test.go @@ -0,0 +1,67 @@ +package ruler + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestUserExternalURL(t *testing.T) { + limits := ruleLimits{} + e := newUserExternalURL("http://global:9090", &limits) + + const userID = "test-user" + + t.Run("global URL used when no per-tenant override", func(t *testing.T) { + e.remove(userID) + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("no change on second update", func(t *testing.T) { + url, changed := e.update(userID) + require.False(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("per-tenant URL overrides global", func(t *testing.T) { + limits.mtx.Lock() + limits.externalURL = "http://tenant:3000" + limits.mtx.Unlock() + + url, changed := e.update(userID) + require.True(t, changed) + 
require.Equal(t, "http://tenant:3000", url) + }) + + t.Run("no change when per-tenant URL is the same", func(t *testing.T) { + url, changed := e.update(userID) + require.False(t, changed) + require.Equal(t, "http://tenant:3000", url) + }) + + t.Run("revert to global when per-tenant override removed", func(t *testing.T) { + limits.mtx.Lock() + limits.externalURL = "" + limits.mtx.Unlock() + + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("remove and cleanup lifecycle", func(t *testing.T) { + e.remove(userID) + // After remove, next update should report changed + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + + e.cleanup() + // After cleanup, next update should report changed + url, changed = e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) +} diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index d44a0d95829..86611201899 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -53,6 +53,9 @@ type DefaultMultiTenantManager struct { // Per-user externalLabels. userExternalLabels *userExternalLabels + // Per-user externalURL. 
+ userExternalURL *userExternalURL + // rules backup rulesBackupManager *rulesBackupManager @@ -101,6 +104,7 @@ func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory ruleEvalMetrics: evalMetrics, notifiers: map[string]*rulerNotifier{}, userExternalLabels: newUserExternalLabels(cfg.ExternalLabels, limits), + userExternalURL: newUserExternalURL(cfg.ExternalURL.String(), limits), notifiersDiscoveryMetrics: notifiersDiscoveryMetrics, mapper: newMapper(cfg.RulePath, logger), userManagers: map[string]RulesManager{}, @@ -166,6 +170,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou r.removeNotifier(userID) r.mapper.cleanupUser(userID) r.userExternalLabels.remove(userID) + r.userExternalURL.remove(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) r.configUpdatesTotal.DeleteLabelValues(userID) @@ -210,6 +215,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user return } externalLabels, externalLabelsUpdated := r.userExternalLabels.update(user) + externalURL, externalURLUpdated := r.userExternalURL.update(user) existing := true manager := r.getRulesManager(user, ctx) @@ -222,13 +228,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user return } - if !existing || rulesUpdated || externalLabelsUpdated { + if !existing || rulesUpdated || externalLabelsUpdated || externalURLUpdated { level.Debug(r.logger).Log("msg", "updating rules", "user", user) r.configUpdatesTotal.WithLabelValues(user).Inc() - if (rulesUpdated || externalLabelsUpdated) && existing { + if (rulesUpdated || externalLabelsUpdated || externalURLUpdated) && existing { r.updateRuleCache(user, manager.RuleGroups()) } - err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), r.ruleGroupIterationFunc) + err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, externalURL, 
r.ruleGroupIterationFunc) r.deleteRuleCache(user) if err != nil { r.lastReloadSuccessful.WithLabelValues(user).Set(0) @@ -443,6 +449,7 @@ func (r *DefaultMultiTenantManager) Stop() { // cleanup user rules directories r.mapper.cleanup() r.userExternalLabels.cleanup() + r.userExternalURL.cleanup() } func (m *DefaultMultiTenantManager) ValidateRuleGroup(g rulefmt.RuleGroup) []error { diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 57ee59e370a..197c96eca98 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -1,6 +1,7 @@ package ruler import ( + "bytes" "context" "flag" "fmt" @@ -12,6 +13,7 @@ import ( "sort" "strings" "sync" + "text/template" "time" "github.com/go-kit/log" @@ -26,7 +28,6 @@ import ( "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/promql/parser" promRules "github.com/prometheus/prometheus/rules" - "github.com/prometheus/prometheus/util/strutil" "github.com/weaveworks/common/user" "golang.org/x/sync/errgroup" @@ -507,7 +508,7 @@ type sender interface { // It filters any non-firing alerts from the input. // // Copied from Prometheus's main.go. -func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { +func SendAlerts(n sender, generatorURLFn func(expr string) string) promRules.NotifyFunc { return func(ctx context.Context, expr string, alerts ...*promRules.Alert) { var res []*notifier.Alert @@ -516,7 +517,7 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { StartsAt: alert.FiredAt, Labels: alert.Labels, Annotations: alert.Annotations, - GeneratorURL: externalURL + strutil.TableLinkForExpression(expr), + GeneratorURL: generatorURLFn(expr), } if !alert.ResolvedAt.IsZero() { a.EndsAt = alert.ResolvedAt @@ -532,6 +533,28 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { } } +// generatorURLTemplateData holds the variables available in generator URL templates. 
type generatorURLTemplateData struct {
	// ExternalURL is the external URL already resolved for the tenant.
	ExternalURL string
	// Expression is the PromQL expression of the alerting rule.
	Expression string
}

// generatorURLTemplates caches parsed generator URL templates keyed by their
// source text. The generator URL is rendered once per alert on every
// notification, while the template text only changes when the configuration
// is reloaded, so parsing on every call is wasted work. Entries are never
// evicted: the cache grows by one small entry per distinct template text seen
// by this process.
var generatorURLTemplates sync.Map // map[string]*template.Template

// parsedGeneratorURLTemplate returns the parsed form of tmplStr, parsing it
// only the first time a given template text is seen. Parse failures are not
// cached.
func parsedGeneratorURLTemplate(tmplStr string) (*template.Template, error) {
	if cached, ok := generatorURLTemplates.Load(tmplStr); ok {
		return cached.(*template.Template), nil
	}
	tmpl, err := template.New("generator_url").Parse(tmplStr)
	if err != nil {
		return nil, err
	}
	// LoadOrStore keeps a single winner if two goroutines parse concurrently.
	actual, _ := generatorURLTemplates.LoadOrStore(tmplStr, tmpl)
	return actual.(*template.Template), nil
}

// executeGeneratorURLTemplate executes a Go text/template to produce a generator URL.
//
// tmplStr is the template source; externalURL and expr are exposed to it as
// .ExternalURL and .Expression. It returns the rendered URL, or an empty
// string and an error when the template cannot be parsed or executed (for
// example when it references a field other than the two above).
func executeGeneratorURLTemplate(tmplStr, externalURL, expr string) (string, error) {
	tmpl, err := parsedGeneratorURLTemplate(tmplStr)
	if err != nil {
		return "", fmt.Errorf("parsing generator URL template: %w", err)
	}

	var buf bytes.Buffer
	data := generatorURLTemplateData{
		ExternalURL: externalURL,
		Expression:  expr,
	}
	// A parsed template is safe to execute from multiple goroutines, so the
	// cached instance can be shared by concurrent notifications.
	if err := tmpl.Execute(&buf, data); err != nil {
		// Discard any partial output written before the failure.
		return "", fmt.Errorf("executing generator URL template: %w", err)
	}
	return buf.String(), nil
}
+150,18 @@ func (r *ruleLimits) RulerExternalLabels(_ string) labels.Labels { return r.externalLabels } +func (r *ruleLimits) RulerExternalURL(_ string) string { + r.mtx.RLock() + defer r.mtx.RUnlock() + return r.externalURL +} + +func (r *ruleLimits) RulerAlertGeneratorURLTemplate(_ string) string { + r.mtx.RLock() + defer r.mtx.RUnlock() + return r.alertGeneratorURLTemplate +} + func newEmptyQueryable() storage.Queryable { return storage.QueryableFunc(func(mint, maxt int64) (storage.Querier, error) { return emptyQuerier{}, nil @@ -2684,10 +2699,13 @@ func (s senderFunc) Send(alerts ...*notifier.Alert) { func TestSendAlerts(t *testing.T) { testCases := []struct { - in []*promRules.Alert - exp []*notifier.Alert + name string + in []*promRules.Alert + exp []*notifier.Alert + generatorURLFn func(expr string) string }{ { + name: "prometheus format with valid until", in: []*promRules.Alert{ { Labels: labels.FromStrings("l1", "v1"), @@ -2706,8 +2724,12 @@ func TestSendAlerts(t *testing.T) { GeneratorURL: "http://localhost:9090/graph?g0.expr=up&g0.tab=1", }, }, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, }, { + name: "prometheus format with resolved at", in: []*promRules.Alert{ { Labels: labels.FromStrings("l1", "v1"), @@ -2726,21 +2748,107 @@ func TestSendAlerts(t *testing.T) { GeneratorURL: "http://localhost:9090/graph?g0.expr=up&g0.tab=1", }, }, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, }, { - in: []*promRules.Alert{}, + name: "empty alerts", + in: []*promRules.Alert{}, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, + }, + { + name: "custom template format", + in: []*promRules.Alert{ + { + Labels: labels.FromStrings("l1", "v1"), + Annotations: labels.FromStrings("a2", "v2"), + ActiveAt: time.Unix(1, 0), + FiredAt: time.Unix(2, 0), + 
ValidUntil: time.Unix(3, 0), + }, + }, + exp: []*notifier.Alert{ + { + Labels: labels.FromStrings("l1", "v1"), + Annotations: labels.FromStrings("a2", "v2"), + StartsAt: time.Unix(2, 0), + EndsAt: time.Unix(3, 0), + GeneratorURL: "http://grafana.example.com/explore?expr=up", + }, + }, + generatorURLFn: func(expr string) string { + result, _ := executeGeneratorURLTemplate( + "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}", + "http://grafana.example.com", expr) + return result + }, }, } - for i, tc := range testCases { - t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - senderFunc := senderFunc(func(alerts ...*notifier.Alert) { + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + sf := senderFunc(func(alerts ...*notifier.Alert) { if len(tc.in) == 0 { t.Fatalf("sender called with 0 alert") } - require.Equal(t, tc.exp, alerts) + if tc.exp != nil { + require.Equal(t, tc.exp, alerts) + } }) - SendAlerts(senderFunc, "http://localhost:9090")(context.TODO(), "up", tc.in...) + SendAlerts(sf, tc.generatorURLFn)(context.TODO(), "up", tc.in...) 
+ }) + } +} + +func TestExecuteGeneratorURLTemplate(t *testing.T) { + testCases := []struct { + name string + tmplStr string + externalURL string + expr string + expected string + expectErr bool + }{ + { + name: "basic template with expression", + tmplStr: "{{ .ExternalURL }}/graph?expr={{ .Expression }}", + externalURL: "http://prometheus:9090", + expr: "up", + expected: "http://prometheus:9090/graph?expr=up", + }, + { + name: "template with urlquery", + tmplStr: "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}", + externalURL: "http://grafana.example.com", + expr: "rate(http_requests_total[5m])", + expected: "http://grafana.example.com/explore?expr=rate%28http_requests_total%5B5m%5D%29", + }, + { + name: "invalid template returns error", + tmplStr: "{{ .Invalid", + expectErr: true, + }, + { + name: "template with multiple variables", + tmplStr: "{{ .ExternalURL }}/explore?left=%7B%22queries%22:%5B%7B%22expr%22:%22{{ urlquery .Expression }}%22%7D%5D%7D", + externalURL: "http://grafana:3000", + expr: "up", + expected: "http://grafana:3000/explore?left=%7B%22queries%22:%5B%7B%22expr%22:%22up%22%7D%5D%7D", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result, err := executeGeneratorURLTemplate(tc.tmplStr, tc.externalURL, tc.expr) + if tc.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tc.expected, result) + } }) } } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 73f09fe3407..4b7a64ba00f 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -10,6 +10,7 @@ import ( "math" "regexp" "strings" + "text/template" "time" "github.com/cespare/xxhash/v2" @@ -210,13 +211,15 @@ type Limits struct { QueryRejection QueryRejection `yaml:"query_rejection" json:"query_rejection" doc:"nocli|description=Configuration for query rejection."` // Ruler defaults and limits. 
- RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` - RulerTenantShardSize float64 `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` - RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` - RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` - RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` - RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` - RulesPartialData bool `yaml:"rules_partial_data" json:"rules_partial_data" doc:"nocli|description=Enable to allow rules to be evaluated with data from a single zone, if other zones are not available.|default=false"` + RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` + RulerTenantShardSize float64 `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` + RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` + RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` + RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` + RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` + RulerExternalURL string `yaml:"ruler_external_url" json:"ruler_external_url" doc:"nocli|description=Per-tenant external URL for the ruler. 
If set, it overrides the global -ruler.external.url for this tenant's alert notifications."` + RulerAlertGeneratorURLTemplate string `yaml:"ruler_alert_generator_url_template" json:"ruler_alert_generator_url_template" doc:"nocli|description=Go text/template for alert generator URLs. Available variables: .ExternalURL (resolved external URL) and .Expression (PromQL expression). Built-in functions like urlquery are available. If empty, uses default Prometheus /graph format."` + RulesPartialData bool `yaml:"rules_partial_data" json:"rules_partial_data" doc:"nocli|description=Enable to allow rules to be evaluated with data from a single zone, if other zones are not available.|default=false"` // Store-gateway. StoreGatewayTenantShardSize float64 `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -417,6 +420,12 @@ func (l *Limits) Validate(nameValidationScheme model.ValidationScheme, shardByAl } } + if l.RulerAlertGeneratorURLTemplate != "" { + if _, err := template.New("").Parse(l.RulerAlertGeneratorURLTemplate); err != nil { + return fmt.Errorf("invalid ruler_alert_generator_url_template: %w", err) + } + } + return nil } @@ -1161,6 +1170,14 @@ func (o *Overrides) RulerExternalLabels(userID string) labels.Labels { return o.GetOverridesForUser(userID).RulerExternalLabels } +func (o *Overrides) RulerExternalURL(userID string) string { + return o.GetOverridesForUser(userID).RulerExternalURL +} + +func (o *Overrides) RulerAlertGeneratorURLTemplate(userID string) string { + return o.GetOverridesForUser(userID).RulerAlertGeneratorURLTemplate +} + // MaxRegexPatternLength returns the maximum length of an unoptimized regex pattern. // This is only used in Ingester. 
func (o *Overrides) MaxRegexPatternLength(userID string) int { diff --git a/schemas/cortex-config-schema.json b/schemas/cortex-config-schema.json index 20cf970c35b..42cf631203b 100644 --- a/schemas/cortex-config-schema.json +++ b/schemas/cortex-config-schema.json @@ -5501,6 +5501,10 @@ "x-cli-flag": "frontend.results-cache-ttl", "x-format": "duration" }, + "ruler_alert_generator_url_template": { + "description": "Go text/template for alert generator URLs. Available variables: .ExternalURL (resolved external URL) and .Expression (PromQL expression). Built-in functions like urlquery are available. If empty, uses default Prometheus /graph format.", + "type": "string" + }, "ruler_evaluation_delay_duration": { "default": "0s", "description": "Deprecated(use ruler.query-offset instead) and will be removed in v1.19.0: Duration to delay the evaluation of rules to ensure the underlying metrics have been pushed to Cortex.", @@ -5514,6 +5518,10 @@ "description": "external labels for alerting rules", "type": "object" }, + "ruler_external_url": { + "description": "Per-tenant external URL for the ruler. If set, it overrides the global -ruler.external.url for this tenant's alert notifications.", + "type": "string" + }, "ruler_max_rule_groups_per_tenant": { "default": 0, "description": "Maximum number of rule groups per-tenant. 0 to disable.",