Skip to content

Commit 9236ca1

Browse files
committed
feat(webapp): notify customers of incidents via BetterStack webhook
When a status report is published on the status page, an inbound BetterStack webhook fans the update out over Slack, email, and Discord. Fires only on published incident updates, not monitor auto-alerts. Deduped per update; each channel no-ops unless its own config is present. Verified end-to-end against a real BetterStack test status page: real webhook delivered to Slack and Discord.
1 parent cc752cc commit 9236ca1

17 files changed

Lines changed: 1148 additions & 2 deletions
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
area: webapp
3+
type: feature
4+
---
5+
6+
Add an inbound webhook (`POST /webhooks/v1/betterstack-incidents`) that receives
7+
status-page incident updates and proactively notifies customers over Slack
8+
(channels matching a configurable name prefix), email (org admins, via the
9+
alerts email transport), and Discord (an incoming webhook). Delivery runs on the
10+
alerts redis-worker with per-surface jobs and is deduped on the incident update
11+
id. Gated by `INCIDENT_NOTIFY_ENABLED` plus a shared-secret token in the webhook
12+
URL; each surface no-ops unless its own config is present.

apps/webapp/app/env.server.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,6 +1606,17 @@ const EnvironmentSchema = z
16061606
BETTERSTACK_API_KEY: z.string().optional(),
16071607
BETTERSTACK_STATUS_PAGE_ID: z.string().optional(),
16081608

1609+
// Incident notifications: when a status report is published on the status
1610+
// page, BetterStack calls our inbound webhook and we fan the update out over
1611+
// Slack/email/Discord. Each surface is a no-op unless its own config below
1612+
// is present. The status-page webhook has no signature, so the endpoint is
1613+
// gated by a shared secret in the URL. The Slack channel prefix has no
1614+
// default — Slack delivery only runs when a prefix is explicitly configured.
1615+
INCIDENT_NOTIFY_ENABLED: z.string().default("0"),
1616+
BETTERSTACK_INCIDENT_WEBHOOK_SECRET: z.string().optional(),
1617+
INCIDENT_NOTIFY_SLACK_CHANNEL_PREFIX: z.string().optional(),
1618+
INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL: z.string().optional(),
1619+
16091620
RUN_REPLICATION_REDIS_HOST: z
16101621
.string()
16111622
.optional()
@@ -2010,6 +2021,14 @@ const EnvironmentSchema = z
20102021
.and(GithubAppEnvSchema)
20112022
.and(S2EnvSchema)
20122023
.superRefine((env, ctx) => {
2024+
if (env.INCIDENT_NOTIFY_ENABLED === "1" && !env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET) {
2025+
ctx.addIssue({
2026+
code: z.ZodIssueCode.custom,
2027+
path: ["BETTERSTACK_INCIDENT_WEBHOOK_SECRET"],
2028+
message: "BETTERSTACK_INCIDENT_WEBHOOK_SECRET is required when INCIDENT_NOTIFY_ENABLED=1",
2029+
});
2030+
}
2031+
20132032
const presets = new Set(env.COMPUTE_TEMPLATE_MACHINE_PRESETS);
20142033
for (const required of env.COMPUTE_TEMPLATE_MACHINE_PRESETS_REQUIRED) {
20152034
if (!presets.has(required)) {
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import { type ActionFunctionArgs, json } from "@remix-run/server-runtime";
2+
import { createHash, timingSafeEqual } from "node:crypto";
3+
import { env } from "~/env.server";
4+
import {
5+
IncidentWebhookSchema,
6+
isCustomerNotifiableEvent,
7+
normalizeIncidentUpdate,
8+
} from "~/services/betterstack/incidentWebhook";
9+
import { logger } from "~/services/logger.server";
10+
import { alertsWorker } from "~/v3/alertsWorker.server";
11+
12+
/**
 * Inbound webhook for status-page incident updates.
 *
 * BetterStack calls this route when a status report is published or edited;
 * the update is then fanned out to customers over Slack/email/Discord.
 *
 * Authentication: the webhook is unsigned and BetterStack cannot attach custom
 * headers to status-page subscriptions (it explicitly recommends URL-based
 * auth), so access is gated by a shared secret passed as `?token=`. The token
 * is redacted from access logs and the request logger at ingress (see
 * redactSensitiveQueryParams in server.ts). When the feature is disabled or no
 * secret is configured the route answers 404 so the endpoint isn't advertised.
 *
 * Delivery: we answer 200 quickly and hand the work to the background worker.
 * BetterStack redelivers on failure (up to 10 times), so the enqueue is deduped
 * on the update id to avoid notifying twice for the same update.
 */
export async function action({ request }: ActionFunctionArgs) {
  if (request.method.toUpperCase() !== "POST") {
    return json({ error: "Method not allowed" }, { status: 405 });
  }

  const expectedToken = env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET;
  const featureEnabled = env.INCIDENT_NOTIFY_ENABLED === "1";
  if (!featureEnabled || !expectedToken) {
    return json({ error: "Not found" }, { status: 404 });
  }

  const { searchParams } = new URL(request.url);
  const suppliedToken = searchParams.get("token") ?? "";
  if (!secretsMatch(suppliedToken, expectedToken)) {
    return json({ error: "Invalid token" }, { status: 401 });
  }

  // Read the body outside the try block so a failed read still surfaces as an
  // error instead of being reported as malformed JSON.
  const bodyText = await request.text();

  let body: unknown;
  try {
    body = JSON.parse(bodyText);
  } catch {
    return json({ error: "Invalid JSON" }, { status: 400 });
  }

  const validation = IncidentWebhookSchema.safeParse(body);
  if (!validation.success) {
    const { issues } = validation.error;
    logger.warn("BetterStack incident webhook: invalid payload", {
      issues,
    });
    return json({ error: "Invalid payload", issues }, { status: 400 });
  }

  const event = validation.data;

  // Maintenance and component-update events are not customer incidents.
  if (!isCustomerNotifiableEvent(event)) {
    return json({ ignored: true, reason: "non_incident_event" }, { status: 200 });
  }

  // An incident without any updates has nothing to relay yet.
  const update = normalizeIncidentUpdate(event);
  if (!update) {
    return json({ ignored: true, reason: "no_updates" }, { status: 200 });
  }

  // The job id doubles as the idempotency key for redelivered webhooks.
  await alertsWorker.enqueueOnce({
    id: `incident-notify:${update.updateId}`,
    job: "v3.fanoutIncidentNotification",
    payload: update,
  });

  return json({ received: true }, { status: 200 });
}
74+
75+
/**
 * Compare two secrets without leaking timing information.
 *
 * timingSafeEqual only accepts buffers of equal length, so both inputs are
 * first reduced to fixed-width SHA-256 digests. That keeps the comparison
 * constant-time and avoids revealing the length of the configured secret.
 */
function secretsMatch(a: string, b: string): boolean {
  const sha256 = (value: string) => createHash("sha256").update(value).digest();
  return timingSafeEqual(sha256(a), sha256(b));
}
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import { z } from "zod";
2+
3+
// Payload shape for the BetterStack status-page "subscribe via webhook"
4+
// integration. This fires when a status report is published or updated on the
5+
// status page (event_type "incident"). Maintenance windows and individual
6+
// component changes arrive as separate event types that we ignore.
7+
//
8+
// Note: this webhook is NOT cryptographically signed by BetterStack, so the
9+
// route gates access with a shared secret in the URL instead.
10+
11+
// BetterStack emits resource ids as JSON numbers rather than strings. Both
// forms are accepted and coerced to a string so downstream code (dedup keys,
// logs) never has to care which one arrived.
const IdSchema = z.union([z.string(), z.number()]).transform((rawId) => String(rawId));

// Text field that may be a string, null, or absent. A factory is used so each
// field gets its own schema instance.
const nullishText = () => z.string().nullish();

/** A single entry in an incident's `incident_updates` list. */
export const IncidentUpdateSchema = z.object({
  id: IdSchema,
  status_report_id: IdSchema.optional(),
  body: nullishText(),
  created_at: nullishText(),
  updated_at: nullishText(),
});

// Aggregate status-page state delivered alongside the event.
const StatusPageSchema = z.object({
  id: IdSchema.optional(),
  status_indicator: nullishText(),
  status_description: nullishText(),
});

// The status report itself, including its list of updates (defaults to empty).
const IncidentSchema = z.object({
  id: IdSchema,
  name: nullishText(),
  created_at: nullishText(),
  updated_at: nullishText(),
  shortlink: nullishText(),
  incident_updates: z.array(IncidentUpdateSchema).default([]),
});

/** Top-level body of the status-page webhook callback. */
export const IncidentWebhookSchema = z.object({
  event_type: z.string(),
  page: StatusPageSchema.optional(),
  // `incident` is optional so that maintenance/component-update callbacks
  // (which carry no `incident`) still parse and reach the ignore path instead
  // of failing with a 400 that BetterStack would retry.
  incident: IncidentSchema.optional(),
});

export type IncidentWebhook = z.infer<typeof IncidentWebhookSchema>;

/** Runtime schema for the flattened update handed to the background jobs. */
export const NormalizedIncidentUpdateSchema = z.object({
  incidentId: z.string(),
  updateId: z.string(),
  name: z.string(),
  statusIndicator: z.string(),
  body: z.string(),
  shortlink: z.string().nullable(),
  updatedAt: z.string().nullable(),
});

/** Flattened view of one status report update, as relayed to customers. */
export type NormalizedIncidentUpdate = {
  /** Stable identifier of the incident; groups all of its updates. */
  incidentId: string;
  /** Identifier of this specific status report update (the idempotency key). */
  updateId: string;
  /** Title of the incident as shown on the status page. */
  name: string;
  /** Latest aggregate page state: operational | degraded | downtime | maintenance. */
  statusIndicator: string;
  /** Markdown body of the most recent status report update. */
  body: string;
  /** Public shortlink to the incident on the status page, if any. */
  shortlink: string | null;
  /** ISO timestamp of the most recent update, when provided. */
  updatedAt: string | null;
};
76+
77+
/**
 * Decide whether a webhook callback should result in customer notifications.
 *
 * Only status-report callbacks (`event_type` of "incident") that actually carry
 * an `incident` object qualify. Automatic monitor alerting is delivered through
 * a different webhook and never uses this event type, which is the separation
 * we want.
 *
 * @param payload - Parsed webhook body.
 * @returns `true` when the event is a status report with an incident attached.
 */
export function isCustomerNotifiableEvent(payload: IncidentWebhook): boolean {
  if (payload.event_type !== "incident") {
    return false;
  }
  return Boolean(payload.incident);
}
85+
86+
/**
 * Collapse the webhook into the single most-recent update.
 *
 * The newest update is the one with the greatest parseable `created_at`.
 * Entries whose `created_at` is missing or cannot be parsed are ranked as the
 * oldest, and ties keep the earliest entry in the list. When no entry has a
 * usable timestamp this therefore selects the first entry, matching
 * BetterStack's newest-first ordering.
 *
 * @param payload - Parsed webhook body.
 * @returns The flattened update, or `null` when the payload carries no incident
 *   or the incident has no updates to relay yet.
 */
export function normalizeIncidentUpdate(payload: IncidentWebhook): NormalizedIncidentUpdate | null {
  const incident = payload.incident;
  if (!incident) {
    return null;
  }

  const updates = incident.incident_updates;
  if (updates.length === 0) {
    return null;
  }

  // Date.parse yields NaN for malformed input. Feeding NaN into a comparison
  // would make the ordering inconsistent, so unparseable values rank as 0.
  const timestampOf = (update: { created_at?: string | null }): number => {
    const parsed = update.created_at ? Date.parse(update.created_at) : Number.NaN;
    return Number.isFinite(parsed) ? parsed : 0;
  };

  // Single pass; the strict `>` keeps the earliest list entry on ties.
  const mostRecent = updates.reduce((newest, candidate) =>
    timestampOf(candidate) > timestampOf(newest) ? candidate : newest
  );

  return {
    incidentId: incident.id,
    updateId: mostRecent.id,
    name: incident.name?.trim() || "Service incident",
    statusIndicator: payload.page?.status_indicator?.trim() || "downtime",
    body: mostRecent.body?.trim() || "",
    shortlink: incident.shortlink ?? null,
    updatedAt: mostRecent.created_at ?? incident.updated_at ?? null,
  };
}

apps/webapp/app/utils/redactUrl.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Query params that carry credentials and must never reach access logs, the
// structured request logger, or traces. Some inbound webhooks (e.g. the
// BetterStack status-page webhook) can only authenticate via a URL query token
// because the sender cannot attach custom headers, so we redact at ingress.
// Names are matched case-insensitively.
export const SENSITIVE_QUERY_PARAMS = ["token", "secret", "access_token", "api_key"];

/**
 * Return `url` with the values of any sensitive query params replaced by a
 * `[redacted]` placeholder.
 *
 * Param names are compared case-insensitively, so `?Token=` and `?API_KEY=` are
 * redacted just like their lowercase forms. When at least one param is
 * redacted the query string is re-serialized by `URLSearchParams`, so the
 * placeholder appears form-encoded as `%5Bredacted%5D` and repeated sensitive
 * keys collapse into one. URLs with nothing to redact are returned untouched.
 *
 * Accepts absolute URLs or path+query strings. Never throws: if parsing fails
 * the input is returned unchanged so redaction can't break request handling or
 * span creation.
 *
 * @param url - Absolute URL or path+query string.
 * @returns The URL with sensitive query values masked.
 */
export function redactSensitiveQueryParams(url: string): string {
  const queryStart = url.indexOf("?");
  if (queryStart === -1) {
    return url;
  }

  try {
    const params = new URLSearchParams(url.slice(queryStart + 1));
    const sensitiveNames = new Set(SENSITIVE_QUERY_PARAMS.map((name) => name.toLowerCase()));

    // Snapshot the distinct keys first: set() removes duplicate entries, which
    // would disturb iteration over the live params object.
    const presentKeys: string[] = [];
    params.forEach((_value, key) => {
      if (!presentKeys.includes(key)) {
        presentKeys.push(key);
      }
    });

    let didRedact = false;
    for (const key of presentKeys) {
      if (sensitiveNames.has(key.toLowerCase())) {
        // Keep the key's original casing and position; only the value is masked.
        params.set(key, "[redacted]");
        didRedact = true;
      }
    }

    return didRedact ? `${url.slice(0, queryStart)}?${params.toString()}` : url;
  } catch {
    return url;
  }
}

apps/webapp/app/v3/alertsWorker.server.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,17 @@ import { z } from "zod";
44
import { env } from "~/env.server";
55
import { logger } from "~/services/logger.server";
66
import { singleton } from "~/utils/singleton";
7+
import { NormalizedIncidentUpdateSchema } from "~/services/betterstack/incidentWebhook";
78
import { DeliverAlertService } from "./services/alerts/deliverAlert.server";
89
import { DeliverErrorGroupAlertService } from "./services/alerts/deliverErrorGroupAlert.server";
910
import { ErrorAlertEvaluator } from "./services/alerts/errorAlertEvaluator.server";
11+
import { deliverIncidentToDiscord } from "./services/alerts/incidentNotifications/deliverDiscord.server";
12+
import {
13+
deliverIncidentEmailPage,
14+
deliverIncidentEmailToRecipient,
15+
} from "./services/alerts/incidentNotifications/deliverEmail.server";
16+
import { deliverIncidentToSlack } from "./services/alerts/incidentNotifications/deliverSlack.server";
17+
import { fanoutIncidentNotification } from "./services/alerts/incidentNotifications/fanout.server";
1018
import { PerformDeploymentAlertsService } from "./services/alerts/performDeploymentAlerts.server";
1119
import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server";
1220

@@ -93,6 +101,52 @@ function initializeWorker() {
93101
},
94102
logErrors: true,
95103
},
104+
"v3.fanoutIncidentNotification": {
105+
schema: NormalizedIncidentUpdateSchema,
106+
visibilityTimeoutMs: 30_000,
107+
retry: {
108+
maxAttempts: 3,
109+
},
110+
logErrors: true,
111+
},
112+
"v3.deliverIncidentSlack": {
113+
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
114+
visibilityTimeoutMs: 60_000,
115+
retry: {
116+
maxAttempts: 3,
117+
},
118+
logErrors: true,
119+
},
120+
"v3.deliverIncidentDiscord": {
121+
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
122+
visibilityTimeoutMs: 30_000,
123+
retry: {
124+
maxAttempts: 3,
125+
},
126+
logErrors: true,
127+
},
128+
"v3.deliverIncidentEmail": {
129+
schema: z.object({
130+
update: NormalizedIncidentUpdateSchema,
131+
cursor: z.string().nullable(),
132+
}),
133+
visibilityTimeoutMs: 60_000,
134+
retry: {
135+
maxAttempts: 3,
136+
},
137+
logErrors: true,
138+
},
139+
"v3.deliverIncidentEmailRetry": {
140+
schema: z.object({
141+
update: NormalizedIncidentUpdateSchema,
142+
recipient: z.object({ userId: z.string(), email: z.string() }),
143+
}),
144+
visibilityTimeoutMs: 30_000,
145+
retry: {
146+
maxAttempts: 3,
147+
},
148+
logErrors: true,
149+
},
96150
},
97151
concurrency: {
98152
workers: env.ALERTS_WORKER_CONCURRENCY_WORKERS,
@@ -126,6 +180,21 @@ function initializeWorker() {
126180
const service = new DeliverErrorGroupAlertService();
127181
await service.call(payload);
128182
},
183+
"v3.fanoutIncidentNotification": async ({ payload }) => {
184+
await fanoutIncidentNotification(payload);
185+
},
186+
"v3.deliverIncidentSlack": async ({ payload }) => {
187+
await deliverIncidentToSlack(payload.update);
188+
},
189+
"v3.deliverIncidentDiscord": async ({ payload }) => {
190+
await deliverIncidentToDiscord(payload.update);
191+
},
192+
"v3.deliverIncidentEmail": async ({ payload }) => {
193+
await deliverIncidentEmailPage(payload);
194+
},
195+
"v3.deliverIncidentEmailRetry": async ({ payload }) => {
196+
await deliverIncidentEmailToRecipient(payload);
197+
},
129198
},
130199
});
131200

0 commit comments

Comments
 (0)