Skip to content

Commit b229cd3

Browse files
committed
feat(webapp): notify customers of incidents via BetterStack webhook
When a status report is published on the status page, an inbound BetterStack webhook fans the update out over Slack, email, and Discord. Fires only on published incident updates, not monitor auto-alerts. Deduped per update; each channel no-ops unless its own config is present.
1 parent cc752cc commit b229cd3

17 files changed

Lines changed: 1048 additions & 2 deletions
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
area: webapp
3+
type: feature
4+
---
5+
6+
Add an inbound webhook (`POST /webhooks/v1/betterstack-incidents`) that receives
7+
status-page incident updates and proactively notifies customers over Slack
8+
(channels matching a configurable name prefix), email (org admins, via the
9+
alerts email transport), and Discord (an incoming webhook). Delivery runs on the
10+
alerts redis-worker with per-surface jobs and is deduped on the incident update
11+
id. Gated by `INCIDENT_NOTIFY_ENABLED` plus a shared-secret token in the webhook
12+
URL; each surface no-ops unless its own config is present.

apps/webapp/app/env.server.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,6 +1606,14 @@ const EnvironmentSchema = z
16061606
BETTERSTACK_API_KEY: z.string().optional(),
16071607
BETTERSTACK_STATUS_PAGE_ID: z.string().optional(),
16081608

1609+
// Incident notifications: fan a published status report out over
1610+
// Slack/email/Discord. Each surface no-ops unless configured; the unsigned
1611+
// webhook is gated by a shared secret in the URL.
1612+
INCIDENT_NOTIFY_ENABLED: z.string().default("0"),
1613+
BETTERSTACK_INCIDENT_WEBHOOK_SECRET: z.string().optional(),
1614+
INCIDENT_NOTIFY_SLACK_CHANNEL_PREFIX: z.string().optional(),
1615+
INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL: z.string().optional(),
1616+
16091617
RUN_REPLICATION_REDIS_HOST: z
16101618
.string()
16111619
.optional()
@@ -2010,6 +2018,14 @@ const EnvironmentSchema = z
20102018
.and(GithubAppEnvSchema)
20112019
.and(S2EnvSchema)
20122020
.superRefine((env, ctx) => {
2021+
if (env.INCIDENT_NOTIFY_ENABLED === "1" && !env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET) {
2022+
ctx.addIssue({
2023+
code: z.ZodIssueCode.custom,
2024+
path: ["BETTERSTACK_INCIDENT_WEBHOOK_SECRET"],
2025+
message: "BETTERSTACK_INCIDENT_WEBHOOK_SECRET is required when INCIDENT_NOTIFY_ENABLED=1",
2026+
});
2027+
}
2028+
20132029
const presets = new Set(env.COMPUTE_TEMPLATE_MACHINE_PRESETS);
20142030
for (const required of env.COMPUTE_TEMPLATE_MACHINE_PRESETS_REQUIRED) {
20152031
if (!presets.has(required)) {
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import { type ActionFunctionArgs, json } from "@remix-run/server-runtime";
2+
import { createHash, timingSafeEqual } from "node:crypto";
3+
import { env } from "~/env.server";
4+
import {
5+
IncidentWebhookSchema,
6+
isCustomerNotifiableEvent,
7+
normalizeIncidentUpdate,
8+
} from "~/services/betterstack/incidentWebhook";
9+
import { logger } from "~/services/logger.server";
10+
import { alertsWorker } from "~/v3/alertsWorker.server";
11+
12+
// Inbound status-page webhook. BetterStack can't send custom headers, so we
13+
// auth via a `?token=` shared secret (redacted from logs at ingress). 404 when
14+
// disabled or unconfigured. We 200 fast and hand off to the worker; the enqueue
15+
// is deduped on the update id since BetterStack redelivers on failure.
16+
/**
 * Remix action for the inbound BetterStack status-page webhook.
 *
 * Responses:
 * - 405 for anything other than POST.
 * - 404 when INCIDENT_NOTIFY_ENABLED is not "1" or no webhook secret is set.
 * - 401 when the `token` query param does not match the secret.
 * - 400 when the body is not JSON or fails IncidentWebhookSchema.
 * - 200 `{ ignored: true, reason }` for non-incident events and for incidents
 *   that carry no updates.
 * - 200 `{ received: true }` once the fan-out job has been enqueued.
 */
export async function action({ request }: ActionFunctionArgs) {
  const isPost = request.method.toUpperCase() === "POST";
  if (!isPost) {
    return json({ error: "Method not allowed" }, { status: 405 });
  }

  const sharedSecret = env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET;
  const featureEnabled = env.INCIDENT_NOTIFY_ENABLED === "1";
  if (!featureEnabled || !sharedSecret) {
    return json({ error: "Not found" }, { status: 404 });
  }

  // A missing token is compared as the empty string so it fails like any other mismatch.
  const { searchParams } = new URL(request.url);
  const presentedToken = searchParams.get("token") ?? "";
  if (!secretsMatch(presentedToken, sharedSecret)) {
    return json({ error: "Invalid token" }, { status: 401 });
  }

  const bodyText = await request.text();

  let decoded: unknown;
  try {
    decoded = JSON.parse(bodyText);
  } catch {
    return json({ error: "Invalid JSON" }, { status: 400 });
  }

  const validation = IncidentWebhookSchema.safeParse(decoded);
  if (!validation.success) {
    const { issues } = validation.error;
    logger.warn("BetterStack incident webhook: invalid payload", {
      issues,
    });
    return json({ error: "Invalid payload", issues }, { status: 400 });
  }

  const event = validation.data;

  // Anything that is not an incident event is acknowledged and dropped.
  if (!isCustomerNotifiableEvent(event)) {
    return json({ ignored: true, reason: "non_incident_event" }, { status: 200 });
  }

  const update = normalizeIncidentUpdate(event);
  if (!update) {
    return json({ ignored: true, reason: "no_updates" }, { status: 200 });
  }

  // The job id is derived from the update id, so a redelivered update maps to the same id.
  await alertsWorker.enqueueOnce({
    id: `incident-notify:${update.updateId}`,
    job: "v3.fanoutIncidentNotification",
    payload: update,
  });

  return json({ received: true }, { status: 200 });
}
66+
67+
/**
 * Constant-time comparison of two secrets.
 *
 * Both inputs are reduced to SHA-256 digests first, so timingSafeEqual always
 * receives equal-length buffers regardless of the input lengths.
 */
function secretsMatch(a: string, b: string): boolean {
  const sha256 = (value: string) => createHash("sha256").update(value).digest();
  return timingSafeEqual(sha256(a), sha256(b));
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import { z } from "zod";
2+
3+
// Payload for the BetterStack status-page webhook. The endpoint is unsigned, so
4+
// the route auths via a shared secret in the URL.
5+
6+
// BetterStack sends ids as numbers; accept either a string or a number and
// normalize to string (via String()) so downstream code only handles strings.
7+
const IdSchema = z.union([z.string(), z.number()]).transform((v) => String(v));
8+
9+
// One entry of `incident.incident_updates`. Only `id` is required; `body` and
// the timestamps may be null or absent and are kept as raw strings.
export const IncidentUpdateSchema = z.object({
10+
id: IdSchema,
11+
status_report_id: IdSchema.optional(),
12+
body: z.string().nullish(),
13+
created_at: z.string().nullish(),
14+
updated_at: z.string().nullish(),
15+
});
16+
17+
// Top-level webhook body. Only `event_type` is required; `page` and `incident`
// are both optional, and `incident_updates` defaults to an empty array.
export const IncidentWebhookSchema = z.object({
18+
event_type: z.string(),
19+
page: z
20+
.object({
21+
id: IdSchema.optional(),
22+
status_indicator: z.string().nullish(),
23+
status_description: z.string().nullish(),
24+
})
25+
.optional(),
26+
// Optional so non-incident callbacks (maintenance/component) parse and are
27+
// ignored instead of 400ing.
28+
incident: z
29+
.object({
30+
id: IdSchema,
31+
name: z.string().nullish(),
32+
created_at: z.string().nullish(),
33+
updated_at: z.string().nullish(),
34+
shortlink: z.string().nullish(),
35+
incident_updates: z.array(IncidentUpdateSchema).default([]),
36+
})
37+
.optional(),
38+
});
39+
40+
// Type inferred from IncidentWebhookSchema (ids are strings after the IdSchema transform).
export type IncidentWebhook = z.infer<typeof IncidentWebhookSchema>;
41+
42+
// Runtime schema for a normalized incident update. Declares the same fields as
// the NormalizedIncidentUpdate type; keep the two in sync when changing either.
export const NormalizedIncidentUpdateSchema = z.object({
43+
incidentId: z.string(),
44+
updateId: z.string(),
45+
name: z.string(),
46+
statusIndicator: z.string(),
47+
body: z.string(),
48+
shortlink: z.string().nullable(),
49+
updatedAt: z.string().nullable(),
50+
});
51+
52+
/**
 * A webhook payload reduced to a single update, as returned by
 * normalizeIncidentUpdate. `shortlink` and `updatedAt` are null when the
 * payload does not provide them.
 */
export type NormalizedIncidentUpdate = {
53+
incidentId: string;
54+
/** The specific update id — our idempotency key. */
55+
updateId: string;
56+
name: string;
57+
/** operational | degraded | downtime | maintenance */
58+
statusIndicator: string;
59+
body: string;
60+
shortlink: string | null;
61+
updatedAt: string | null;
62+
};
63+
64+
/**
 * Whether a webhook payload should notify customers.
 *
 * True only when `event_type` is exactly "incident" and the payload carries an
 * `incident` object; every other event type is ignored.
 */
export function isCustomerNotifiableEvent(payload: IncidentWebhook): boolean {
  if (payload.event_type !== "incident") {
    return false;
  }
  return Boolean(payload.incident);
}
68+
69+
/**
 * Reduce the webhook to its most recent update.
 *
 * "Most recent" is the update with the greatest `created_at`, so the result
 * does not depend on the order BetterStack sends updates in. An update whose
 * `created_at` is missing or unparseable ranks as epoch 0, and ties keep the
 * earliest entry in payload order.
 *
 * @param payload - A payload already validated by IncidentWebhookSchema.
 * @returns The normalized update, or null when the payload has no `incident`
 *   or the incident has no updates.
 */
export function normalizeIncidentUpdate(payload: IncidentWebhook): NormalizedIncidentUpdate | null {
  const incident = payload.incident;
  if (!incident) {
    return null;
  }

  const [first, ...rest] = incident.incident_updates;
  if (!first) {
    return null;
  }

  // Date.parse returns NaN for malformed input. Feeding NaN into a sort
  // comparator makes the ordering implementation-defined (a newer update could
  // lose to an older one), so coerce it to 0 and pick the max with a plain scan.
  const toMillis = (value: string | null | undefined): number => {
    const millis = value ? Date.parse(value) : 0;
    return Number.isNaN(millis) ? 0 : millis;
  };

  let mostRecent = first;
  let mostRecentMillis = toMillis(first.created_at);
  for (const candidate of rest) {
    const candidateMillis = toMillis(candidate.created_at);
    // Strictly greater: on a tie the earlier entry wins.
    if (candidateMillis > mostRecentMillis) {
      mostRecent = candidate;
      mostRecentMillis = candidateMillis;
    }
  }

  return {
    incidentId: incident.id,
    updateId: mostRecent.id,
    // `||` (not `??`) on purpose: blank or whitespace-only strings take the fallback too.
    name: incident.name?.trim() || "Service incident",
    statusIndicator: payload.page?.status_indicator?.trim() || "downtime",
    body: mostRecent.body?.trim() || "",
    shortlink: incident.shortlink?.trim() || null,
    updatedAt: mostRecent.created_at ?? incident.updated_at ?? null,
  };
}

apps/webapp/app/utils/redactUrl.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Credential query params redacted from logs and traces (some webhooks can only
// auth via a URL token).
export const SENSITIVE_QUERY_PARAMS = ["token", "secret", "access_token", "api_key"];

/**
 * Replace the values of sensitive query params with a `[redacted]` placeholder.
 *
 * Accepts absolute or path+query URLs and never throws. Param names are matched
 * case-sensitively against SENSITIVE_QUERY_PARAMS.
 *
 * When nothing is redacted the input is returned unchanged. When something is
 * redacted the query string is re-serialized by `URLSearchParams`, so the
 * placeholder comes out percent-encoded (`%5Bredacted%5D`) and the remaining
 * params may be re-encoded. A `#fragment` that follows the query is kept
 * verbatim instead of being parsed as part of the last param's value.
 *
 * @param url - The URL (or path+query) to sanitize.
 * @returns The URL with sensitive query values replaced.
 */
export function redactSensitiveQueryParams(url: string): string {
  const queryStart = url.indexOf("?");
  if (queryStart === -1) {
    return url;
  }

  // Only a '#' after the '?' ends the query; everything from there on is the
  // fragment and must not be fed to URLSearchParams.
  const fragmentStart = url.indexOf("#", queryStart);
  const queryEnd = fragmentStart === -1 ? url.length : fragmentStart;

  try {
    const params = new URLSearchParams(url.slice(queryStart + 1, queryEnd));
    let didRedact = false;
    for (const key of SENSITIVE_QUERY_PARAMS) {
      if (params.has(key)) {
        // set() also collapses repeated occurrences of the key into one entry.
        params.set(key, "[redacted]");
        didRedact = true;
      }
    }
    if (!didRedact) {
      return url;
    }
    return `${url.slice(0, queryStart)}?${params.toString()}${url.slice(queryEnd)}`;
  } catch {
    return url;
  }
}

apps/webapp/app/v3/alertsWorker.server.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,17 @@ import { z } from "zod";
44
import { env } from "~/env.server";
55
import { logger } from "~/services/logger.server";
66
import { singleton } from "~/utils/singleton";
7+
import { NormalizedIncidentUpdateSchema } from "~/services/betterstack/incidentWebhook";
78
import { DeliverAlertService } from "./services/alerts/deliverAlert.server";
89
import { DeliverErrorGroupAlertService } from "./services/alerts/deliverErrorGroupAlert.server";
910
import { ErrorAlertEvaluator } from "./services/alerts/errorAlertEvaluator.server";
11+
import { deliverIncidentToDiscord } from "./services/alerts/incidentNotifications/deliverDiscord.server";
12+
import {
13+
deliverIncidentEmailPage,
14+
deliverIncidentEmailToRecipient,
15+
} from "./services/alerts/incidentNotifications/deliverEmail.server";
16+
import { deliverIncidentToSlack } from "./services/alerts/incidentNotifications/deliverSlack.server";
17+
import { fanoutIncidentNotification } from "./services/alerts/incidentNotifications/fanout.server";
1018
import { PerformDeploymentAlertsService } from "./services/alerts/performDeploymentAlerts.server";
1119
import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server";
1220

@@ -93,6 +101,52 @@ function initializeWorker() {
93101
},
94102
logErrors: true,
95103
},
104+
"v3.fanoutIncidentNotification": {
105+
schema: NormalizedIncidentUpdateSchema,
106+
visibilityTimeoutMs: 30_000,
107+
retry: {
108+
maxAttempts: 3,
109+
},
110+
logErrors: true,
111+
},
112+
"v3.deliverIncidentSlack": {
113+
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
114+
visibilityTimeoutMs: 60_000,
115+
retry: {
116+
maxAttempts: 3,
117+
},
118+
logErrors: true,
119+
},
120+
"v3.deliverIncidentDiscord": {
121+
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
122+
visibilityTimeoutMs: 30_000,
123+
retry: {
124+
maxAttempts: 3,
125+
},
126+
logErrors: true,
127+
},
128+
"v3.deliverIncidentEmail": {
129+
schema: z.object({
130+
update: NormalizedIncidentUpdateSchema,
131+
cursor: z.string().nullable(),
132+
}),
133+
visibilityTimeoutMs: 60_000,
134+
retry: {
135+
maxAttempts: 3,
136+
},
137+
logErrors: true,
138+
},
139+
"v3.deliverIncidentEmailRecipient": {
140+
schema: z.object({
141+
update: NormalizedIncidentUpdateSchema,
142+
recipient: z.object({ userId: z.string(), email: z.string() }),
143+
}),
144+
visibilityTimeoutMs: 30_000,
145+
retry: {
146+
maxAttempts: 3,
147+
},
148+
logErrors: true,
149+
},
96150
},
97151
concurrency: {
98152
workers: env.ALERTS_WORKER_CONCURRENCY_WORKERS,
@@ -126,6 +180,21 @@ function initializeWorker() {
126180
const service = new DeliverErrorGroupAlertService();
127181
await service.call(payload);
128182
},
183+
"v3.fanoutIncidentNotification": async ({ payload }) => {
184+
await fanoutIncidentNotification(payload);
185+
},
186+
"v3.deliverIncidentSlack": async ({ payload }) => {
187+
await deliverIncidentToSlack(payload.update);
188+
},
189+
"v3.deliverIncidentDiscord": async ({ payload }) => {
190+
await deliverIncidentToDiscord(payload.update);
191+
},
192+
"v3.deliverIncidentEmail": async ({ payload }) => {
193+
await deliverIncidentEmailPage(payload);
194+
},
195+
"v3.deliverIncidentEmailRecipient": async ({ payload }) => {
196+
await deliverIncidentEmailToRecipient(payload);
197+
},
129198
},
130199
});
131200

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import { env } from "~/env.server";
2+
import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook";
3+
import { logger } from "~/services/logger.server";
4+
import { buildDiscordPayload } from "./messages";
5+
6+
/**
 * Post an incident update to the configured Discord webhook.
 *
 * Does nothing (apart from a debug log) when
 * INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL is unset. The request is aborted after
 * 10 seconds.
 *
 * @throws Error when Discord answers with a non-2xx status; the message carries
 *   the status code and up to 200 characters of the response body.
 */
export async function deliverIncidentToDiscord(update: NormalizedIncidentUpdate): Promise<void> {
  const { INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL: discordUrl } = env;
  if (!discordUrl) {
    logger.debug("Incident Discord delivery skipped: no webhook URL configured");
    return;
  }

  const res = await fetch(discordUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildDiscordPayload(update)),
    signal: AbortSignal.timeout(10_000),
  });

  if (res.ok) {
    logger.info("Incident Discord delivery complete", { updateId: update.updateId });
    return;
  }

  // Reading the error body is best-effort; fall back to an empty detail.
  const detail = await res.text().catch(() => "");
  throw new Error(`Discord webhook returned ${res.status}: ${detail.slice(0, 200)}`);
}

0 commit comments

Comments
 (0)