From 57af0047faaffbca54d064ed6eb6f0d84742403a Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Mon, 1 Jun 2026 17:57:02 +0200 Subject: [PATCH 1/6] read breadcrumbs from ring file --- src/backends/native/sentry_crash_context.h | 2 + src/backends/native/sentry_crash_daemon.c | 98 +++++++++++++++++++--- src/backends/sentry_backend_native.c | 50 +++++------ 3 files changed, 116 insertions(+), 34 deletions(-) diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index e5d0bd63e..e2dcd9a9b 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -289,6 +289,8 @@ typedef struct { uint64_t shutdown_timeout; uint64_t transfer_timeout; bool system_crash_reporter_enabled; + uint32_t max_breadcrumbs; // Breadcrumb cap, so the daemon merges the ring + // files with the same limit the app enforced // Atomic user consent (sentry_user_consent_t), updated whenever user // consent changes so the daemon can honor it at crash time. diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index 7be96c468..af31d3566 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2118,22 +2118,89 @@ build_stacktrace_from_ctx(const sentry_crash_context_t *ctx) return build_stacktrace_for_thread(ctx, SIZE_MAX); } +/** + * Reads one breadcrumb ring file the crashing process appended on its hot path + * (concatenated msgpack values) into a breadcrumb list. Returns null if the + * file is absent or empty. + */ +static sentry_value_t +read_breadcrumb_ring_file(const sentry_path_t *run_folder, const char *name) +{ + if (!run_folder) { + return sentry_value_new_null(); + } + sentry_path_t *path = sentry__path_join_str(run_folder, name); + if (!path) { + return sentry_value_new_null(); + } + size_t size = 0; + char *buf = sentry__path_read_to_buffer(path, &size); + sentry__path_free(path); + if (!buf || size == 0) { + sentry_free(buf); + return sentry_value_new_null(); + } + sentry_value_t list = sentry__value_from_msgpack(buf, size); + sentry_free(buf); + // `sentry__value_from_msgpack` only builds a list when the file holds 2+ + // concatenated values; a file with a single breadcrumb decodes to a bare + // object. Wrap it so the merge step (which ignores non-lists) keeps it. + if (sentry_value_get_type(list) == SENTRY_VALUE_TYPE_OBJECT) { + sentry_value_t wrapper = sentry_value_new_list(); + sentry_value_append(wrapper, list); + return wrapper; + } + return list; +} + +/** + * Assembles the crash event's breadcrumbs from the two ring files the crashing + * process appended one-at-a-time, merges them in timestamp order, keeps the + * newest `max_breadcrumbs`, and attaches them to `event`. This is what keeps + * breadcrumb persistence off the per-mutation scope-flush path - the app only + * ever appends a single breadcrumb, and the daemon does the assembly here. + * Mirrors the crashpad backend's `report_to_envelope`. + */ +static void +apply_breadcrumbs_from_ring_files(sentry_value_t event, + const sentry_path_t *run_folder, const sentry_crash_context_t *ctx) +{ + sentry_value_t b1 + = read_breadcrumb_ring_file(run_folder, "__sentry-breadcrumb1"); + sentry_value_t b2 + = read_breadcrumb_ring_file(run_folder, "__sentry-breadcrumb2"); + size_t max = ctx && ctx->max_breadcrumbs ? ctx->max_breadcrumbs + : SENTRY_BREADCRUMBS_MAX; + sentry_value_t merged = sentry__value_merge_breadcrumbs(b1, b2, max); + sentry_value_decref(b1); + sentry_value_decref(b2); + // Overwrite any breadcrumbs the base event may carry: the ring files are the + // single source of truth, so this is idempotent and never duplicates. + if (sentry_value_get_type(merged) == SENTRY_VALUE_TYPE_LIST) { + sentry_value_set_by_key(event, "breadcrumbs", merged); + } else { + sentry_value_decref(merged); + } +} + /** * Build a native event from the scope-complete base event, adding the - * caller-specified framing (level, mechanism) plus threads and debug_meta. - * The base event (contexts, tags, user, breadcrumbs, ...) is identical - * regardless of event type; the caller states what this event is. + * caller-specified framing (level, mechanism) plus threads, breadcrumbs (read + * from the ring files), and debug_meta. The base event (contexts, tags, user, + * ...) is identical regardless of event type; the caller states what this + * event is. * * @param ctx Crash context * @param event_file_path Path to base event file from parent process + * @param run_folder Run directory holding the breadcrumb ring files * @param level Event level (e.g. "fatal") * @param mechanism_type Exception mechanism type (e.g. "signalhandler") * @param handled Whether the mechanism was handled */ static sentry_value_t build_native_event(const sentry_crash_context_t *ctx, - const char *event_file_path, const char *level, const char *mechanism_type, - bool handled) + const char *event_file_path, const sentry_path_t *run_folder, + const char *level, const char *mechanism_type, bool handled) { // Read base event from parent's file sentry_value_t event = sentry_value_new_null(); @@ -2155,6 +2222,10 @@ build_native_event(const sentry_crash_context_t *ctx, event = sentry_value_new_event(); } + // Assemble breadcrumbs from the ring files (the base event carries none - + // the app keeps them off the scope-flush hot path). + apply_breadcrumbs_from_ring_files(event, run_folder, ctx); + // Set platform to native sentry_value_set_by_key( event, "platform", sentry_value_new_string("native")); @@ -2485,7 +2556,7 @@ write_envelope_with_native_stacktrace(const sentry_options_t *options, SENTRY_DEBUGF("write_envelope_with_native_stacktrace: minidump_path=%s", minidump_path ? minidump_path : "(null)"); sentry_value_t event = build_native_event( - ctx, event_file_path, "fatal", "signalhandler", false); + ctx, event_file_path, run_folder, "fatal", "signalhandler", false); // Serialize event to JSON size_t event_size = 0; @@ -2734,21 +2805,28 @@ write_envelope_with_minidump(const sentry_options_t *options, const char *event_msgpack_path, const char *minidump_path, sentry_path_t *run_folder) { - // Read event JSON data + // Read the base event, merge in the breadcrumbs from the ring files (the + // base event carries none), and re-serialize. Unlike the native-stacktrace + // path this mode otherwise streams the event verbatim, so we have to + // round-trip through a value to attach breadcrumbs. size_t event_size = 0; char *event_json = NULL; char *event_id = NULL; sentry_path_t *ev_path = sentry__path_from_str(event_msgpack_path); if (ev_path) { - event_json = sentry__path_read_to_buffer(ev_path, &event_size); + size_t base_size = 0; + char *base_json = sentry__path_read_to_buffer(ev_path, &base_size); sentry__path_free(ev_path); - if (event_json && event_size > 0) { + if (base_json && base_size > 0) { sentry_value_t event - = sentry__value_from_json(event_json, event_size); + = sentry__value_from_json(base_json, base_size); + apply_breadcrumbs_from_ring_files(event, run_folder, ctx); event_id = sentry__string_clone(sentry_value_as_string( sentry_value_get_by_key(event, "event_id"))); + event_json = sentry__value_to_json(event, &event_size); sentry_value_decref(event); } + sentry_free(base_json); } // Open envelope file for writing diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index f2b391002..e4d080dda 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -309,6 +309,7 @@ native_backend_startup( ctx->http_retry = options->http_retry; ctx->shutdown_timeout = options->shutdown_timeout; ctx->transfer_timeout = options->transfer_timeout; + ctx->max_breadcrumbs = (uint32_t)options->max_breadcrumbs; sentry__atomic_store( &ctx->user_consent, sentry__atomic_fetch(&options->run->user_consent)); @@ -806,18 +807,19 @@ ensure_device_arch(sentry_value_t event) // fallback. Shared by the continuous scope flush and the crash handler so both // write an identical base regardless of which one wins the race. // -// `mode` controls the expensive, list-shaped parts. The crash handler passes -// SENTRY_SCOPE_BREADCRUMBS to capture them at crash time, but the continuous -// flush passes SENTRY_SCOPE_NONE: it runs on *every* scope mutation, so folding -// the breadcrumb buffer in there would re-serialize the whole ring on every -// set_tag/set_context/... - prohibitive on a hot path such as a 60fps main -// thread. +// Breadcrumbs are deliberately excluded (SENTRY_SCOPE_NONE): they are persisted +// incrementally to the breadcrumb ring files via `add_breadcrumb_func` and +// assembled by the daemon at crash time (see the daemon's +// `apply_breadcrumbs_from_ring_files`). Folding them in here would re-serialize +// the whole breadcrumb buffer on every scope mutation - prohibitive on a hot +// path such as a 60fps main thread. This mirrors the crashpad backend's +// `flush_scope_to_event`. static void -apply_scope(sentry_value_t event, const sentry_options_t *options, - sentry_scope_mode_t mode) +apply_scope(sentry_value_t event, const sentry_options_t *options) { SENTRY_WITH_SCOPE (scope) { - sentry__scope_apply_to_event(scope, options, event, mode); + sentry__scope_apply_to_event( + scope, options, event, SENTRY_SCOPE_NONE); } #if defined(SENTRY_PLATFORM_WINDOWS) ensure_device_arch(event); @@ -843,15 +845,13 @@ native_backend_flush_scope( // Keep the on-disk base event current, so the daemon has the full scope // even if a crash beats the in-process handler to the file. Breadcrumbs are - // deliberately excluded here (SENTRY_SCOPE_NONE): they are flushed - // incrementally to the breadcrumb ring files and the crash handler captures - // them at crash time. This keeps the per-mutation flush off the breadcrumb - // serialization cost. + // not part of this (see apply_scope) - the daemon merges them from the ring + // files at crash time. sentry_value_t event = sentry_value_new_object(); // Default to `FATAL` for all paths, i.e. minidump mode. sentry_value_set_by_key( event, "level", sentry__value_new_level(SENTRY_LEVEL_FATAL)); - apply_scope(event, options, SENTRY_SCOPE_NONE); + apply_scope(event, options); size_t json_len = 0; char *json_str = sentry__value_to_json(event, &json_len); @@ -890,18 +890,22 @@ native_backend_add_breadcrumb(sentry_backend_t *backend, return; } - // Serialize to JSON (so it can be deserialized on next start) - size_t json_len = 0; - char *json_str = sentry__value_to_json(breadcrumb, &json_len); - if (!json_str) { + // Append as msgpack, matching the crashpad backend. msgpack values are + // self-delimiting, so the daemon can read the concatenated ring file back + // into a list via `sentry__value_from_msgpack`. This is the only breadcrumb + // persistence on the hot path: one serialize + one append per breadcrumb, + // never a full scope re-serialization. + size_t mpack_size = 0; + char *mpack = sentry_value_to_msgpack(breadcrumb, &mpack_size); + if (!mpack) { return; } int rv = first_breadcrumb - ? sentry__path_write_buffer(breadcrumb_file, json_str, json_len) - : sentry__path_append_buffer(breadcrumb_file, json_str, json_len); + ? sentry__path_write_buffer(breadcrumb_file, mpack, mpack_size) + : sentry__path_append_buffer(breadcrumb_file, mpack, mpack_size); - sentry_free(json_str); + sentry_free(mpack); if (rv != 0) { SENTRY_WARN("failed to write breadcrumb"); @@ -1034,9 +1038,7 @@ native_backend_except(sentry_backend_t *backend, const sentry_ucontext_t *uctx) } if (should_handle) { - // At crash time we capture breadcrumbs (unlike the continuous - // flush) - this is the process's last chance to record them. - apply_scope(event, options, SENTRY_SCOPE_BREADCRUMBS); + apply_scope(event, options); #ifndef SENTRY_SCREENSHOT_NONE // The screenshot is captured by the daemon out-of-process, so From b157d91f9308f7b62c0237d2129d7f670e149d02 Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Mon, 1 Jun 2026 18:41:47 +0200 Subject: [PATCH 2/6] minified changes here too --- src/backends/native/sentry_crash_context.h | 3 +-- src/backends/native/sentry_crash_daemon.c | 20 ++++---------------- src/backends/sentry_backend_native.c | 4 +--- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index e2dcd9a9b..b6c985cd3 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -289,8 +289,7 @@ typedef struct { uint64_t shutdown_timeout; uint64_t transfer_timeout; bool system_crash_reporter_enabled; - uint32_t max_breadcrumbs; // Breadcrumb cap, so the daemon merges the ring - // files with the same limit the app enforced + uint32_t max_breadcrumbs; // Atomic user consent (sentry_user_consent_t), updated whenever user // consent changes so the daemon can honor it at crash time. diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index af0d0b351..1873ed73a 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2120,8 +2120,7 @@ build_stacktrace_from_ctx(const sentry_crash_context_t *ctx) /** * Reads one breadcrumb ring file the crashing process appended on its hot path - * (concatenated msgpack values) into a breadcrumb list. Returns null if the - * file is absent or empty. + * into a breadcrumb list. Returns null if the file is absent or empty. */ static sentry_value_t read_breadcrumb_ring_file(const sentry_path_t *run_folder, const char *name) @@ -2156,9 +2155,7 @@ read_breadcrumb_ring_file(const sentry_path_t *run_folder, const char *name) /** * Assembles the crash event's breadcrumbs from the two ring files the crashing * process appended one-at-a-time, merges them in timestamp order, keeps the - * newest `max_breadcrumbs`, and attaches them to `event`. This is what keeps - * breadcrumb persistence off the per-mutation scope-flush path - the app only - * ever appends a single breadcrumb, and the daemon does the assembly here. + * newest `max_breadcrumbs`, and attaches them to `event`. * Mirrors the crashpad backend's `report_to_envelope`. */ static void @@ -2184,11 +2181,6 @@ apply_breadcrumbs_from_ring_files(sentry_value_t event, } /** - * Build a native event from the scope-complete base event, adding the - * caller-specified framing (level, mechanism) plus threads, breadcrumbs (read - * from the ring files), and debug_meta. The base event (contexts, tags, user, - * ...) is identical regardless of event type; the caller states what this - * event is. * Build a native event and set the level, mechanism, and handled state * * @param ctx Crash context @@ -2223,8 +2215,6 @@ build_native_event(const sentry_crash_context_t *ctx, event = sentry_value_new_event(); } - // Assemble breadcrumbs from the ring files (the base event carries none - - // the app keeps them off the scope-flush hot path). apply_breadcrumbs_from_ring_files(event, run_folder, ctx); // Set platform to native @@ -2806,10 +2796,8 @@ write_envelope_with_minidump(const sentry_options_t *options, const char *event_msgpack_path, const char *minidump_path, sentry_path_t *run_folder) { - // Read the base event, merge in the breadcrumbs from the ring files (the - // base event carries none), and re-serialize. Unlike the native-stacktrace - // path this mode otherwise streams the event verbatim, so we have to - // round-trip through a value to attach breadcrumbs. + // Read the base event, merge in the breadcrumbs from the ring files, + // re-serialize. size_t event_size = 0; char *event_json = NULL; char *event_id = NULL; diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 682bdbbab..df87cad8e 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -870,9 +870,7 @@ native_backend_add_breadcrumb(sentry_backend_t *backend, // Append as msgpack, matching the crashpad backend. msgpack values are // self-delimiting, so the daemon can read the concatenated ring file back - // into a list via `sentry__value_from_msgpack`. This is the only breadcrumb - // persistence on the hot path: one serialize + one append per breadcrumb, - // never a full scope re-serialization. + // into a list via `sentry__value_from_msgpack`. size_t mpack_size = 0; char *mpack = sentry_value_to_msgpack(breadcrumb, &mpack_size); if (!mpack) { From e94a0f2be0bece790807ce6c3459833b4b1eaa61 Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Wed, 20 May 2026 20:42:53 +0200 Subject: [PATCH 3/6] first iteration --- examples/example.c | 42 +++ include/sentry.h | 42 +++ src/CMakeLists.txt | 2 + src/backends/native/sentry_crash_context.h | 16 + src/backends/native/sentry_crash_daemon.c | 365 ++++++++++++++++++--- src/backends/sentry_backend_native.c | 26 ++ src/sentry_app_hang.c | 123 +++++++ src/sentry_app_hang.h | 72 ++++ src/sentry_options.c | 2 + src/sentry_options.h | 2 + tests/test_integration_native.py | 43 +++ tests/unit/CMakeLists.txt | 1 + tests/unit/test_app_hang.c | 153 +++++++++ tests/unit/tests.inc | 11 + 14 files changed, 863 insertions(+), 37 deletions(-) create mode 100644 src/sentry_app_hang.c create mode 100644 src/sentry_app_hang.h create mode 100644 tests/unit/test_app_hang.c diff --git a/examples/example.c b/examples/example.c index d3a083280..85224e254 100644 --- a/examples/example.c +++ b/examples/example.c @@ -612,6 +612,22 @@ run_threads(thread_func_t func) } #endif +#if defined(SENTRY_PLATFORM_WINDOWS) +static unsigned __stdcall +app_hang_demo_thread(void *arg) +{ + (void)arg; + /* Heartbeat for 500 ms to latch this thread as the target. */ + for (int i = 0; i < 10; i++) { + sentry_app_hang_heartbeat(); + Sleep(50); + } + /* Freeze for 3x the configured timeout (3000 ms). */ + Sleep(3000); + return 0; +} +#endif + int main(int argc, char **argv) { @@ -879,6 +895,13 @@ main(int argc, char **argv) options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); } +#if defined(SENTRY_PLATFORM_WINDOWS) + if (has_arg(argc, argv, "app-hang")) { + sentry_options_set_app_hang_enabled(options, 1); + sentry_options_set_app_hang_timeout_ms(options, 1000); + } +#endif + // E2E test mode: generate unique test ID for event correlation char e2e_test_id[37] = { 0 }; if (has_arg(argc, argv, "e2e-test")) { @@ -890,6 +913,25 @@ main(int argc, char **argv) return EXIT_FAILURE; } +#if defined(SENTRY_PLATFORM_WINDOWS) + /* app-hang: spawn the demo thread BEFORE any other post-init work so it + * begins heartbeating immediately. The thread freezes for 3x the timeout, + * giving the daemon time to detect the hang and ship the envelope. We wait + * for it here so main does not exit before the transport has flushed. + * NOTE: this mode is intentionally exclusive – do not combine with crash/ + * abort/etc. since those would terminate the process first. */ + if (has_arg(argc, argv, "app-hang")) { + HANDLE t = (HANDLE)_beginthreadex( + NULL, 0, app_hang_demo_thread, NULL, 0, NULL); + if (t) { + WaitForSingleObject(t, INFINITE); + CloseHandle(t); + } + sentry_close(); + return EXIT_SUCCESS; + } +#endif + if (has_arg(argc, argv, "user-consent-revoke")) { sentry_user_consent_revoke(); } diff --git a/include/sentry.h b/include/sentry.h index 25416813e..f9c4ba77f 100644 --- a/include/sentry.h +++ b/include/sentry.h @@ -1697,6 +1697,48 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_attach_session_replay( SENTRY_EXPERIMENTAL_API void sentry_options_set_session_replay_duration( sentry_options_t *opts, uint32_t duration_ms); +/** + * Enable app-hang detection in the native crash backend. + * + * When enabled, the out-of-process daemon monitors a designated thread in the + * host via a shared-memory heartbeat. If the heartbeat goes stale for longer + * than the configured timeout, the daemon walks the thread's stack remotely and + * emits an `ApplicationNotResponding` event. The host process keeps running. + * + * Off by default. This setting only has an effect when using the `native` + * backend. In this initial release the feature is Windows-only; the call is a + * silent no-op on other platforms. + */ +SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_enabled( + sentry_options_t *opts, int enabled); + +/** + * Sets the heartbeat-staleness threshold (in milliseconds) used by the + * app-hang detector. Default 5000 ms. + * + * Read by the daemon once at startup; changes after `sentry_init` have no + * effect. + */ +SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_timeout_ms( + sentry_options_t *opts, uint64_t timeout_ms); + +/** + * Signal that the calling thread is alive. + * + * Call this from the thread you want monitored (typically the main / game + * thread). The first call latches the calling thread's id as the target; + * subsequent calls from the same thread refresh the heartbeat timestamp. Calls + * from any other thread are dropped — so a stray heartbeat from a worker + * thread cannot mask a frozen main thread. + * + * Cost: approximately one system call plus a relaxed 64-bit store. Safe to + * call from a per-frame hook in a game engine. + * + * No-op if app-hang detection is not enabled in options, or if the native + * backend is not active, or on non-Windows platforms. + */ +SENTRY_EXPERIMENTAL_API void sentry_app_hang_heartbeat(void); + /** * Sets the path to the crashpad handler if the crashpad backend is used. * diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6086dbaaf..a29f7e88b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,8 @@ sentry_target_sources_cwd(sentry sentry_alloc.c sentry_alloc.h + sentry_app_hang.c + sentry_app_hang.h sentry_attachment.c sentry_attachment.h sentry_backend.c diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index b6c985cd3..1f44480ef 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -326,6 +326,22 @@ typedef struct { uint32_t module_count; sentry_module_info_t modules[SENTRY_CRASH_MAX_MODULES]; + /* App-hang detection (Windows-only, native backend only). + * + * Sync model: + * - app_hang_enabled, app_hang_timeout_ms: written by host before daemon + * is signalled ready; read by daemon at startup. No further mutation. + * - app_hang_target_tid: latched once by host on first heartbeat (release + * store via InterlockedCompareExchange64). Daemon reads, never writes. + * - app_hang_last_heartbeat_ms: written on every heartbeat with a relaxed + * 64-bit store. Daemon reads with a relaxed load. Torn reads are not a + * correctness issue — the daemon compares against its remembered value + * from the previous tick. */ + bool app_hang_enabled; + uint64_t app_hang_timeout_ms; + volatile uint64_t app_hang_target_tid; + volatile uint64_t app_hang_last_heartbeat_ms; + } sentry_crash_context_t; // Shared memory size: calculated at compile-time based on actual struct size diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index 1873ed73a..bb2390ad4 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2,6 +2,7 @@ #include "minidump/sentry_minidump_writer.h" #include "sentry_alloc.h" +#include "sentry_app_hang.h" #include "sentry_attachment.h" #include "sentry_core.h" #include "sentry_crash_ipc.h" @@ -2118,6 +2119,56 @@ build_stacktrace_from_ctx(const sentry_crash_context_t *ctx) return build_stacktrace_for_thread(ctx, SIZE_MAX); } +/* Describes which kind of native event we are building. `s_crash_kind` + * drives the crash path; `s_app_hang_kind` drives the app-hang flow on + * Windows. + * + * Invariant: if `include_signal_meta` is true, `exception_type` must be NULL + * (the signal-derived path). Setting an override type AND requesting signal + * metadata is incoherent — there is no signal in the override case. + */ +typedef struct { + /* Override exception `type` string. NULL = derive from the crash signal + * (e.g. "SIGSEGV" on Unix, "EXCEPTION" on Windows). */ + const char *exception_type; + /* Override exception `value` string. Used only when `exception_type` is + * non-NULL; ignored otherwise. */ + const char *exception_value; + /* `mechanism.type` JSON value, e.g. "signalhandler" or "AppHang". */ + const char *mechanism_type; + /* `mechanism.handled` JSON value. false for fatal crashes, true for + * recoverable events like app hangs. */ + bool mechanism_handled; + /* Event `level` JSON value, e.g. "fatal" or "error". */ + const char *level; + /* Attach `mechanism.meta.signal` payload? Must be false when + * `exception_type` is non-NULL (see struct invariant). */ + bool include_signal_meta; +} sentry_native_event_kind_t; + +/* Crash-path event kind: signal-derived type/value, fatal level, unhandled. */ +static const sentry_native_event_kind_t s_crash_kind = { + .exception_type = NULL, + .exception_value = NULL, + .mechanism_type = "signalhandler", + .mechanism_handled = false, + .level = "fatal", + .include_signal_meta = true, +}; + +#if defined(SENTRY_PLATFORM_WINDOWS) +/* App-hang event kind: ANR-style, handled, error level. The per-event + * `exception_value` (freeze duration message) is filled in at capture time. */ +static const sentry_native_event_kind_t s_app_hang_kind = { + .exception_type = "ApplicationNotResponding", + .exception_value = NULL, /* filled in per-event below */ + .mechanism_type = "AppHang", + .mechanism_handled = true, + .level = "error", + .include_signal_meta = false, +}; +#endif + /** * Reads one breadcrumb ring file the crashing process appended on its hot path * into a breadcrumb list. Returns null if the file is absent or empty. @@ -2186,14 +2237,12 @@ apply_breadcrumbs_from_ring_files(sentry_value_t event, * @param ctx Crash context * @param event_file_path Path to base event file from parent process * @param run_folder Run directory holding the breadcrumb ring files - * @param level Event level (e.g. "fatal") - * @param mechanism_type Exception mechanism type (e.g. "signalhandler") - * @param handled Whether the mechanism was handled + * @param kind Event-kind descriptor controlling exception/mechanism/level */ static sentry_value_t -build_native_event(const sentry_crash_context_t *ctx, +build_native_crash_event(const sentry_crash_context_t *ctx, const char *event_file_path, const sentry_path_t *run_folder, - const char *level, const char *mechanism_type, bool handled) + const sentry_native_event_kind_t *kind) { // Read base event from parent's file sentry_value_t event = sentry_value_new_null(); @@ -2221,50 +2270,69 @@ build_native_event(const sentry_crash_context_t *ctx, sentry_value_set_by_key( event, "platform", sentry_value_new_string("native")); - sentry_value_set_by_key(event, "level", sentry_value_new_string(level)); + // Set level (varies by event kind: "fatal" for crash, "error" for app hang) + sentry_value_set_by_key( + event, "level", sentry_value_new_string(kind->level)); // Build exception - const char *signal_name = "UNKNOWN"; + /* Function-scope so exc_value (which may point into this buffer) remains + * valid after the `else` block below. Previously declared inside the + * else: out of scope by the time exc_value is read -> UB per C99 6.2.4. */ + char crash_value_buf[128]; + const char *exc_type; + const char *exc_value; + + if (kind->exception_type) { + exc_type = kind->exception_type; + exc_value = kind->exception_value ? kind->exception_value : ""; + } else { + const char *signal_name; #if defined(SENTRY_PLATFORM_UNIX) - int signal_number = ctx->platform.signum; - signal_name = get_signal_name(signal_number); + signal_name = get_signal_name(ctx->platform.signum); #elif defined(SENTRY_PLATFORM_WINDOWS) - // Exception code is used directly below as unsigned - signal_name = "EXCEPTION"; + signal_name = "EXCEPTION"; +#else + signal_name = "UNKNOWN"; #endif + exc_type = signal_name; + snprintf(crash_value_buf, sizeof(crash_value_buf), "Fatal crash: %s", + signal_name); + exc_value = crash_value_buf; + } sentry_value_t exc = sentry_value_new_object(); - sentry_value_set_by_key(exc, "type", sentry_value_new_string(signal_name)); - - char value_buf[128]; - snprintf(value_buf, sizeof(value_buf), "Fatal crash: %s", signal_name); - sentry_value_set_by_key(exc, "value", sentry_value_new_string(value_buf)); + sentry_value_set_by_key(exc, "type", sentry_value_new_string(exc_type)); + sentry_value_set_by_key(exc, "value", sentry_value_new_string(exc_value)); // Add mechanism sentry_value_t mechanism = sentry_value_new_object(); - sentry_value_set_by_key( - mechanism, "type", sentry_value_new_string(mechanism_type)); + sentry_value_set_by_key(mechanism, "type", + sentry_value_new_string(kind->mechanism_type)); sentry_value_set_by_key( mechanism, "synthetic", sentry_value_new_bool(true)); - sentry_value_set_by_key( - mechanism, "handled", sentry_value_new_bool(handled)); + sentry_value_set_by_key(mechanism, "handled", + sentry_value_new_bool(kind->mechanism_handled)); - // Add signal metadata - sentry_value_t meta = sentry_value_new_object(); - sentry_value_t signal_info = sentry_value_new_object(); + // Add signal metadata (only relevant for signal-handler/crash events) + if (kind->include_signal_meta) { + sentry_value_t meta = sentry_value_new_object(); + sentry_value_t signal_info = sentry_value_new_object(); #if defined(SENTRY_PLATFORM_WINDOWS) - // Windows exception codes are unsigned 32-bit values (e.g., 0xC0000005) - // Use uint64 to preserve the unsigned value for the symbolicator - sentry_value_set_by_key(signal_info, "number", - sentry_value_new_uint64((uint64_t)ctx->platform.exception_code)); + // Windows exception codes are unsigned 32-bit values (e.g., 0xC0000005) + // Use uint64 to preserve the unsigned value for the symbolicator + sentry_value_set_by_key(signal_info, "number", + sentry_value_new_uint64((uint64_t)ctx->platform.exception_code)); #else - sentry_value_set_by_key( - signal_info, "number", sentry_value_new_int32(signal_number)); + sentry_value_set_by_key(signal_info, "number", + sentry_value_new_int32(ctx->platform.signum)); #endif - sentry_value_set_by_key( - signal_info, "name", sentry_value_new_string(signal_name)); - sentry_value_set_by_key(meta, "signal", signal_info); - sentry_value_set_by_key(mechanism, "meta", meta); + /* By the struct invariant, include_signal_meta is only true when + * exception_type is NULL, so exc_type holds the signal name here. */ + sentry_value_set_by_key( + signal_info, "name", sentry_value_new_string(exc_type)); + sentry_value_set_by_key(meta, "signal", signal_info); + sentry_value_set_by_key(mechanism, "meta", meta); + } sentry_value_set_by_key(exc, "mechanism", mechanism); @@ -2541,13 +2609,13 @@ static bool write_envelope_with_native_stacktrace(const sentry_options_t *options, const char *envelope_path, const sentry_crash_context_t *ctx, const char *event_file_path, const char *minidump_path, - sentry_path_t *run_folder) + sentry_path_t *run_folder, const sentry_native_event_kind_t *kind) { // Build native crash event (always include threads with names) SENTRY_DEBUGF("write_envelope_with_native_stacktrace: minidump_path=%s", minidump_path ? minidump_path : "(null)"); - sentry_value_t event = build_native_event( - ctx, event_file_path, run_folder, "fatal", "signalhandler", false); + sentry_value_t event = build_native_crash_event( + ctx, event_file_path, run_folder, kind); // Serialize event to JSON size_t event_size = 0; @@ -2786,6 +2854,126 @@ write_envelope_with_native_stacktrace(const sentry_options_t *options, return true; } +#if defined(SENTRY_PLATFORM_WINDOWS) +/** + * App-hang capture path (Windows). Suspends the latched target thread just long + * enough to snapshot its CONTEXT, then builds and submits an envelope using the + * same native-stacktrace path as crashes (with an AppHang event kind). + */ +static void +capture_and_send_app_hang(const sentry_options_t *options, + sentry_crash_ipc_t *ipc, uint64_t freeze_ms) +{ + /* NOTE (race, experimental Windows-only first cut): This function reads + * and mutates shmem fields (platform.context, threads[0], crashed_tid, + * num_threads) that are also written by the host's signal handler on a + * real crash. The daemon's main loop is single-threaded and the crash + * event has wait-priority 0, so we will not enter this function with a + * pending crash notification already signalled. The remaining narrow + * window is: the host crashes WHILE this function is running, the host's + * signal handler writes to shmem mid-capture, and we then send a + * partially-overwritten event. We accept this risk for the initial + * Windows-only implementation; mitigation (state check at entry / pause + * via an additional shmem flag) is tracked as follow-up work. */ + sentry_crash_context_t *ctx = ipc->shmem; + + /* Populate modules once per session if not already done. */ + if (ctx->module_count == 0) { + capture_modules_from_process(ctx); + } + + DWORD target_tid = (DWORD)ctx->app_hang_target_tid; + + /* Suspend the target thread and capture its CONTEXT. */ + HANDLE hThread = OpenThread(THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME + | THREAD_QUERY_INFORMATION, + FALSE, target_tid); + if (!hThread) { + SENTRY_DEBUGF("app-hang: OpenThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + return; + } + + DWORD suspend_count = SuspendThread(hThread); + if (suspend_count == (DWORD)-1) { + SENTRY_DEBUGF("app-hang: SuspendThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + CloseHandle(hThread); + return; + } + + CONTEXT thread_ctx; + memset(&thread_ctx, 0, sizeof(thread_ctx)); + thread_ctx.ContextFlags = CONTEXT_FULL; + if (!GetThreadContext(hThread, &thread_ctx)) { + SENTRY_DEBUGF( + "app-hang: GetThreadContext failed: %lu", GetLastError()); + ResumeThread(hThread); + CloseHandle(hThread); + return; + } + + /* Resume immediately; we have the snapshot we need. */ + ResumeThread(hThread); + CloseHandle(hThread); + + /* Place the snapshot in the "crashed thread" slot of the context so the + * existing event builder pulls a stacktrace out for the exception + * payload and the threads block. + * + * IMPORTANT: build_stacktrace_from_ctx() calls build_stacktrace_for_thread + * with thread_idx == SIZE_MAX, which on Windows reads from + * ctx->platform.context (NOT threads[0].context). We must populate both + * so the exception stacktrace uses the captured CONTEXT instead of an + * all-zero one (PC=0 -> StackWalk64 produces no frames). */ + ctx->platform.context = thread_ctx; + ctx->crashed_tid = target_tid; + ctx->platform.num_threads = 1; + ctx->platform.threads[0].thread_id = target_tid; + ctx->platform.threads[0].context = thread_ctx; + ctx->platform.threads[0].name[0] = '\0'; + + /* Build the per-event value description with the freeze duration. */ + char value_buf[128]; + snprintf(value_buf, sizeof(value_buf), + "App hang detected. Main thread blocked for %llu ms.", + (unsigned long long)freeze_ms); + sentry_native_event_kind_t kind = s_app_hang_kind; + kind.exception_value = value_buf; + + /* Build an envelope path next to the crash one. */ + char envelope_path[SENTRY_CRASH_MAX_PATH]; + int path_len = snprintf(envelope_path, sizeof(envelope_path), + "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path, + (unsigned long)ctx->crashed_pid, + (unsigned long long)ctx->app_hang_last_heartbeat_ms); + + if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) { + SENTRY_WARN("app-hang: envelope path truncated or invalid"); + return; + } + + bool ok = write_envelope_with_native_stacktrace(options, envelope_path, + ctx, /*event_file_path=*/NULL, /*minidump_path=*/NULL, + /*run_folder=*/NULL, &kind); + if (!ok) { + SENTRY_WARN("app-hang: failed to write envelope"); + return; + } + + /* Read envelope from disk and hand to transport. */ + sentry_path_t *env_path = sentry__path_from_str(envelope_path); + if (env_path) { + sentry_envelope_t *envelope = sentry__envelope_from_path(env_path); + if (envelope && options && options->transport) { + sentry__capture_envelope(options->transport, envelope, options); + } + sentry__path_remove(env_path); + sentry__path_free(env_path); + } +} +#endif /* SENTRY_PLATFORM_WINDOWS */ + /** * Manually write a Sentry envelope with event, minidump, and attachments. * Format matches what Crashpad's Envelope class does. @@ -3278,7 +3466,8 @@ sentry__process_crash(const sentry_options_t *options, sentry_crash_ipc_t *ipc) minidump_path[0] ? minidump_path : "NULL"); envelope_written = write_envelope_with_native_stacktrace(options, envelope_path, ctx, event_path, - minidump_path[0] ? minidump_path : NULL, run_folder); + minidump_path[0] ? minidump_path : NULL, run_folder, + &s_crash_kind); } else { // Mode 0 (MINIDUMP only) SENTRY_DEBUG("Writing envelope with minidump"); @@ -3714,8 +3903,109 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, SENTRY_DEBUG("Entering main loop"); +#if defined(SENTRY_PLATFORM_WINDOWS) + /* Pre-populate crashed_pid so the app-hang path can OpenProcess(host). + * Both capture_modules_from_process and walk_stack_with_dbghelp use + * ctx->crashed_pid, which is otherwise only set by the host's crash + * handler. The crash handler will re-set this from the host context if + * a real crash occurs; that's a no-op (same value). */ + ipc->shmem->crashed_pid = (pid_t)app_pid; +#endif + // Daemon main loop bool crash_processed = false; + +#if defined(SENTRY_PLATFORM_WINDOWS) + /* App-hang detector state. Daemon-local; the daemon caches the timeout + * here so it does not race the host on subsequent shmem mutations. */ + const bool app_hang_enabled = ipc->shmem->app_hang_enabled; + const uint64_t app_hang_timeout_ms = ipc->shmem->app_hang_timeout_ms; + uint64_t last_fired_hb = 0; + int consecutive_stale_ticks = 0; + + HANDLE timer = NULL; + if (app_hang_enabled) { + timer = CreateWaitableTimer(NULL, FALSE, NULL); + if (!timer) { + SENTRY_WARNF("app-hang: CreateWaitableTimer failed: %lu", + GetLastError()); + } else { + /* Negative dueTime: relative; 100ns units; -5_000_000 = 500 ms. + * Period 500 ms. */ + LARGE_INTEGER due_time; + due_time.QuadPart = -5000000LL; + if (!SetWaitableTimer( + timer, &due_time, 500, NULL, NULL, FALSE)) { + SENTRY_WARNF("app-hang: SetWaitableTimer failed: %lu", + GetLastError()); + CloseHandle(timer); + timer = NULL; + } + } + } + + /* Wait set: index 0 = crash event, index 1 = timer (optional). */ + HANDLE wait_handles[2]; + DWORD wait_count = 1; + wait_handles[0] = ipc->event_handle; + if (timer) { + wait_handles[1] = timer; + wait_count = 2; + } + + while (true) { + DWORD result = WaitForMultipleObjects(wait_count, wait_handles, + FALSE, SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS); + + if (result == WAIT_OBJECT_0) { + /* Crash notification — identical logic to the cross-platform + * path below. */ + SENTRY_DEBUG("Event signaled, checking crash state"); + long state = sentry__atomic_fetch(&ipc->shmem->state); + if (state == SENTRY_CRASH_STATE_CRASHED && !crash_processed) { + SENTRY_DEBUG("Crash notification received, processing"); + sentry__process_crash(options, ipc); + crash_processed = true; + SENTRY_DEBUG("Crash processed, daemon exiting"); + break; + } + SENTRY_DEBUG("Spurious notification or already processed"); + } else if (timer && result == WAIT_OBJECT_0 + 1) { + /* Timer tick — evaluate app-hang state with strike accumulation. */ + sentry_crash_context_t *shctx = ipc->shmem; + const uint64_t hb = shctx->app_hang_last_heartbeat_ms; + const uint64_t now = sentry__app_hang_now_ms(); + int new_strikes = 0; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + app_hang_enabled, hb, now, app_hang_timeout_ms, + last_fired_hb, consecutive_stale_ticks, &new_strikes); + consecutive_stale_ticks = new_strikes; + if (d == SENTRY_APP_HANG_FIRE) { + capture_and_send_app_hang(options, ipc, now - hb); + /* Always advance last_fired_hb, even if capture failed — + * prevents a retry storm against a wedged thread. The next + * heartbeat advance re-arms detection naturally. */ + last_fired_hb = hb; + } + } else if (result == WAIT_TIMEOUT) { + /* Fall through to parent-liveness check below. */ + } else { + SENTRY_WARNF("daemon wait failed: %lu err=%lu", result, + GetLastError()); + break; + } + + if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { + SENTRY_DEBUG("Parent process exited without crash"); + break; + } + } + + if (timer) { + CancelWaitableTimer(timer); + CloseHandle(timer); + } +#else while (true) { // Wait for crash notification (with timeout to check parent health) bool wait_result @@ -3760,6 +4050,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, break; } } +#endif SENTRY_DEBUG("Daemon exiting"); diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index df87cad8e..6a8efe808 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -20,6 +20,7 @@ #include "sentry_alloc.h" #include "sentry_backend.h" #include "sentry_core.h" +#include "sentry_app_hang.h" #include "sentry_crash_context.h" #include "sentry_crash_daemon.h" #include "sentry_crash_handler.h" @@ -313,6 +314,18 @@ native_backend_startup( sentry__atomic_store( &ctx->user_consent, sentry__atomic_fetch(&options->run->user_consent)); + /* App-hang detection configuration. Written before the daemon is + * signalled ready, so the daemon sees consistent values at startup. + * + * NOTE: sentry__app_hang_set_shmem(ctx) is intentionally deferred until + * just before the function's successful `return 0;` below. If a later + * fallible call fails (e.g., daemon spawn) we free the IPC; registering + * the global pointer early would leave it dangling. */ + ctx->app_hang_enabled = options->app_hang_enabled; + ctx->app_hang_timeout_ms = options->app_hang_timeout_ms; + ctx->app_hang_target_tid = 0; + ctx->app_hang_last_heartbeat_ms = 0; + // Set up event and breadcrumb paths sentry_path_t *run_path = options->run->run_path; sentry_path_t *db_path = options->database_path; @@ -553,6 +566,14 @@ native_backend_startup( } #endif +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) + /* Make this shmem block visible to sentry_app_hang_heartbeat now that + * all fallible startup steps have succeeded. If any earlier step had + * failed we would have freed the IPC and returned without ever + * registering — keeping g_app_hang_shmem == NULL. */ + sentry__app_hang_set_shmem(ctx); +#endif + SENTRY_DEBUG("native backend started successfully"); return 0; } @@ -668,6 +689,11 @@ native_backend_shutdown(sentry_backend_t *backend) // Cleanup IPC if (state->ipc) { +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) + /* Clear the global heartbeat pointer before the shmem backing it goes + * away, so sentry_app_hang_heartbeat() cannot write to freed memory. */ + sentry__app_hang_set_shmem(NULL); +#endif sentry__crash_ipc_free(state->ipc); state->ipc = NULL; // Prevent use-after-free } diff --git a/src/sentry_app_hang.c b/src/sentry_app_hang.c new file mode 100644 index 000000000..65e7bdb5d --- /dev/null +++ b/src/sentry_app_hang.c @@ -0,0 +1,123 @@ +#include "sentry_app_hang.h" + +#include "sentry_options.h" + +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ + && defined(SENTRY_BACKEND_NATIVE) +# include +#endif + +sentry_app_hang_decision_t +sentry__app_hang_decide(bool enabled, uint64_t hb, uint64_t now, + uint64_t timeout_ms, uint64_t last_fired_hb, + int consecutive_stale_ticks, int *out_consecutive_stale_ticks) +{ + /* Fresh or disabled paths reset the counter. */ + if (!enabled || hb == 0) { + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if (now < hb) { + /* Torn shmem read (possible on x86 for a non-atomic 64-bit load). + * Treat as fresh — daemon will see the real value on the next tick. */ + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if ((now - hb) < timeout_ms) { + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if (hb == last_fired_hb) { + /* Already fired for this freeze. Stay quiet and hold the counter at + * zero so we re-arm cleanly once the host heartbeats again. */ + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + /* Stale and not in cooldown — accumulate a strike. */ + int new_count = consecutive_stale_ticks + 1; + *out_consecutive_stale_ticks = new_count; + if (new_count >= SENTRY_APP_HANG_STRIKES_REQUIRED) { + return SENTRY_APP_HANG_FIRE; + } + return SENTRY_APP_HANG_NO_ACTION; +} + +/* Public setters (always compiled, no platform guard — they only mutate the + * options struct). */ +void +sentry_options_set_app_hang_enabled(sentry_options_t *opts, int enabled) +{ + if (opts) { + opts->app_hang_enabled = !!enabled; + } +} + +void +sentry_options_set_app_hang_timeout_ms( + sentry_options_t *opts, uint64_t timeout_ms) +{ + if (opts) { + opts->app_hang_timeout_ms = timeout_ms; + } +} + +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ + && defined(SENTRY_BACKEND_NATIVE) + +static sentry_crash_context_t *volatile g_app_hang_shmem = NULL; + +uint64_t +sentry__app_hang_now_ms(void) +{ + ULONGLONG ticks_100ns = 0; + /* QueryUnbiasedInterruptTime is documented signal/SEH/wait-free; the + * same source is read on both sides of the IPC. */ + if (!QueryUnbiasedInterruptTime(&ticks_100ns)) { + return 0; + } + return (uint64_t)(ticks_100ns / 10000ULL); +} + +void +sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) +{ + g_app_hang_shmem = ctx; +} + +void +sentry_app_hang_heartbeat(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + DWORD current_tid = GetCurrentThreadId(); + LONG64 latched = (LONG64)ctx->app_hang_target_tid; + if (latched == 0) { + /* Try to latch this thread as the target. If another thread races + * us, the loser is dropped. */ + LONG64 prev = InterlockedCompareExchange64( + (LONG64 volatile *)&ctx->app_hang_target_tid, + (LONG64)(uint64_t)current_tid, 0); + if (prev != 0 && prev != (LONG64)(uint64_t)current_tid) { + return; + } + } else if ((DWORD)latched != current_tid) { + return; + } + + /* Relaxed 64-bit store. On x64 this is a single mov. On x86 the value + * may tear, but that is OK — see the comment in sentry_crash_context.h. */ + ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); +} + +#else /* non-Windows or Xbox */ + +void +sentry_app_hang_heartbeat(void) +{ + /* No-op on non-Windows targets in this initial cut. */ +} + +#endif diff --git a/src/sentry_app_hang.h b/src/sentry_app_hang.h new file mode 100644 index 000000000..f146280ae --- /dev/null +++ b/src/sentry_app_hang.h @@ -0,0 +1,72 @@ +#ifndef SENTRY_APP_HANG_H_INCLUDED +#define SENTRY_APP_HANG_H_INCLUDED + +#include "sentry_boot.h" + +#include +#include + +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ + && defined(SENTRY_BACKEND_NATIVE) +# include "sentry_crash_context.h" +#endif + +/** + * Decision returned by the pure decision function. Kept tiny so it can be + * exercised in unit tests without involving the daemon or shared memory. + */ +typedef enum { + SENTRY_APP_HANG_NO_ACTION = 0, + SENTRY_APP_HANG_FIRE = 1, +} sentry_app_hang_decision_t; + +/* Number of consecutive timer ticks the daemon must observe a stale + * heartbeat before firing. Smooths over brief hiccups (GC pauses, swap, OS + * scheduler quanta) at the cost of ~SENTRY_APP_HANG_STRIKES_REQUIRED-1 + * extra poll periods of detection latency. */ +#define SENTRY_APP_HANG_STRIKES_REQUIRED 3 + +/** + * Pure function: should we fire an app-hang event right now? + * + * - `enabled`: the host has app-hang detection turned on. + * - `hb`: last heartbeat timestamp (host clock; 0 means + * "never heartbeated yet"). + * - `now`: daemon's current observation of the same clock. + * - `timeout_ms`: staleness threshold. + * - `last_fired_hb`: the `hb` value the daemon last fired for; used + * as cooldown so a sustained freeze fires once. + * - `consecutive_stale_ticks`: caller-tracked count of consecutive ticks on + * which the heartbeat was observed stale. + * - `out_consecutive_stale_ticks` (out): updated counter the caller should + * store. 0 if reset, otherwise incremented. + * + * Returns SENTRY_APP_HANG_FIRE iff: enabled, hb != 0, (now - hb) >= timeout_ms, + * hb != last_fired_hb, AND the updated stale-tick counter reaches + * SENTRY_APP_HANG_STRIKES_REQUIRED. + */ +sentry_app_hang_decision_t sentry__app_hang_decide(bool enabled, uint64_t hb, + uint64_t now, uint64_t timeout_ms, uint64_t last_fired_hb, + int consecutive_stale_ticks, int *out_consecutive_stale_ticks); + +#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ + && defined(SENTRY_BACKEND_NATIVE) +/** + * Called from the native backend startup path. Stores `ctx` so that + * subsequent `sentry_app_hang_heartbeat()` calls have somewhere to write. + * Passing NULL clears the registration on backend shutdown. + * + * The pointer is stored in a `volatile` global; ordering with shmem field + * initialization is the caller's responsibility (the backend writes options + * into shmem before calling this). + */ +void sentry__app_hang_set_shmem(sentry_crash_context_t *ctx); + +/** + * Return a millisecond-resolution unbiased timestamp shared between host and + * daemon. Exposed for the daemon to call as well. + */ +uint64_t sentry__app_hang_now_ms(void); +#endif + +#endif diff --git a/src/sentry_options.c b/src/sentry_options.c index cb5bb936a..f43a82f11 100644 --- a/src/sentry_options.c +++ b/src/sentry_options.c @@ -67,6 +67,8 @@ sentry_options_new(void) opts->propagate_traceparent = false; opts->strict_trace_continuation = false; opts->crashpad_limit_stack_capture_to_sp = false; + opts->app_hang_enabled = false; + opts->app_hang_timeout_ms = 5000; opts->enable_metrics = true; opts->enable_logs = true; opts->cache_keep = SENTRY_CACHE_KEEP_NONE; diff --git a/src/sentry_options.h b/src/sentry_options.h index 6f64bba43..8696f0f05 100644 --- a/src/sentry_options.h +++ b/src/sentry_options.h @@ -51,6 +51,8 @@ struct sentry_options_s { bool propagate_traceparent; bool strict_trace_continuation; bool crashpad_limit_stack_capture_to_sp; + bool app_hang_enabled; + uint64_t app_hang_timeout_ms; sentry_cache_keep_t cache_keep; time_t cache_max_age; diff --git a/tests/test_integration_native.py b/tests/test_integration_native.py index a694d1848..6cf6292b9 100644 --- a/tests/test_integration_native.py +++ b/tests/test_integration_native.py @@ -1042,3 +1042,46 @@ def test_native_restart_on_crash(cmake, httpserver): for req in httpserver.log: envelope = Envelope.deserialize(req[0].get_data()) assert_native_crash(envelope) + + +@pytest.mark.skipif( + sys.platform != "win32", + reason="app-hang detection is Windows-only in this release", +) +def test_native_app_hang(cmake, httpserver): + """App hang detection emits exactly one ApplicationNotResponding event.""" + tmp_path = cmake(["sentry_example"], {"SENTRY_BACKEND": "native"}) + + httpserver.expect_oneshot_request("/api/123456/envelope/").respond_with_data( + "OK" + ) + + with httpserver.wait(timeout=20) as waiting: + # The example's app-hang mode heartbeats for 500 ms, then freezes for + # 3000 ms (3x the 1000 ms timeout). The daemon polls every 500 ms. + # `run` (not `run_crash`) because the example exits cleanly after the + # hang demonstration — `run_crash` expects abnormal exit. + run( + tmp_path, + "sentry_example", + ["log", "app-hang"], + env=dict(os.environ, SENTRY_DSN=make_dsn(httpserver)), + ) + assert waiting.result + + envelope = Envelope.deserialize(httpserver.log[0][0].get_data()) + event = envelope.get_event() + assert event is not None + exc = event["exception"]["values"][0] + assert exc["type"] == "ApplicationNotResponding" + assert exc["mechanism"]["type"] == "AppHang" + assert exc["mechanism"]["handled"] is True + assert exc["mechanism"]["synthetic"] is True + assert "stacktrace" in exc + frames = exc["stacktrace"]["frames"] + assert isinstance(frames, list) + assert len(frames) > 0, "stacktrace is empty — capture path may be broken" + # At least one frame should have a non-zero instruction address. + assert any( + int(f.get("instruction_addr", "0"), 16) > 0 for f in frames + ), "no frame has a non-zero instruction_addr" diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index a143fd540..98bdf747c 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -21,6 +21,7 @@ add_executable(sentry_test_unit ${SENTRY_SOURCES} main.c sentry_testsupport.h + test_app_hang.c test_attachments.c test_basic.c test_cache.c diff --git a/tests/unit/test_app_hang.c b/tests/unit/test_app_hang.c new file mode 100644 index 000000000..2ab96f649 --- /dev/null +++ b/tests/unit/test_app_hang.c @@ -0,0 +1,153 @@ +#include "sentry_app_hang.h" +#include "sentry_testsupport.h" + +#include + +SENTRY_TEST(app_hang_decide_disabled_returns_no_action) +{ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/false, /*hb=*/100, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + /* Disabled path resets the counter. */ + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_no_heartbeat_yet_returns_no_action) +{ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/0, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_fresh_heartbeat_returns_no_action_and_resets) +{ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9500, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + /* Fresh heartbeat resets the strike counter even mid-accumulation. */ + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_first_stale_tick_increments_does_not_fire) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 1); +} + +SENTRY_TEST(app_hang_decide_second_stale_tick_increments_does_not_fire) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/1, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 2); +} + +SENTRY_TEST(app_hang_decide_third_stale_tick_fires) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(new_count, 3); +} + +SENTRY_TEST(app_hang_decide_brief_hiccup_resets_strike_count) +{ + /* Simulate: 2 stale ticks, then a fresh heartbeat (counter resets), + * then 1 stale tick → must NOT fire because we lost our accumulated + * strikes when the heartbeat refreshed. */ + int after_hiccup = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9800, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &after_hiccup); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_hiccup, 0); + + int after_one_stale = -1; + d = sentry__app_hang_decide(/*enabled=*/true, /*hb=*/9800, + /*now=*/11000, /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/after_hiccup, &after_one_stale); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_one_stale, 1); +} + +SENTRY_TEST(app_hang_decide_cooldown_holds_when_hb_unchanged) +{ + /* Already fired for hb=5000. Subsequent ticks must NOT re-fire even + * if 100 more stale ticks accumulate. Counter held at 0. */ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/20000, + /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_re_arms_after_advance_then_stall) +{ + /* hb advanced past last_fired_hb → cooldown released; need 3 fresh + * strikes again. */ + int after_strike1 = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/7000, /*now=*/12000, + /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/0, &after_strike1); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_strike1, 1); + + int after_strike3 = -1; + d = sentry__app_hang_decide(/*enabled=*/true, /*hb=*/7000, + /*now=*/12000, /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/2, &after_strike3); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(after_strike3, 3); +} + +SENTRY_TEST(app_hang_decide_exact_timeout_boundary_with_third_strike_fires) +{ + /* now - hb == timeout_ms is still stale (>= semantics) AND the third + * strike has accumulated — fires. */ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(new_count, 3); +} + +SENTRY_TEST(app_hang_decide_torn_read_now_less_than_hb_resets) +{ + /* On x86 a non-atomic 64-bit load can tear, producing now < hb. The + * decision function treats this as fresh (no FIRE) and resets the + * strike counter so the next non-torn observation starts clean. */ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/10000, /*now=*/5000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} diff --git a/tests/unit/tests.inc b/tests/unit/tests.inc index 9770f857f..686f7e3e4 100644 --- a/tests/unit/tests.inc +++ b/tests/unit/tests.inc @@ -1,3 +1,14 @@ +XX(app_hang_decide_brief_hiccup_resets_strike_count) +XX(app_hang_decide_cooldown_holds_when_hb_unchanged) +XX(app_hang_decide_disabled_returns_no_action) +XX(app_hang_decide_exact_timeout_boundary_with_third_strike_fires) +XX(app_hang_decide_first_stale_tick_increments_does_not_fire) +XX(app_hang_decide_fresh_heartbeat_returns_no_action_and_resets) +XX(app_hang_decide_no_heartbeat_yet_returns_no_action) +XX(app_hang_decide_re_arms_after_advance_then_stall) +XX(app_hang_decide_second_stale_tick_increments_does_not_fire) +XX(app_hang_decide_third_stale_tick_fires) +XX(app_hang_decide_torn_read_now_less_than_hb_resets) XX(assert_sdk_name) XX(assert_sdk_user_agent) XX(assert_sdk_version) From 83759fd94708be147bb577d7988d96ce2bb545dc Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Thu, 21 May 2026 17:15:10 +0200 Subject: [PATCH 4/6] more context --- src/backends/native/sentry_crash_daemon.c | 27 +++++++++++++++++++++-- src/backends/sentry_backend_native.c | 3 ++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index bb2390ad4..fb935df6c 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2953,9 +2953,32 @@ capture_and_send_app_hang(const sentry_options_t *options, return; } + /* Reuse the scope file the host keeps up-to-date via flush_scope so the + * app-hang event carries the same scope context as a crash event: + * full contexts (os/device/gpu/app/runtime/unity/...), user, tags, + * extra, fingerprint, release/dist/env, sdk metadata, and breadcrumbs. + * The base event JSON is at ctx->event_path; the sibling run folder + * holds the `__sentry-attachments` manifest, scope attachments, + * screenshot, and session replay — all pulled in by + * write_envelope_with_native_stacktrace when run_folder is non-NULL. */ + const char *event_file_path + = ctx->event_path[0] ? ctx->event_path : NULL; + sentry_path_t *run_folder = NULL; + if (event_file_path) { + sentry_path_t *ev_path = sentry__path_from_str(event_file_path); + if (ev_path) { + run_folder = sentry__path_dir(ev_path); + sentry__path_free(ev_path); + } + } + bool ok = write_envelope_with_native_stacktrace(options, envelope_path, - ctx, /*event_file_path=*/NULL, /*minidump_path=*/NULL, - /*run_folder=*/NULL, &kind); + ctx, event_file_path, /*minidump_path=*/NULL, run_folder, &kind); + + if (run_folder) { + sentry__path_free(run_folder); + } + if (!ok) { SENTRY_WARN("app-hang: failed to write envelope"); return; diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 6a8efe808..0b8a94b37 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -844,7 +844,8 @@ native_backend_flush_scope( return; } - // Create event with current scope + // Create event with current scope. The daemon also reads this base event + // at app-hang time on Windows, so keep it current. sentry_value_t event = sentry_value_new_object(); sentry_value_set_by_key( event, "level", sentry__value_new_level(SENTRY_LEVEL_FATAL)); From c42d2ee57848c5278cf9a8a6a2e4350cc2a7a149 Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Thu, 28 May 2026 17:19:25 +0200 Subject: [PATCH 5/6] add explicit set_thread for heartbeat --- examples/example.c | 4 +++- include/sentry.h | 29 +++++++++++++++++++++++------ src/sentry_app_hang.c | 42 +++++++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/examples/example.c b/examples/example.c index 85224e254..0cf32073d 100644 --- a/examples/example.c +++ b/examples/example.c @@ -617,7 +617,9 @@ static unsigned __stdcall app_hang_demo_thread(void *arg) { (void)arg; - /* Heartbeat for 500 ms to latch this thread as the target. */ + /* Latch this thread as the target once, then heartbeat for 500 ms so the + * daemon sees a healthy baseline before the freeze. */ + sentry_app_hang_set_target_thread(); for (int i = 0; i < 10; i++) { sentry_app_hang_heartbeat(); Sleep(50); diff --git a/include/sentry.h b/include/sentry.h index f9c4ba77f..977eb92ad 100644 --- a/include/sentry.h +++ b/include/sentry.h @@ -1723,13 +1723,30 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_timeout_ms( sentry_options_t *opts, uint64_t timeout_ms); /** - * Signal that the calling thread is alive. + * Designate the calling thread as the one monitored by the app-hang detector. * - * Call this from the thread you want monitored (typically the main / game - * thread). The first call latches the calling thread's id as the target; - * subsequent calls from the same thread refresh the heartbeat timestamp. Calls - * from any other thread are dropped — so a stray heartbeat from a worker - * thread cannot mask a frozen main thread. + * Call this once, from the thread you want monitored (typically the main / + * game thread), before the first heartbeat. The latch is sticky for the + * lifetime of the SDK session: subsequent calls from any other thread are + * dropped. Calling again from the same thread is a harmless no-op. + * + * Until this is called, `sentry_app_hang_heartbeat()` is a no-op — there is + * no implicit "first caller wins" latch, so a stray heartbeat from a worker + * thread during startup cannot accidentally claim the role and silently + * disable monitoring of the real main thread. + * + * No-op if app-hang detection is not enabled in options, or if the native + * backend is not active, or on non-Windows platforms. + */ +SENTRY_EXPERIMENTAL_API void sentry_app_hang_set_target_thread(void); + +/** + * Refresh the heartbeat for the monitored thread. + * + * Call this from the thread previously designated via + * `sentry_app_hang_set_target_thread()`. Calls from any other thread, or + * before a target has been set, are dropped — so a stray heartbeat from a + * worker thread cannot mask a frozen main thread. * * Cost: approximately one system call plus a relaxed 64-bit store. Safe to * call from a per-frame hook in a game engine. diff --git a/src/sentry_app_hang.c b/src/sentry_app_hang.c index 65e7bdb5d..a530b0fbc 100644 --- a/src/sentry_app_hang.c +++ b/src/sentry_app_hang.c @@ -84,6 +84,24 @@ sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) g_app_hang_shmem = ctx; } +void +sentry_app_hang_set_target_thread(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* CAS the current TID into the latch slot iff still unset. If another + * thread races and wins, our call is silently dropped — the API contract + * is "first caller wins, idempotent for that caller". CAS (rather than a + * plain store) prevents a late call from a different thread from + * silently overwriting a prior latch. */ + DWORD current_tid = GetCurrentThreadId(); + InterlockedCompareExchange64((LONG64 volatile *)&ctx->app_hang_target_tid, + (LONG64)(uint64_t)current_tid, 0); +} + void sentry_app_hang_heartbeat(void) { @@ -92,18 +110,14 @@ sentry_app_hang_heartbeat(void) return; } + /* Refresh-only: requires a prior sentry_app_hang_set_target_thread() + * call from this thread. Drops the heartbeat if no target is latched, + * or if the latched thread is not us. The non-atomic read can tear on + * x86; in that case the compare fails and we drop a heartbeat, which + * the daemon's strike counter absorbs. */ DWORD current_tid = GetCurrentThreadId(); - LONG64 latched = (LONG64)ctx->app_hang_target_tid; - if (latched == 0) { - /* Try to latch this thread as the target. If another thread races - * us, the loser is dropped. */ - LONG64 prev = InterlockedCompareExchange64( - (LONG64 volatile *)&ctx->app_hang_target_tid, - (LONG64)(uint64_t)current_tid, 0); - if (prev != 0 && prev != (LONG64)(uint64_t)current_tid) { - return; - } - } else if ((DWORD)latched != current_tid) { + uint64_t latched = ctx->app_hang_target_tid; + if (latched == 0 || (DWORD)latched != current_tid) { return; } @@ -114,6 +128,12 @@ sentry_app_hang_heartbeat(void) #else /* non-Windows or Xbox */ +void +sentry_app_hang_set_target_thread(void) +{ + /* No-op on non-Windows targets in this initial cut. */ +} + void sentry_app_hang_heartbeat(void) { From 6a69c1237f1c0af37c40d47bd2927043f4312451 Mon Sep 17 00:00:00 2001 From: bitsandfoxes Date: Wed, 3 Jun 2026 10:47:13 +0200 Subject: [PATCH 6/6] added macos support --- examples/example.c | 39 +- src/backends/native/sentry_crash_context.h | 10 +- src/backends/native/sentry_crash_daemon.c | 500 ++++++++++++++++++++- src/backends/sentry_backend_native.c | 9 +- src/sentry_app_hang.c | 104 ++++- src/sentry_app_hang.h | 13 +- tests/test_integration_native.py | 35 +- 7 files changed, 669 insertions(+), 41 deletions(-) diff --git a/examples/example.c b/examples/example.c index 0cf32073d..a2fdf90fc 100644 --- a/examples/example.c +++ b/examples/example.c @@ -624,10 +624,38 @@ app_hang_demo_thread(void *arg) sentry_app_hang_heartbeat(); Sleep(50); } + /* Add a couple of breadcrumbs before freezing so the captured app-hang + * event carries them (the daemon reads the breadcrumb ring files the host + * writes on each sentry_add_breadcrumb). */ + sentry_add_breadcrumb( + sentry_value_new_breadcrumb(NULL, "app-hang demo: about to freeze")); + sentry_add_breadcrumb(create_debug_crumb("app-hang demo breadcrumb")); /* Freeze for 3x the configured timeout (3000 ms). */ Sleep(3000); return 0; } +#elif defined(SENTRY_PLATFORM_MACOS) +static void * +app_hang_demo_thread(void *arg) +{ + (void)arg; + /* Latch this thread as the target once, then heartbeat for 500 ms so the + * daemon sees a healthy baseline before the freeze. */ + sentry_app_hang_set_target_thread(); + for (int i = 0; i < 10; i++) { + sentry_app_hang_heartbeat(); + usleep(50 * 1000); + } + /* Add a couple of breadcrumbs before freezing so the captured app-hang + * event carries them (the daemon reads the breadcrumb ring files the host + * writes on each sentry_add_breadcrumb). */ + sentry_add_breadcrumb( + sentry_value_new_breadcrumb(NULL, "app-hang demo: about to freeze")); + sentry_add_breadcrumb(create_debug_crumb("app-hang demo breadcrumb")); + /* Freeze for 3x the configured timeout (3000 ms). */ + usleep(3000 * 1000); + return NULL; +} #endif int @@ -897,7 +925,7 @@ main(int argc, char **argv) options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); } -#if defined(SENTRY_PLATFORM_WINDOWS) +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) if (has_arg(argc, argv, "app-hang")) { sentry_options_set_app_hang_enabled(options, 1); sentry_options_set_app_hang_timeout_ms(options, 1000); @@ -915,7 +943,7 @@ main(int argc, char **argv) return EXIT_FAILURE; } -#if defined(SENTRY_PLATFORM_WINDOWS) +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) /* app-hang: spawn the demo thread BEFORE any other post-init work so it * begins heartbeating immediately. The thread freezes for 3x the timeout, * giving the daemon time to detect the hang and ship the envelope. We wait @@ -923,12 +951,19 @@ main(int argc, char **argv) * NOTE: this mode is intentionally exclusive – do not combine with crash/ * abort/etc. since those would terminate the process first. */ if (has_arg(argc, argv, "app-hang")) { +# if defined(SENTRY_PLATFORM_WINDOWS) HANDLE t = (HANDLE)_beginthreadex( NULL, 0, app_hang_demo_thread, NULL, 0, NULL); if (t) { WaitForSingleObject(t, INFINITE); CloseHandle(t); } +# else + pthread_t t; + if (0 == pthread_create(&t, NULL, app_hang_demo_thread, NULL)) { + pthread_join(t, NULL); + } +# endif sentry_close(); return EXIT_SUCCESS; } diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index 1f44480ef..55fd646fa 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -326,17 +326,19 @@ typedef struct { uint32_t module_count; sentry_module_info_t modules[SENTRY_CRASH_MAX_MODULES]; - /* App-hang detection (Windows-only, native backend only). + /* App-hang detection (Windows + macOS, native backend only). * * Sync model: * - app_hang_enabled, app_hang_timeout_ms: written by host before daemon * is signalled ready; read by daemon at startup. No further mutation. - * - app_hang_target_tid: latched once by host on first heartbeat (release - * store via InterlockedCompareExchange64). Daemon reads, never writes. + * - app_hang_target_tid: latched once by host on first heartbeat via a + * compare-exchange (InterlockedCompareExchange64 on Windows, + * atomic_compare_exchange_strong on macOS). Daemon reads, never writes. * - app_hang_last_heartbeat_ms: written on every heartbeat with a relaxed * 64-bit store. Daemon reads with a relaxed load. Torn reads are not a * correctness issue — the daemon compares against its remembered value - * from the previous tick. */ + * from the previous tick. (On 64-bit Windows/macOS the aligned store is + * atomic; the tear note applies to 32-bit Windows.) */ bool app_hang_enabled; uint64_t app_hang_timeout_ms; volatile uint64_t app_hang_target_tid; diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index fb935df6c..78589492c 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -46,6 +46,9 @@ # if defined(SENTRY_PLATFORM_MACOS) # include # include +# include +# include +# include # include # endif #elif defined(SENTRY_PLATFORM_WINDOWS) @@ -2121,7 +2124,7 @@ build_stacktrace_from_ctx(const sentry_crash_context_t *ctx) /* Describes which kind of native event we are building. `s_crash_kind` * drives the crash path; `s_app_hang_kind` drives the app-hang flow on - * Windows. + * Windows and macOS. * * Invariant: if `include_signal_meta` is true, `exception_type` must be NULL * (the signal-derived path). Setting an override type AND requesting signal @@ -2156,7 +2159,7 @@ static const sentry_native_event_kind_t s_crash_kind = { .include_signal_meta = true, }; -#if defined(SENTRY_PLATFORM_WINDOWS) +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) /* App-hang event kind: ANR-style, handled, error level. The per-event * `exception_value` (freeze duration message) is filled in at capture time. */ static const sentry_native_event_kind_t s_app_hang_kind = { @@ -2995,7 +2998,444 @@ capture_and_send_app_hang(const sentry_options_t *options, sentry__path_free(env_path); } } -#endif /* SENTRY_PLATFORM_WINDOWS */ + +#elif defined(SENTRY_PLATFORM_MACOS) + +/* Read `size` bytes at `addr` from another task into `buf`. Mirrors the + * minidump writer's read_task_memory (mach_vm_read_overwrite). */ +static kern_return_t +app_hang_read_task_memory( + task_t task, mach_vm_address_t addr, void *buf, mach_vm_size_t size) +{ + mach_vm_size_t got = 0; + kern_return_t kr + = mach_vm_read_overwrite(task, addr, size, (mach_vm_address_t)buf, &got); + if (kr == KERN_SUCCESS && got != size) { + return KERN_FAILURE; + } + return kr; +} + +/* Enumerate the host's loaded dyld images out-of-process via the donated/ + * task_for_pid task port and populate ctx->modules[] (base, __TEXT vmsize, + * UUID, name). This is the out-of-process analogue of the in-process + * _dyld_image_count() loop the crash signal handler runs — needed for app + * hangs because no signal handler runs to capture modules, and the daemon's + * own dyld images are unrelated to the host's. Best-effort: on any read + * failure we stop and keep whatever was gathered. */ +static void +app_hang_capture_modules(task_t task, sentry_crash_context_t *ctx) +{ + ctx->module_count = 0; + + /* Locate dyld_all_image_infos in the target task. */ + struct task_dyld_info dyld_info; + mach_msg_type_number_t count = TASK_DYLD_INFO_COUNT; + if (task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &count) + != KERN_SUCCESS) { + SENTRY_DEBUG("app-hang: task_info(TASK_DYLD_INFO) failed"); + return; + } + + struct dyld_all_image_infos all_infos; + if (app_hang_read_task_memory(task, + (mach_vm_address_t)dyld_info.all_image_info_addr, &all_infos, + sizeof(all_infos)) + != KERN_SUCCESS) { + SENTRY_DEBUG("app-hang: failed to read dyld_all_image_infos"); + return; + } + + uint32_t image_count = all_infos.infoArrayCount; + if (image_count > SENTRY_CRASH_MAX_MODULES) { + image_count = SENTRY_CRASH_MAX_MODULES; + } + + for (uint32_t i = 0; + i < image_count && ctx->module_count < SENTRY_CRASH_MAX_MODULES; i++) { + /* Read one dyld_image_info entry from the remote infoArray. */ + struct dyld_image_info info; + mach_vm_address_t entry_addr = (mach_vm_address_t)all_infos.infoArray + + (mach_vm_address_t)i * sizeof(struct dyld_image_info); + if (app_hang_read_task_memory(task, entry_addr, &info, sizeof(info)) + != KERN_SUCCESS) { + break; + } + + uint64_t base = (uint64_t)info.imageLoadAddress; + if (base == 0) { + continue; + } + + sentry_module_info_t *module = &ctx->modules[ctx->module_count]; + memset(module, 0, sizeof(*module)); + module->base_address = base; + + /* Read the image path from the remote address. */ + if (info.imageFilePath) { + char namebuf[SENTRY_CRASH_MAX_PATH]; + memset(namebuf, 0, sizeof(namebuf)); + /* Read in a bounded chunk; tolerate a short read at the tail. */ + for (size_t off = 0; off < sizeof(namebuf) - 1; off += 256) { + size_t chunk = sizeof(namebuf) - 1 - off; + if (chunk > 256) { + chunk = 256; + } + if (app_hang_read_task_memory(task, + (mach_vm_address_t)info.imageFilePath + off, + namebuf + off, chunk) + != KERN_SUCCESS) { + break; + } + if (memchr(namebuf + off, '\0', chunk)) { + break; + } + } + namebuf[sizeof(namebuf) - 1] = '\0'; + strncpy(module->name, namebuf, sizeof(module->name) - 1); + } + + /* Read the Mach-O header + load commands to get __TEXT vmsize and + * UUID, mirroring the in-process loop in the signal handler. */ + struct mach_header_64 header; + if (app_hang_read_task_memory( + task, (mach_vm_address_t)base, &header, sizeof(header)) + == KERN_SUCCESS + && (header.magic == MH_MAGIC_64 || header.magic == MH_CIGAM_64)) { + uint32_t ncmds = header.ncmds; + if (ncmds > 256) { + ncmds = 256; + } + /* Read the load-command region in one shot (capped). */ + uint32_t cmds_size = header.sizeofcmds; + if (cmds_size > 0 && cmds_size <= 64 * 1024) { + uint8_t *cmds = sentry_malloc(cmds_size); + if (cmds + && app_hang_read_task_memory(task, + (mach_vm_address_t)base + sizeof(header), cmds, + cmds_size) + == KERN_SUCCESS) { + const uint8_t *p = cmds; + const uint8_t *end = cmds + cmds_size; + bool has_size = false, has_uuid = false; + for (uint32_t j = 0; + j < ncmds && (!has_size || !has_uuid) + && p + sizeof(struct load_command) <= end; + j++) { + const struct load_command *lc + = (const struct load_command *)p; + if (lc->cmdsize == 0 + || p + lc->cmdsize > end) { + break; + } + if (lc->cmd == LC_SEGMENT_64 + && lc->cmdsize >= sizeof(struct segment_command_64)) { + const struct segment_command_64 *seg + = (const struct segment_command_64 *)lc; + if (memcmp(seg->segname, "__TEXT", 7) == 0) { + module->size = seg->vmsize; + has_size = true; + } + } else if (lc->cmd == LC_UUID + && lc->cmdsize >= sizeof(struct uuid_command)) { + const struct uuid_command *uc + = (const struct uuid_command *)lc; + memcpy(module->uuid, uc->uuid, 16); + has_uuid = true; + } + p += lc->cmdsize; + } + } + sentry_free(cmds); + } + } + + ctx->module_count++; + } + + SENTRY_DEBUGF( + "app-hang: captured %u modules out-of-process", ctx->module_count); +} + +/* Read the hung thread's stack memory (from SP upward) out-of-process and save + * it to a file, populating threads[0].stack_path / stack_size so the existing + * FP-unwinder in build_stacktrace_for_thread can walk real frames — the same + * file-backed mechanism the signal handler uses for crashes. Best-effort. */ +static void +app_hang_capture_stack( + task_t task, sentry_crash_context_t *ctx, uint64_t sp) +{ + ctx->platform.threads[0].stack_path[0] = '\0'; + ctx->platform.threads[0].stack_size = 0; + if (sp == 0) { + return; + } + + mach_vm_size_t want = SENTRY_CRASH_MAX_STACK_CAPTURE; + uint8_t *buf = sentry_malloc(want); + if (!buf) { + return; + } + + /* Shrink the read until it succeeds — the top of stack may be near a guard + * page, so a full-size read can straddle unmapped memory and fail. */ + mach_vm_size_t got = 0; + while (want >= 4096) { + if (app_hang_read_task_memory(task, (mach_vm_address_t)sp, buf, want) + == KERN_SUCCESS) { + got = want; + break; + } + want /= 2; + } + if (got == 0) { + SENTRY_DEBUG("app-hang: failed to read hung thread stack"); + sentry_free(buf); + return; + } + + char stack_path[SENTRY_CRASH_MAX_PATH]; + int n = snprintf(stack_path, sizeof(stack_path), + "%s/sentry-app-hang-stack-%lu.bin", ctx->database_path, + (unsigned long)ctx->crashed_pid); + if (n < 0 || n >= (int)sizeof(stack_path)) { + sentry_free(buf); + return; + } + int fd = open(stack_path, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd >= 0) { + if (write(fd, buf, (size_t)got) == (ssize_t)got) { + strncpy(ctx->platform.threads[0].stack_path, stack_path, + sizeof(ctx->platform.threads[0].stack_path) - 1); + ctx->platform.threads[0].stack_size = got; + SENTRY_DEBUGF("app-hang: captured %llu bytes of stack", + (unsigned long long)got); + } + close(fd); + } + sentry_free(buf); +} + +/** + * App-hang capture path (macOS). The host is alive but frozen, so unlike a + * crash there is no in-process signal-handler snapshot to fall back on — the + * daemon must sample the hung thread itself. It does so out-of-process via + * `task_for_pid` (the same mechanism the crash "full path" minidump writer + * relies on): locate the Mach thread whose THREAD_IDENTIFIER_INFO.thread_id + * matches the latched target tid, suspend it just long enough to read its + * register state, then resume and build/submit an AppHang envelope using the + * same native-stacktrace path as crashes. + * + * Requires `task_for_pid` to be permitted (same-user, non-hardened local/dev + * builds). On a hardened release runtime without the debugger entitlement it + * is denied; the entitlement-free port-donation replacement is a separate + * follow-up. + */ +static void +capture_and_send_app_hang(const sentry_options_t *options, + sentry_crash_ipc_t *ipc, uint64_t freeze_ms) +{ + /* NOTE (race, same as the Windows variant): this function reads and + * mutates shmem fields (platform.mcontext, threads[0], crashed_tid, + * num_threads) that the host's signal handler also writes on a real + * crash. The daemon loop is single-threaded and processes a pending crash + * before reaching here, so the only remaining window is the host crashing + * mid-capture. Accepted for the spike, same as Windows. */ + sentry_crash_context_t *ctx = ipc->shmem; + + const uint64_t target_tid = ctx->app_hang_target_tid; + + /* Acquire the host task. No in-process snapshot exists for a hang, so a + * failure here means we simply cannot capture this hang. */ + task_t task = MACH_PORT_NULL; + kern_return_t kr + = task_for_pid(mach_task_self(), (int)ctx->crashed_pid, &task); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: task_for_pid(%d) failed: %d (%s) — no " + "snapshot available for a hang", + (int)ctx->crashed_pid, kr, mach_error_string(kr)); + return; + } + + /* Enumerate the host's dyld modules out-of-process so debug_meta is + * populated and frames symbolicate server-side (the in-process signal + * handler that normally does this never runs for a hang). */ + app_hang_capture_modules(task, ctx); + + /* Enumerate threads and find the latched target by its portable tid. */ + thread_act_array_t threads = NULL; + mach_msg_type_number_t thread_count = 0; + kr = task_threads(task, &threads, &thread_count); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: task_threads failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + thread_t target = MACH_PORT_NULL; + for (mach_msg_type_number_t i = 0; i < thread_count; i++) { + thread_identifier_info_data_t id_info; + mach_msg_type_number_t id_count = THREAD_IDENTIFIER_INFO_COUNT; + if (thread_info(threads[i], THREAD_IDENTIFIER_INFO, + (thread_info_t)&id_info, &id_count) + == KERN_SUCCESS + && id_info.thread_id == target_tid) { + target = threads[i]; + } else { + /* Deallocate the ports we are not keeping. */ + mach_port_deallocate(mach_task_self(), threads[i]); + } + } + + if (target == MACH_PORT_NULL) { + SENTRY_DEBUGF("app-hang: target thread tid=%llu not found among %u " + "threads", + (unsigned long long)target_tid, thread_count); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Suspend the target just long enough to read its register state. */ + kr = thread_suspend(target); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: thread_suspend failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Read the integer register set directly into mcontext.__ss. Use the + * arch-specific flavor (ARM_THREAD_STATE64 / x86_THREAD_STATE64) that + * matches __ss's layout — NOT MACHINE_THREAD_STATE, which is the tagged + * *unified* state (arm_unified_thread_state_t) and would land with the + * wrong layout, yielding garbage IP/FP/SP. */ + _STRUCT_MCONTEXT mcontext; + memset(&mcontext, 0, sizeof(mcontext)); +# if defined(__x86_64__) + mach_msg_type_number_t state_count = x86_THREAD_STATE64_COUNT; + kr = thread_get_state(target, x86_THREAD_STATE64, + (thread_state_t)&mcontext.__ss, &state_count); +# elif defined(__aarch64__) + mach_msg_type_number_t state_count = ARM_THREAD_STATE64_COUNT; + kr = thread_get_state(target, ARM_THREAD_STATE64, + (thread_state_t)&mcontext.__ss, &state_count); +# else + mach_msg_type_number_t state_count = MACHINE_THREAD_STATE_COUNT; + kr = thread_get_state( + target, MACHINE_THREAD_STATE, (thread_state_t)&mcontext.__ss, + &state_count); +# endif + + thread_resume(target); + + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: thread_get_state failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Place the snapshot in the "crashed thread" slot of the context so the + * existing event builder pulls a stacktrace and register block out for the + * exception payload and the threads block. + * + * build_stacktrace_from_ctx() (thread_idx == SIZE_MAX) reads from + * ctx->platform.mcontext, while the per-thread register block reads from + * threads[0].state — populate both so the captured registers are used and + * not an all-zero context (PC=0 -> no frames). */ + ctx->platform.mcontext = mcontext; + ctx->crashed_tid = (pid_t)target_tid; + ctx->platform.num_threads = 1; + ctx->platform.threads[0].thread = target; /* port; valid only here */ + ctx->platform.threads[0].tid = target_tid; + ctx->platform.threads[0].state = mcontext; + ctx->platform.threads[0].stack_path[0] = '\0'; + ctx->platform.threads[0].stack_size = 0; + + /* Capture the hung thread's stack (from SP upward) out-of-process so the + * FP-unwinder can walk real frames instead of just the top PC. Must happen + * while we still hold the task port. */ + uint64_t target_sp = 0; +# if defined(__x86_64__) + target_sp = mcontext.__ss.__rsp; +# elif defined(__aarch64__) + target_sp = SENTRY__ARM64_GET_SP(mcontext.__ss); +# endif + app_hang_capture_stack(task, ctx, target_sp); + + /* Done reading from the host task; release the Mach ports. */ + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + + /* Build the per-event value description with the freeze duration. */ + char value_buf[128]; + snprintf(value_buf, sizeof(value_buf), + "App hang detected. Main thread blocked for %llu ms.", + (unsigned long long)freeze_ms); + sentry_native_event_kind_t kind = s_app_hang_kind; + kind.exception_value = value_buf; + + /* Build an envelope path next to the crash one. */ + char envelope_path[SENTRY_CRASH_MAX_PATH]; + int path_len = snprintf(envelope_path, sizeof(envelope_path), + "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path, + (unsigned long)ctx->crashed_pid, + (unsigned long long)ctx->app_hang_last_heartbeat_ms); + + if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) { + SENTRY_WARN("app-hang: envelope path truncated or invalid"); + return; + } + + /* Reuse the host-maintained scope file and run folder so the app-hang + * event carries the same scope context as a crash event (see the Windows + * variant for the detailed rationale). */ + const char *event_file_path + = ctx->event_path[0] ? ctx->event_path : NULL; + sentry_path_t *run_folder = NULL; + if (event_file_path) { + sentry_path_t *ev_path = sentry__path_from_str(event_file_path); + if (ev_path) { + run_folder = sentry__path_dir(ev_path); + sentry__path_free(ev_path); + } + } + + bool ok = write_envelope_with_native_stacktrace(options, envelope_path, + ctx, event_file_path, /*minidump_path=*/NULL, run_folder, &kind); + + if (run_folder) { + sentry__path_free(run_folder); + } + + if (!ok) { + SENTRY_WARN("app-hang: failed to write envelope"); + return; + } + + /* Read envelope from disk and hand to transport. */ + sentry_path_t *env_path = sentry__path_from_str(envelope_path); + if (env_path) { + sentry_envelope_t *envelope = sentry__envelope_from_path(env_path); + if (envelope && options && options->transport) { + sentry__capture_envelope(options->transport, envelope, options); + } + sentry__path_remove(env_path); + sentry__path_free(env_path); + } +} +#endif /* SENTRY_PLATFORM_WINDOWS / SENTRY_PLATFORM_MACOS */ /** * Manually write a Sentry envelope with event, minidump, and attachments. @@ -3926,12 +4366,13 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, SENTRY_DEBUG("Entering main loop"); -#if defined(SENTRY_PLATFORM_WINDOWS) - /* Pre-populate crashed_pid so the app-hang path can OpenProcess(host). - * Both capture_modules_from_process and walk_stack_with_dbghelp use - * ctx->crashed_pid, which is otherwise only set by the host's crash - * handler. The crash handler will re-set this from the host context if - * a real crash occurs; that's a no-op (same value). */ +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) + /* Pre-populate crashed_pid so the app-hang path can reach the host + * out-of-process (OpenProcess on Windows, task_for_pid on macOS). On + * Windows this also feeds capture_modules_from_process and + * walk_stack_with_dbghelp. ctx->crashed_pid is otherwise only set by the + * host's crash handler; the crash handler re-sets it from the host context + * on a real crash — a no-op (same value). */ ipc->shmem->crashed_pid = (pid_t)app_pid; #endif @@ -4029,10 +4470,25 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, CloseHandle(timer); } #else +# if defined(SENTRY_PLATFORM_MACOS) + /* App-hang detector state. Daemon-local; the timeout is cached here so it + * does not race the host on subsequent shmem mutations. When enabled, the + * loop polls on a short cadence (so it can evaluate the heartbeat each + * tick) instead of the longer health-check interval. */ + const bool app_hang_enabled = ipc->shmem->app_hang_enabled; + const uint64_t app_hang_timeout_ms = ipc->shmem->app_hang_timeout_ms; + uint64_t last_fired_hb = 0; + int consecutive_stale_ticks = 0; + const int wait_timeout_ms = app_hang_enabled + ? 500 + : SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; +# else + const int wait_timeout_ms = SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; +# endif + while (true) { // Wait for crash notification (with timeout to check parent health) - bool wait_result - = sentry__crash_ipc_wait(ipc, SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS); + bool wait_result = sentry__crash_ipc_wait(ipc, wait_timeout_ms); if (wait_result) { // Crash occurred! SENTRY_DEBUG("Event signaled, checking crash state"); @@ -4066,6 +4522,28 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, // If crash already processed, just ignore spurious notifications SENTRY_DEBUG("Spurious notification or already processed"); } +# if defined(SENTRY_PLATFORM_MACOS) + else if (app_hang_enabled && !crash_processed) { + /* No crash notification this wake (timeout or spurious) — evaluate + * the app-hang heartbeat with strike accumulation, mirroring the + * Windows timer tick. */ + sentry_crash_context_t *shctx = ipc->shmem; + const uint64_t hb = shctx->app_hang_last_heartbeat_ms; + const uint64_t now = sentry__app_hang_now_ms(); + int new_strikes = 0; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + app_hang_enabled, hb, now, app_hang_timeout_ms, + last_fired_hb, consecutive_stale_ticks, &new_strikes); + consecutive_stale_ticks = new_strikes; + if (d == SENTRY_APP_HANG_FIRE) { + capture_and_send_app_hang(options, ipc, now - hb); + /* Always advance last_fired_hb, even if capture failed — + * prevents a retry storm against a wedged thread. The next + * heartbeat advance re-arms detection naturally. */ + last_fired_hb = hb; + } + } +# endif // Check if parent is still alive (only if no crash processed yet) if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 0b8a94b37..bcabf415e 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -18,9 +18,9 @@ #include #include "sentry_alloc.h" +#include "sentry_app_hang.h" #include "sentry_backend.h" #include "sentry_core.h" -#include "sentry_app_hang.h" #include "sentry_crash_context.h" #include "sentry_crash_daemon.h" #include "sentry_crash_handler.h" @@ -314,8 +314,7 @@ native_backend_startup( sentry__atomic_store( &ctx->user_consent, sentry__atomic_fetch(&options->run->user_consent)); - /* App-hang detection configuration. Written before the daemon is - * signalled ready, so the daemon sees consistent values at startup. + /* App-hang detection configuration. * * NOTE: sentry__app_hang_set_shmem(ctx) is intentionally deferred until * just before the function's successful `return 0;` below. If a later @@ -566,7 +565,7 @@ native_backend_startup( } #endif -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) /* Make this shmem block visible to sentry_app_hang_heartbeat now that * all fallible startup steps have succeeded. If any earlier step had * failed we would have freed the IPC and returned without ever @@ -689,7 +688,7 @@ native_backend_shutdown(sentry_backend_t *backend) // Cleanup IPC if (state->ipc) { -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) /* Clear the global heartbeat pointer before the shmem backing it goes * away, so sentry_app_hang_heartbeat() cannot write to freed memory. */ sentry__app_hang_set_shmem(NULL); diff --git a/src/sentry_app_hang.c b/src/sentry_app_hang.c index a530b0fbc..d297457be 100644 --- a/src/sentry_app_hang.c +++ b/src/sentry_app_hang.c @@ -1,10 +1,22 @@ +/* pthread_threadid_np() and CLOCK_UPTIME_RAW are Darwin extensions hidden when + * a strict POSIX feature macro (e.g. _XOPEN_SOURCE, set transitively by + * sentry_crash_context.h) is active. Re-expose them before any include. */ +#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE) +# define _DARWIN_C_SOURCE +#endif + #include "sentry_app_hang.h" #include "sentry_options.h" -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ - && defined(SENTRY_BACKEND_NATIVE) -# include +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) +# if defined(SENTRY_PLATFORM_WINDOWS) +# include +# elif defined(SENTRY_PLATFORM_MACOS) +# include +# include +# include +# endif #endif sentry_app_hang_decision_t @@ -61,11 +73,18 @@ sentry_options_set_app_hang_timeout_ms( } } -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ - && defined(SENTRY_BACKEND_NATIVE) +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) static sentry_crash_context_t *volatile g_app_hang_shmem = NULL; +void +sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) +{ + g_app_hang_shmem = ctx; +} + +# if defined(SENTRY_PLATFORM_WINDOWS) + uint64_t sentry__app_hang_now_ms(void) { @@ -78,12 +97,6 @@ sentry__app_hang_now_ms(void) return (uint64_t)(ticks_100ns / 10000ULL); } -void -sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) -{ - g_app_hang_shmem = ctx; -} - void sentry_app_hang_set_target_thread(void) { @@ -126,7 +139,74 @@ sentry_app_hang_heartbeat(void) ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); } -#else /* non-Windows or Xbox */ +# elif defined(SENTRY_PLATFORM_MACOS) + +uint64_t +sentry__app_hang_now_ms(void) +{ + /* CLOCK_UPTIME_RAW is the macOS analogue of Windows' + * QueryUnbiasedInterruptTime: a monotonic clock that excludes time the + * system was asleep, read identically by host and daemon. */ + struct timespec ts; + if (clock_gettime(CLOCK_UPTIME_RAW, &ts) != 0) { + return 0; + } + return (uint64_t)ts.tv_sec * 1000ULL + (uint64_t)ts.tv_nsec / 1000000ULL; +} + +void +sentry_app_hang_set_target_thread(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* Obtain the portable 64-bit Mach thread id of the current thread; this + * is the same value the daemon matches against via + * thread_info(THREAD_IDENTIFIER_INFO). */ + uint64_t current_tid = 0; + if (pthread_threadid_np(NULL, ¤t_tid) != 0 || current_tid == 0) { + return; + } + + /* CAS the current TID into the latch slot iff still unset — first caller + * wins, idempotent for that caller. The shmem field is declared + * `volatile uint64_t`; view it as an atomic for the compare-exchange. */ + _Atomic uint64_t *slot + = (_Atomic uint64_t *)(void *)&ctx->app_hang_target_tid; + uint64_t expected = 0; + atomic_compare_exchange_strong(slot, &expected, current_tid); +} + +void +sentry_app_hang_heartbeat(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* Refresh-only: requires a prior sentry_app_hang_set_target_thread() + * call from this thread. Drops the heartbeat if no target is latched, or + * if the latched thread is not us. */ + uint64_t current_tid = 0; + if (pthread_threadid_np(NULL, ¤t_tid) != 0 || current_tid == 0) { + return; + } + uint64_t latched = ctx->app_hang_target_tid; + if (latched == 0 || latched != current_tid) { + return; + } + + /* Relaxed 64-bit store; aligned on a 64-bit target so it is atomic and + * cannot tear. The daemon reads it with a relaxed load. */ + ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); +} + +# endif + +#else /* host heartbeat not supported on this target */ void sentry_app_hang_set_target_thread(void) diff --git a/src/sentry_app_hang.h b/src/sentry_app_hang.h index f146280ae..079950861 100644 --- a/src/sentry_app_hang.h +++ b/src/sentry_app_hang.h @@ -6,8 +6,16 @@ #include #include -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ +/* The host-side heartbeat machinery (clock, latch, shmem registration) is + * available on the native backend on Windows (non-Xbox) and macOS. Linux and + * other targets fall back to no-op stubs. */ +#if (((defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX)) \ + || defined(SENTRY_PLATFORM_MACOS))) \ && defined(SENTRY_BACKEND_NATIVE) +# define SENTRY_APP_HANG_HOST_SUPPORTED 1 +#endif + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) # include "sentry_crash_context.h" #endif @@ -49,8 +57,7 @@ sentry_app_hang_decision_t sentry__app_hang_decide(bool enabled, uint64_t hb, uint64_t now, uint64_t timeout_ms, uint64_t last_fired_hb, int consecutive_stale_ticks, int *out_consecutive_stale_ticks); -#if defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX) \ - && defined(SENTRY_BACKEND_NATIVE) +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) /** * Called from the native backend startup path. Stores `ctx` so that * subsequent `sentry_app_hang_heartbeat()` calls have somewhere to write. diff --git a/tests/test_integration_native.py b/tests/test_integration_native.py index 6cf6292b9..18a86eb48 100644 --- a/tests/test_integration_native.py +++ b/tests/test_integration_native.py @@ -1045,12 +1045,32 @@ def test_native_restart_on_crash(cmake, httpserver): @pytest.mark.skipif( - sys.platform != "win32", - reason="app-hang detection is Windows-only in this release", + sys.platform not in ("win32", "darwin"), + reason="app-hang detection is implemented on Windows and macOS", ) def test_native_app_hang(cmake, httpserver): - """App hang detection emits exactly one ApplicationNotResponding event.""" - tmp_path = cmake(["sentry_example"], {"SENTRY_BACKEND": "native"}) + """App hang detection emits exactly one ApplicationNotResponding event. + + On macOS the daemon samples the hung thread out-of-process via + ``task_for_pid``, which requires the example + daemon to be ad-hoc + codesigned with the debugger entitlement (same setup as the SMART-mode + heap test). If the capture still cannot acquire the task port in this + environment the daemon degrades gracefully and ships nothing — the test + skips rather than fails in that case. + """ + # macOS hardened-runtime self-signing needs a static build so the example + # can load itself without the dyld "different team IDs" check tripping on + # ad-hoc-signed dylibs (mirrors the SMART-mode heap test). + config = {"SENTRY_BACKEND": "native"} + if sys.platform == "darwin": + config["BUILD_SHARED_LIBS"] = "OFF" + tmp_path = cmake(["sentry_example"], config) + + if sys.platform == "darwin": + _codesign_for_task_for_pid( + str(tmp_path / "sentry_example"), + str(tmp_path / "sentry-crash"), + ) httpserver.expect_oneshot_request("/api/123456/envelope/").respond_with_data( "OK" @@ -1067,6 +1087,13 @@ def test_native_app_hang(cmake, httpserver): ["log", "app-hang"], env=dict(os.environ, SENTRY_DSN=make_dsn(httpserver)), ) + + if sys.platform == "darwin" and not waiting.result: + pytest.skip( + "no app-hang envelope received — task_for_pid is likely denied " + "in this environment (hardened-runtime/SIP); capture degraded " + "gracefully" + ) assert waiting.result envelope = Envelope.deserialize(httpserver.log[0][0].get_data())