diff --git a/examples/example.c b/examples/example.c index d3a083280..a2fdf90fc 100644 --- a/examples/example.c +++ b/examples/example.c @@ -612,6 +612,52 @@ run_threads(thread_func_t func) } #endif +#if defined(SENTRY_PLATFORM_WINDOWS) +static unsigned __stdcall +app_hang_demo_thread(void *arg) +{ + (void)arg; + /* Latch this thread as the target once, then heartbeat for 500 ms so the + * daemon sees a healthy baseline before the freeze. */ + sentry_app_hang_set_target_thread(); + for (int i = 0; i < 10; i++) { + sentry_app_hang_heartbeat(); + Sleep(50); + } + /* Add a couple of breadcrumbs before freezing so the captured app-hang + * event carries them (the daemon reads the breadcrumb ring files the host + * writes on each sentry_add_breadcrumb). */ + sentry_add_breadcrumb( + sentry_value_new_breadcrumb(NULL, "app-hang demo: about to freeze")); + sentry_add_breadcrumb(create_debug_crumb("app-hang demo breadcrumb")); + /* Freeze for 3x the configured timeout (3000 ms). */ + Sleep(3000); + return 0; +} +#elif defined(SENTRY_PLATFORM_MACOS) +static void * +app_hang_demo_thread(void *arg) +{ + (void)arg; + /* Latch this thread as the target once, then heartbeat for 500 ms so the + * daemon sees a healthy baseline before the freeze. */ + sentry_app_hang_set_target_thread(); + for (int i = 0; i < 10; i++) { + sentry_app_hang_heartbeat(); + usleep(50 * 1000); + } + /* Add a couple of breadcrumbs before freezing so the captured app-hang + * event carries them (the daemon reads the breadcrumb ring files the host + * writes on each sentry_add_breadcrumb). */ + sentry_add_breadcrumb( + sentry_value_new_breadcrumb(NULL, "app-hang demo: about to freeze")); + sentry_add_breadcrumb(create_debug_crumb("app-hang demo breadcrumb")); + /* Freeze for 3x the configured timeout (3000 ms). 
*/ + usleep(3000 * 1000); + return NULL; +} +#endif + int main(int argc, char **argv) { @@ -879,6 +925,13 @@ main(int argc, char **argv) options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); } +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) + if (has_arg(argc, argv, "app-hang")) { + sentry_options_set_app_hang_enabled(options, 1); + sentry_options_set_app_hang_timeout_ms(options, 1000); + } +#endif + // E2E test mode: generate unique test ID for event correlation char e2e_test_id[37] = { 0 }; if (has_arg(argc, argv, "e2e-test")) { @@ -890,6 +943,32 @@ main(int argc, char **argv) return EXIT_FAILURE; } +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) + /* app-hang: spawn the demo thread BEFORE any other post-init work so it + * begins heartbeating immediately. The thread freezes for 3x the timeout, + * giving the daemon time to detect the hang and ship the envelope. We wait + * for it here so main does not exit before the transport has flushed. + * NOTE: this mode is intentionally exclusive – do not combine with crash/ + * abort/etc. since those would terminate the process first. 
*/ + if (has_arg(argc, argv, "app-hang")) { +# if defined(SENTRY_PLATFORM_WINDOWS) + HANDLE t = (HANDLE)_beginthreadex( + NULL, 0, app_hang_demo_thread, NULL, 0, NULL); + if (t) { + WaitForSingleObject(t, INFINITE); + CloseHandle(t); + } +# else + pthread_t t; + if (0 == pthread_create(&t, NULL, app_hang_demo_thread, NULL)) { + pthread_join(t, NULL); + } +# endif + sentry_close(); + return EXIT_SUCCESS; + } +#endif + if (has_arg(argc, argv, "user-consent-revoke")) { sentry_user_consent_revoke(); } diff --git a/include/sentry.h b/include/sentry.h index 25416813e..977eb92ad 100644 --- a/include/sentry.h +++ b/include/sentry.h @@ -1697,6 +1697,65 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_attach_session_replay( SENTRY_EXPERIMENTAL_API void sentry_options_set_session_replay_duration( sentry_options_t *opts, uint32_t duration_ms); +/** + * Enable app-hang detection in the native crash backend. + * + * When enabled, the out-of-process daemon monitors a designated thread in the + * host via a shared-memory heartbeat. If the heartbeat goes stale for longer + * than the configured timeout, the daemon walks the thread's stack remotely and + * emits an `ApplicationNotResponding` event. The host process keeps running. + * + * Off by default. This setting only has an effect when using the `native` + * backend. In this initial release the feature is limited to Windows and + * macOS; the call is a silent no-op on other platforms. + */ +SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_enabled( + sentry_options_t *opts, int enabled); + +/** + * Sets the heartbeat-staleness threshold (in milliseconds) used by the + * app-hang detector. Default 5000 ms. + * + * Read by the daemon once at startup; changes after `sentry_init` have no + * effect. + */ +SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_timeout_ms( + sentry_options_t *opts, uint64_t timeout_ms); + +/** + * Designate the calling thread as the one monitored by the app-hang detector. 
+ * + * Call this once, from the thread you want monitored (typically the main / + * game thread), before the first heartbeat. The latch is sticky for the + * lifetime of the SDK session: subsequent calls from any other thread are + * dropped. Calling again from the same thread is a harmless no-op. + * + * Until this is called, `sentry_app_hang_heartbeat()` is a no-op — there is + * no implicit "first caller wins" latch, so a stray heartbeat from a worker + * thread during startup cannot accidentally claim the role and silently + * disable monitoring of the real main thread. + * + * No-op if app-hang detection is not enabled in options, or if the native + * backend is not active, or on platforms other than Windows and macOS. + */ +SENTRY_EXPERIMENTAL_API void sentry_app_hang_set_target_thread(void); + +/** + * Refresh the heartbeat for the monitored thread. + * + * Call this from the thread previously designated via + * `sentry_app_hang_set_target_thread()`. Calls from any other thread, or + * before a target has been set, are dropped — so a stray heartbeat from a + * worker thread cannot mask a frozen main thread. + * + * Cost: approximately one system call plus a relaxed 64-bit store. Safe to + * call from a per-frame hook in a game engine. + * + * No-op if app-hang detection is not enabled in options, or if the native + * backend is not active, or on platforms other than Windows and macOS. + */ +SENTRY_EXPERIMENTAL_API void sentry_app_hang_heartbeat(void); + /** * Sets the path to the crashpad handler if the crashpad backend is used. 
* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6086dbaaf..a29f7e88b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,8 @@ sentry_target_sources_cwd(sentry sentry_alloc.c sentry_alloc.h + sentry_app_hang.c + sentry_app_hang.h sentry_attachment.c sentry_attachment.h sentry_backend.c diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index e5d0bd63e..55fd646fa 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -289,6 +289,7 @@ typedef struct { uint64_t shutdown_timeout; uint64_t transfer_timeout; bool system_crash_reporter_enabled; + uint32_t max_breadcrumbs; // Atomic user consent (sentry_user_consent_t), updated whenever user // consent changes so the daemon can honor it at crash time. @@ -325,6 +326,24 @@ typedef struct { uint32_t module_count; sentry_module_info_t modules[SENTRY_CRASH_MAX_MODULES]; + /* App-hang detection (Windows + macOS, native backend only). + * + * Sync model: + * - app_hang_enabled, app_hang_timeout_ms: written by host before daemon + * is signalled ready; read by daemon at startup. No further mutation. + * - app_hang_target_tid: latched once by host in set_target_thread() via a + * compare-exchange (InterlockedCompareExchange64 on Windows, + * atomic_compare_exchange_strong on macOS). Daemon reads, never writes. + * - app_hang_last_heartbeat_ms: written on every heartbeat with a relaxed + * 64-bit store. Daemon reads with a relaxed load. Torn reads are not a + * correctness issue — the daemon compares against its remembered value + * from the previous tick. (On 64-bit Windows/macOS the aligned store is + * atomic; the tear note applies to 32-bit Windows.) 
*/ + bool app_hang_enabled; + uint64_t app_hang_timeout_ms; + volatile uint64_t app_hang_target_tid; + volatile uint64_t app_hang_last_heartbeat_ms; + } sentry_crash_context_t; // Shared memory size: calculated at compile-time based on actual struct size diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index a2e03f7d3..78589492c 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2,6 +2,7 @@ #include "minidump/sentry_minidump_writer.h" #include "sentry_alloc.h" +#include "sentry_app_hang.h" #include "sentry_attachment.h" #include "sentry_core.h" #include "sentry_crash_ipc.h" @@ -45,6 +46,9 @@ # if defined(SENTRY_PLATFORM_MACOS) # include # include +# include +# include +# include # include # endif #elif defined(SENTRY_PLATFORM_WINDOWS) @@ -2118,19 +2122,130 @@ build_stacktrace_from_ctx(const sentry_crash_context_t *ctx) return build_stacktrace_for_thread(ctx, SIZE_MAX); } +/* Describes which kind of native event we are building. `s_crash_kind` + * drives the crash path; `s_app_hang_kind` drives the app-hang flow on + * Windows and macOS. + * + * Invariant: if `include_signal_meta` is true, `exception_type` must be NULL + * (the signal-derived path). Setting an override type AND requesting signal + * metadata is incoherent — there is no signal in the override case. + */ +typedef struct { + /* Override exception `type` string. NULL = derive from the crash signal + * (e.g. "SIGSEGV" on Unix, "EXCEPTION" on Windows). */ + const char *exception_type; + /* Override exception `value` string. Used only when `exception_type` is + * non-NULL; ignored otherwise. */ + const char *exception_value; + /* `mechanism.type` JSON value, e.g. "signalhandler" or "AppHang". */ + const char *mechanism_type; + /* `mechanism.handled` JSON value. false for fatal crashes, true for + * recoverable events like app hangs. */ + bool mechanism_handled; + /* Event `level` JSON value, e.g. 
"fatal" or "error". */ + const char *level; + /* Attach `mechanism.meta.signal` payload? Must be false when + * `exception_type` is non-NULL (see struct invariant). */ + bool include_signal_meta; +} sentry_native_event_kind_t; + +/* Crash-path event kind: signal-derived type/value, fatal level, unhandled. */ +static const sentry_native_event_kind_t s_crash_kind = { + .exception_type = NULL, + .exception_value = NULL, + .mechanism_type = "signalhandler", + .mechanism_handled = false, + .level = "fatal", + .include_signal_meta = true, +}; + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) +/* App-hang event kind: ANR-style, handled, error level. The per-event + * `exception_value` (freeze duration message) is filled in at capture time. */ +static const sentry_native_event_kind_t s_app_hang_kind = { + .exception_type = "ApplicationNotResponding", + .exception_value = NULL, /* filled in per-event below */ + .mechanism_type = "AppHang", + .mechanism_handled = true, + .level = "error", + .include_signal_meta = false, +}; +#endif + +/** + * Reads one breadcrumb ring file the crashing process appended on its hot path + * into a breadcrumb list. Returns null if the file is absent or empty. + */ +static sentry_value_t +read_breadcrumb_ring_file(const sentry_path_t *run_folder, const char *name) +{ + if (!run_folder) { + return sentry_value_new_null(); + } + sentry_path_t *path = sentry__path_join_str(run_folder, name); + if (!path) { + return sentry_value_new_null(); + } + size_t size = 0; + char *buf = sentry__path_read_to_buffer(path, &size); + sentry__path_free(path); + if (!buf || size == 0) { + sentry_free(buf); + return sentry_value_new_null(); + } + sentry_value_t list = sentry__value_from_msgpack(buf, size); + sentry_free(buf); + // `sentry__value_from_msgpack` only builds a list when the file holds 2+ + // concatenated values; a file with a single breadcrumb decodes to a bare + // object. Wrap it so the merge step (which ignores non-lists) keeps it. 
+ if (sentry_value_get_type(list) == SENTRY_VALUE_TYPE_OBJECT) { + sentry_value_t wrapper = sentry_value_new_list(); + sentry_value_append(wrapper, list); + return wrapper; + } + return list; +} + +/** + * Assembles the crash event's breadcrumbs from the two ring files the crashing + * process appended one-at-a-time, merges them in timestamp order, keeps the + * newest `max_breadcrumbs`, and attaches them to `event`. + * Mirrors the crashpad backend's `report_to_envelope`. + */ +static void +apply_breadcrumbs_from_ring_files(sentry_value_t event, + const sentry_path_t *run_folder, const sentry_crash_context_t *ctx) +{ + sentry_value_t b1 + = read_breadcrumb_ring_file(run_folder, "__sentry-breadcrumb1"); + sentry_value_t b2 + = read_breadcrumb_ring_file(run_folder, "__sentry-breadcrumb2"); + size_t max = ctx && ctx->max_breadcrumbs ? ctx->max_breadcrumbs + : SENTRY_BREADCRUMBS_MAX; + sentry_value_t merged = sentry__value_merge_breadcrumbs(b1, b2, max); + sentry_value_decref(b1); + sentry_value_decref(b2); + // Overwrite any breadcrumbs the base event may carry: the ring files are + // the single source of truth, so this is idempotent and never duplicates. + if (sentry_value_get_type(merged) == SENTRY_VALUE_TYPE_LIST) { + sentry_value_set_by_key(event, "breadcrumbs", merged); + } else { + sentry_value_decref(merged); + } +} + /** * Build a native event and set the level, mechanism, and handled state * * @param ctx Crash context * @param event_file_path Path to base event file from parent process - * @param level Event level (e.g. "fatal") - * @param mechanism_type Exception mechanism type (e.g. 
"signalhandler") - * @param handled Whether the mechanism was handled + * @param run_folder Run directory holding the breadcrumb ring files + * @param kind Event-kind descriptor controlling exception/mechanism/level */ static sentry_value_t -build_native_event(const sentry_crash_context_t *ctx, - const char *event_file_path, const char *level, const char *mechanism_type, - bool handled) +build_native_crash_event(const sentry_crash_context_t *ctx, + const char *event_file_path, const sentry_path_t *run_folder, + const sentry_native_event_kind_t *kind) { // Read base event from parent's file sentry_value_t event = sentry_value_new_null(); @@ -2152,54 +2267,75 @@ build_native_event(const sentry_crash_context_t *ctx, event = sentry_value_new_event(); } + apply_breadcrumbs_from_ring_files(event, run_folder, ctx); + // Set platform to native sentry_value_set_by_key( event, "platform", sentry_value_new_string("native")); - sentry_value_set_by_key(event, "level", sentry_value_new_string(level)); + // Set level (varies by event kind: "fatal" for crash, "error" for app hang) + sentry_value_set_by_key( + event, "level", sentry_value_new_string(kind->level)); // Build exception - const char *signal_name = "UNKNOWN"; + /* Function-scope so exc_value (which may point into this buffer) remains + * valid after the `else` block below. Previously declared inside the + * else: out of scope by the time exc_value is read -> UB per C99 6.2.4. */ + char crash_value_buf[128]; + const char *exc_type; + const char *exc_value; + + if (kind->exception_type) { + exc_type = kind->exception_type; + exc_value = kind->exception_value ? 
kind->exception_value : ""; + } else { + const char *signal_name; #if defined(SENTRY_PLATFORM_UNIX) - int signal_number = ctx->platform.signum; - signal_name = get_signal_name(signal_number); + signal_name = get_signal_name(ctx->platform.signum); #elif defined(SENTRY_PLATFORM_WINDOWS) - // Exception code is used directly below as unsigned - signal_name = "EXCEPTION"; + signal_name = "EXCEPTION"; +#else + signal_name = "UNKNOWN"; #endif + exc_type = signal_name; + snprintf(crash_value_buf, sizeof(crash_value_buf), "Fatal crash: %s", + signal_name); + exc_value = crash_value_buf; + } sentry_value_t exc = sentry_value_new_object(); - sentry_value_set_by_key(exc, "type", sentry_value_new_string(signal_name)); - - char value_buf[128]; - snprintf(value_buf, sizeof(value_buf), "Fatal crash: %s", signal_name); - sentry_value_set_by_key(exc, "value", sentry_value_new_string(value_buf)); + sentry_value_set_by_key(exc, "type", sentry_value_new_string(exc_type)); + sentry_value_set_by_key(exc, "value", sentry_value_new_string(exc_value)); // Add mechanism sentry_value_t mechanism = sentry_value_new_object(); - sentry_value_set_by_key( - mechanism, "type", sentry_value_new_string(mechanism_type)); + sentry_value_set_by_key(mechanism, "type", + sentry_value_new_string(kind->mechanism_type)); sentry_value_set_by_key( mechanism, "synthetic", sentry_value_new_bool(true)); - sentry_value_set_by_key( - mechanism, "handled", sentry_value_new_bool(handled)); + sentry_value_set_by_key(mechanism, "handled", + sentry_value_new_bool(kind->mechanism_handled)); - // Add signal metadata - sentry_value_t meta = sentry_value_new_object(); - sentry_value_t signal_info = sentry_value_new_object(); + // Add signal metadata (only relevant for signal-handler/crash events) + if (kind->include_signal_meta) { + sentry_value_t meta = sentry_value_new_object(); + sentry_value_t signal_info = sentry_value_new_object(); #if defined(SENTRY_PLATFORM_WINDOWS) - // Windows exception codes are unsigned 32-bit 
values (e.g., 0xC0000005) - // Use uint64 to preserve the unsigned value for the symbolicator - sentry_value_set_by_key(signal_info, "number", - sentry_value_new_uint64((uint64_t)ctx->platform.exception_code)); + // Windows exception codes are unsigned 32-bit values (e.g., 0xC0000005) + // Use uint64 to preserve the unsigned value for the symbolicator + sentry_value_set_by_key(signal_info, "number", + sentry_value_new_uint64((uint64_t)ctx->platform.exception_code)); #else - sentry_value_set_by_key( - signal_info, "number", sentry_value_new_int32(signal_number)); + sentry_value_set_by_key(signal_info, "number", + sentry_value_new_int32(ctx->platform.signum)); #endif - sentry_value_set_by_key( - signal_info, "name", sentry_value_new_string(signal_name)); - sentry_value_set_by_key(meta, "signal", signal_info); - sentry_value_set_by_key(mechanism, "meta", meta); + /* By the struct invariant, include_signal_meta is only true when + * exception_type is NULL, so exc_type holds the signal name here. */ + sentry_value_set_by_key( + signal_info, "name", sentry_value_new_string(exc_type)); + sentry_value_set_by_key(meta, "signal", signal_info); + sentry_value_set_by_key(mechanism, "meta", meta); + } sentry_value_set_by_key(exc, "mechanism", mechanism); @@ -2476,13 +2612,13 @@ static bool write_envelope_with_native_stacktrace(const sentry_options_t *options, const char *envelope_path, const sentry_crash_context_t *ctx, const char *event_file_path, const char *minidump_path, - sentry_path_t *run_folder) + sentry_path_t *run_folder, const sentry_native_event_kind_t *kind) { // Build native crash event (always include threads with names) SENTRY_DEBUGF("write_envelope_with_native_stacktrace: minidump_path=%s", minidump_path ? 
minidump_path : "(null)"); - sentry_value_t event = build_native_event( - ctx, event_file_path, "fatal", "signalhandler", false); + sentry_value_t event = build_native_crash_event( + ctx, event_file_path, run_folder, kind); // Serialize event to JSON size_t event_size = 0; @@ -2721,6 +2857,586 @@ write_envelope_with_native_stacktrace(const sentry_options_t *options, return true; } +#if defined(SENTRY_PLATFORM_WINDOWS) +/** + * App-hang capture path (Windows). Suspends the latched target thread just long + * enough to snapshot its CONTEXT, then builds and submits an envelope using the + * same native-stacktrace path as crashes (with an AppHang event kind). + */ +static void +capture_and_send_app_hang(const sentry_options_t *options, + sentry_crash_ipc_t *ipc, uint64_t freeze_ms) +{ + /* NOTE (race, experimental Windows-only first cut): This function reads + * and mutates shmem fields (platform.context, threads[0], crashed_tid, + * num_threads) that are also written by the host's signal handler on a + * real crash. The daemon's main loop is single-threaded and the crash + * event has wait-priority 0, so we will not enter this function with a + * pending crash notification already signalled. The remaining narrow + * window is: the host crashes WHILE this function is running, the host's + * signal handler writes to shmem mid-capture, and we then send a + * partially-overwritten event. We accept this risk for the initial + * Windows-only implementation; mitigation (state check at entry / pause + * via an additional shmem flag) is tracked as follow-up work. */ + sentry_crash_context_t *ctx = ipc->shmem; + + /* Populate modules once per session if not already done. */ + if (ctx->module_count == 0) { + capture_modules_from_process(ctx); + } + + DWORD target_tid = (DWORD)ctx->app_hang_target_tid; + + /* Suspend the target thread and capture its CONTEXT. 
*/ + HANDLE hThread = OpenThread(THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME + | THREAD_QUERY_INFORMATION, + FALSE, target_tid); + if (!hThread) { + SENTRY_DEBUGF("app-hang: OpenThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + return; + } + + DWORD suspend_count = SuspendThread(hThread); + if (suspend_count == (DWORD)-1) { + SENTRY_DEBUGF("app-hang: SuspendThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + CloseHandle(hThread); + return; + } + + CONTEXT thread_ctx; + memset(&thread_ctx, 0, sizeof(thread_ctx)); + thread_ctx.ContextFlags = CONTEXT_FULL; + if (!GetThreadContext(hThread, &thread_ctx)) { + SENTRY_DEBUGF( + "app-hang: GetThreadContext failed: %lu", GetLastError()); + ResumeThread(hThread); + CloseHandle(hThread); + return; + } + + /* Resume immediately; we have the snapshot we need. */ + ResumeThread(hThread); + CloseHandle(hThread); + + /* Place the snapshot in the "crashed thread" slot of the context so the + * existing event builder pulls a stacktrace out for the exception + * payload and the threads block. + * + * IMPORTANT: build_stacktrace_from_ctx() calls build_stacktrace_for_thread + * with thread_idx == SIZE_MAX, which on Windows reads from + * ctx->platform.context (NOT threads[0].context). We must populate both + * so the exception stacktrace uses the captured CONTEXT instead of an + * all-zero one (PC=0 -> StackWalk64 produces no frames). */ + ctx->platform.context = thread_ctx; + ctx->crashed_tid = target_tid; + ctx->platform.num_threads = 1; + ctx->platform.threads[0].thread_id = target_tid; + ctx->platform.threads[0].context = thread_ctx; + ctx->platform.threads[0].name[0] = '\0'; + + /* Build the per-event value description with the freeze duration. */ + char value_buf[128]; + snprintf(value_buf, sizeof(value_buf), + "App hang detected. 
Main thread blocked for %llu ms.", + (unsigned long long)freeze_ms); + sentry_native_event_kind_t kind = s_app_hang_kind; + kind.exception_value = value_buf; + + /* Build an envelope path next to the crash one. */ + char envelope_path[SENTRY_CRASH_MAX_PATH]; + int path_len = snprintf(envelope_path, sizeof(envelope_path), + "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path, + (unsigned long)ctx->crashed_pid, + (unsigned long long)ctx->app_hang_last_heartbeat_ms); + + if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) { + SENTRY_WARN("app-hang: envelope path truncated or invalid"); + return; + } + + /* Reuse the scope file the host keeps up-to-date via flush_scope so the + * app-hang event carries the same scope context as a crash event: + * full contexts (os/device/gpu/app/runtime/unity/...), user, tags, + * extra, fingerprint, release/dist/env, sdk metadata, and breadcrumbs. + * The base event JSON is at ctx->event_path; the sibling run folder + * holds the `__sentry-attachments` manifest, scope attachments, + * screenshot, and session replay — all pulled in by + * write_envelope_with_native_stacktrace when run_folder is non-NULL. */ + const char *event_file_path + = ctx->event_path[0] ? ctx->event_path : NULL; + sentry_path_t *run_folder = NULL; + if (event_file_path) { + sentry_path_t *ev_path = sentry__path_from_str(event_file_path); + if (ev_path) { + run_folder = sentry__path_dir(ev_path); + sentry__path_free(ev_path); + } + } + + bool ok = write_envelope_with_native_stacktrace(options, envelope_path, + ctx, event_file_path, /*minidump_path=*/NULL, run_folder, &kind); + + if (run_folder) { + sentry__path_free(run_folder); + } + + if (!ok) { + SENTRY_WARN("app-hang: failed to write envelope"); + return; + } + + /* Read envelope from disk and hand to transport. 
*/ + sentry_path_t *env_path = sentry__path_from_str(envelope_path); + if (env_path) { + sentry_envelope_t *envelope = sentry__envelope_from_path(env_path); + if (envelope && options && options->transport) { + sentry__capture_envelope(options->transport, envelope, options); + } else if (envelope) { + /* Not handed to the transport, so nothing else will release it: + * free it here instead of leaking the deserialized envelope. */ + sentry_envelope_free(envelope); + } + /* The on-disk copy is removed whether or not it was submitted. */ + sentry__path_remove(env_path); + sentry__path_free(env_path); + } +} + +#elif defined(SENTRY_PLATFORM_MACOS) + +/* Read `size` bytes at `addr` from another task into `buf`. Mirrors the + * minidump writer's read_task_memory (mach_vm_read_overwrite). */ +static kern_return_t +app_hang_read_task_memory( + task_t task, mach_vm_address_t addr, void *buf, mach_vm_size_t size) +{ + mach_vm_size_t got = 0; + kern_return_t kr + = mach_vm_read_overwrite(task, addr, size, (mach_vm_address_t)buf, &got); + if (kr == KERN_SUCCESS && got != size) { + return KERN_FAILURE; + } + return kr; +} + +/* Enumerate the host's loaded dyld images out-of-process via the donated/ + * task_for_pid task port and populate ctx->modules[] (base, __TEXT vmsize, + * UUID, name). This is the out-of-process analogue of the in-process + * _dyld_image_count() loop the crash signal handler runs — needed for app + * hangs because no signal handler runs to capture modules, and the daemon's + * own dyld images are unrelated to the host's. Best-effort: on any read + * failure we stop and keep whatever was gathered. */ +static void +app_hang_capture_modules(task_t task, sentry_crash_context_t *ctx) +{ + ctx->module_count = 0; + + /* Locate dyld_all_image_infos in the target task. 
*/ + struct task_dyld_info dyld_info; + mach_msg_type_number_t count = TASK_DYLD_INFO_COUNT; + if (task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &count) + != KERN_SUCCESS) { + SENTRY_DEBUG("app-hang: task_info(TASK_DYLD_INFO) failed"); + return; + } + + struct dyld_all_image_infos all_infos; + if (app_hang_read_task_memory(task, + (mach_vm_address_t)dyld_info.all_image_info_addr, &all_infos, + sizeof(all_infos)) + != KERN_SUCCESS) { + SENTRY_DEBUG("app-hang: failed to read dyld_all_image_infos"); + return; + } + + uint32_t image_count = all_infos.infoArrayCount; + if (image_count > SENTRY_CRASH_MAX_MODULES) { + image_count = SENTRY_CRASH_MAX_MODULES; + } + + for (uint32_t i = 0; + i < image_count && ctx->module_count < SENTRY_CRASH_MAX_MODULES; i++) { + /* Read one dyld_image_info entry from the remote infoArray. */ + struct dyld_image_info info; + mach_vm_address_t entry_addr = (mach_vm_address_t)all_infos.infoArray + + (mach_vm_address_t)i * sizeof(struct dyld_image_info); + if (app_hang_read_task_memory(task, entry_addr, &info, sizeof(info)) + != KERN_SUCCESS) { + break; + } + + uint64_t base = (uint64_t)info.imageLoadAddress; + if (base == 0) { + continue; + } + + sentry_module_info_t *module = &ctx->modules[ctx->module_count]; + memset(module, 0, sizeof(*module)); + module->base_address = base; + + /* Read the image path from the remote address. */ + if (info.imageFilePath) { + char namebuf[SENTRY_CRASH_MAX_PATH]; + memset(namebuf, 0, sizeof(namebuf)); + /* Read in a bounded chunk; tolerate a short read at the tail. 
*/ + for (size_t off = 0; off < sizeof(namebuf) - 1; off += 256) { + size_t chunk = sizeof(namebuf) - 1 - off; + if (chunk > 256) { + chunk = 256; + } + if (app_hang_read_task_memory(task, + (mach_vm_address_t)info.imageFilePath + off, + namebuf + off, chunk) + != KERN_SUCCESS) { + break; + } + if (memchr(namebuf + off, '\0', chunk)) { + break; + } + } + namebuf[sizeof(namebuf) - 1] = '\0'; + strncpy(module->name, namebuf, sizeof(module->name) - 1); + } + + /* Read the Mach-O header + load commands to get __TEXT vmsize and + * UUID, mirroring the in-process loop in the signal handler. */ + struct mach_header_64 header; + if (app_hang_read_task_memory( + task, (mach_vm_address_t)base, &header, sizeof(header)) + == KERN_SUCCESS + && (header.magic == MH_MAGIC_64 || header.magic == MH_CIGAM_64)) { + uint32_t ncmds = header.ncmds; + if (ncmds > 256) { + ncmds = 256; + } + /* Read the load-command region in one shot (capped). */ + uint32_t cmds_size = header.sizeofcmds; + if (cmds_size > 0 && cmds_size <= 64 * 1024) { + uint8_t *cmds = sentry_malloc(cmds_size); + if (cmds + && app_hang_read_task_memory(task, + (mach_vm_address_t)base + sizeof(header), cmds, + cmds_size) + == KERN_SUCCESS) { + const uint8_t *p = cmds; + const uint8_t *end = cmds + cmds_size; + bool has_size = false, has_uuid = false; + for (uint32_t j = 0; + j < ncmds && (!has_size || !has_uuid) + && p + sizeof(struct load_command) <= end; + j++) { + const struct load_command *lc + = (const struct load_command *)p; + if (lc->cmdsize == 0 + || p + lc->cmdsize > end) { + break; + } + if (lc->cmd == LC_SEGMENT_64 + && lc->cmdsize >= sizeof(struct segment_command_64)) { + const struct segment_command_64 *seg + = (const struct segment_command_64 *)lc; + if (memcmp(seg->segname, "__TEXT", 7) == 0) { + module->size = seg->vmsize; + has_size = true; + } + } else if (lc->cmd == LC_UUID + && lc->cmdsize >= sizeof(struct uuid_command)) { + const struct uuid_command *uc + = (const struct uuid_command *)lc; + 
memcpy(module->uuid, uc->uuid, 16); + has_uuid = true; + } + p += lc->cmdsize; + } + } + sentry_free(cmds); + } + } + + ctx->module_count++; + } + + SENTRY_DEBUGF( + "app-hang: captured %u modules out-of-process", ctx->module_count); +} + +/* Read the hung thread's stack memory (from SP upward) out-of-process and save + * it to a file, populating threads[0].stack_path / stack_size so the existing + * FP-unwinder in build_stacktrace_for_thread can walk real frames — the same + * file-backed mechanism the signal handler uses for crashes. Best-effort. */ +static void +app_hang_capture_stack( + task_t task, sentry_crash_context_t *ctx, uint64_t sp) +{ + ctx->platform.threads[0].stack_path[0] = '\0'; + ctx->platform.threads[0].stack_size = 0; + if (sp == 0) { + return; + } + + mach_vm_size_t want = SENTRY_CRASH_MAX_STACK_CAPTURE; + uint8_t *buf = sentry_malloc(want); + if (!buf) { + return; + } + + /* Shrink the read until it succeeds — the top of stack may be near a guard + * page, so a full-size read can straddle unmapped memory and fail. 
*/ + mach_vm_size_t got = 0; + while (want >= 4096) { + if (app_hang_read_task_memory(task, (mach_vm_address_t)sp, buf, want) + == KERN_SUCCESS) { + got = want; + break; + } + want /= 2; + } + if (got == 0) { + SENTRY_DEBUG("app-hang: failed to read hung thread stack"); + sentry_free(buf); + return; + } + + char stack_path[SENTRY_CRASH_MAX_PATH]; + int n = snprintf(stack_path, sizeof(stack_path), + "%s/sentry-app-hang-stack-%lu.bin", ctx->database_path, + (unsigned long)ctx->crashed_pid); + if (n < 0 || n >= (int)sizeof(stack_path)) { + sentry_free(buf); + return; + } + int fd = open(stack_path, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd >= 0) { + if (write(fd, buf, (size_t)got) == (ssize_t)got) { + strncpy(ctx->platform.threads[0].stack_path, stack_path, + sizeof(ctx->platform.threads[0].stack_path) - 1); + ctx->platform.threads[0].stack_size = got; + SENTRY_DEBUGF("app-hang: captured %llu bytes of stack", + (unsigned long long)got); + } + close(fd); + } + sentry_free(buf); +} + +/** + * App-hang capture path (macOS). The host is alive but frozen, so unlike a + * crash there is no in-process signal-handler snapshot to fall back on — the + * daemon must sample the hung thread itself. It does so out-of-process via + * `task_for_pid` (the same mechanism the crash "full path" minidump writer + * relies on): locate the Mach thread whose THREAD_IDENTIFIER_INFO.thread_id + * matches the latched target tid, suspend it just long enough to read its + * register state, then resume and build/submit an AppHang envelope using the + * same native-stacktrace path as crashes. + * + * Requires `task_for_pid` to be permitted (same-user, non-hardened local/dev + * builds). On a hardened release runtime without the debugger entitlement it + * is denied; the entitlement-free port-donation replacement is a separate + * follow-up. 
+ */ +static void +capture_and_send_app_hang(const sentry_options_t *options, + sentry_crash_ipc_t *ipc, uint64_t freeze_ms) +{ + /* NOTE (race, same as the Windows variant): this function reads and + * mutates shmem fields (platform.mcontext, threads[0], crashed_tid, + * num_threads) that the host's signal handler also writes on a real + * crash. The daemon loop is single-threaded and processes a pending crash + * before reaching here, so the only remaining window is the host crashing + * mid-capture. Accepted for the spike, same as Windows. */ + sentry_crash_context_t *ctx = ipc->shmem; + + const uint64_t target_tid = ctx->app_hang_target_tid; + + /* Acquire the host task. No in-process snapshot exists for a hang, so a + * failure here means we simply cannot capture this hang. */ + task_t task = MACH_PORT_NULL; + kern_return_t kr + = task_for_pid(mach_task_self(), (int)ctx->crashed_pid, &task); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: task_for_pid(%d) failed: %d (%s) — no " + "snapshot available for a hang", + (int)ctx->crashed_pid, kr, mach_error_string(kr)); + return; + } + + /* Enumerate the host's dyld modules out-of-process so debug_meta is + * populated and frames symbolicate server-side (the in-process signal + * handler that normally does this never runs for a hang). */ + app_hang_capture_modules(task, ctx); + + /* Enumerate threads and find the latched target by its portable tid. 
*/ + thread_act_array_t threads = NULL; + mach_msg_type_number_t thread_count = 0; + kr = task_threads(task, &threads, &thread_count); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: task_threads failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + thread_t target = MACH_PORT_NULL; + for (mach_msg_type_number_t i = 0; i < thread_count; i++) { + thread_identifier_info_data_t id_info; + mach_msg_type_number_t id_count = THREAD_IDENTIFIER_INFO_COUNT; + if (thread_info(threads[i], THREAD_IDENTIFIER_INFO, + (thread_info_t)&id_info, &id_count) + == KERN_SUCCESS + && id_info.thread_id == target_tid) { + target = threads[i]; + } else { + /* Deallocate the ports we are not keeping. */ + mach_port_deallocate(mach_task_self(), threads[i]); + } + } + + if (target == MACH_PORT_NULL) { + SENTRY_DEBUGF("app-hang: target thread tid=%llu not found among %u " + "threads", + (unsigned long long)target_tid, thread_count); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Suspend the target just long enough to read its register state. */ + kr = thread_suspend(target); + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: thread_suspend failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Read the integer register set directly into mcontext.__ss. Use the + * arch-specific flavor (ARM_THREAD_STATE64 / x86_THREAD_STATE64) that + * matches __ss's layout — NOT MACHINE_THREAD_STATE, which is the tagged + * *unified* state (arm_unified_thread_state_t) and would land with the + * wrong layout, yielding garbage IP/FP/SP. 
*/ + _STRUCT_MCONTEXT mcontext; + memset(&mcontext, 0, sizeof(mcontext)); +# if defined(__x86_64__) + mach_msg_type_number_t state_count = x86_THREAD_STATE64_COUNT; + kr = thread_get_state(target, x86_THREAD_STATE64, + (thread_state_t)&mcontext.__ss, &state_count); +# elif defined(__aarch64__) + mach_msg_type_number_t state_count = ARM_THREAD_STATE64_COUNT; + kr = thread_get_state(target, ARM_THREAD_STATE64, + (thread_state_t)&mcontext.__ss, &state_count); +# else + mach_msg_type_number_t state_count = MACHINE_THREAD_STATE_COUNT; + kr = thread_get_state( + target, MACHINE_THREAD_STATE, (thread_state_t)&mcontext.__ss, + &state_count); +# endif + + thread_resume(target); + + if (kr != KERN_SUCCESS) { + SENTRY_DEBUGF("app-hang: thread_get_state failed: %d (%s)", kr, + mach_error_string(kr)); + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + return; + } + + /* Place the snapshot in the "crashed thread" slot of the context so the + * existing event builder pulls a stacktrace and register block out for the + * exception payload and the threads block. + * + * build_stacktrace_from_ctx() (thread_idx == SIZE_MAX) reads from + * ctx->platform.mcontext, while the per-thread register block reads from + * threads[0].state — populate both so the captured registers are used and + * not an all-zero context (PC=0 -> no frames). */ + ctx->platform.mcontext = mcontext; + ctx->crashed_tid = (pid_t)target_tid; + ctx->platform.num_threads = 1; + ctx->platform.threads[0].thread = target; /* port; valid only here */ + ctx->platform.threads[0].tid = target_tid; + ctx->platform.threads[0].state = mcontext; + ctx->platform.threads[0].stack_path[0] = '\0'; + ctx->platform.threads[0].stack_size = 0; + + /* Capture the hung thread's stack (from SP upward) out-of-process so the + * FP-unwinder can walk real frames instead of just the top PC. 
Must happen + * while we still hold the task port. */ + uint64_t target_sp = 0; +# if defined(__x86_64__) + target_sp = mcontext.__ss.__rsp; +# elif defined(__aarch64__) + target_sp = SENTRY__ARM64_GET_SP(mcontext.__ss); +# endif + app_hang_capture_stack(task, ctx, target_sp); + + /* Done reading from the host task; release the Mach ports. */ + mach_port_deallocate(mach_task_self(), target); + vm_deallocate(mach_task_self(), (vm_address_t)threads, + thread_count * sizeof(thread_t)); + mach_port_deallocate(mach_task_self(), task); + + /* Build the per-event value description with the freeze duration. */ + char value_buf[128]; + snprintf(value_buf, sizeof(value_buf), + "App hang detected. Main thread blocked for %llu ms.", + (unsigned long long)freeze_ms); + sentry_native_event_kind_t kind = s_app_hang_kind; + kind.exception_value = value_buf; + + /* Build an envelope path next to the crash one. */ + char envelope_path[SENTRY_CRASH_MAX_PATH]; + int path_len = snprintf(envelope_path, sizeof(envelope_path), + "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path, + (unsigned long)ctx->crashed_pid, + (unsigned long long)ctx->app_hang_last_heartbeat_ms); + + if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) { + SENTRY_WARN("app-hang: envelope path truncated or invalid"); + return; + } + + /* Reuse the host-maintained scope file and run folder so the app-hang + * event carries the same scope context as a crash event (see the Windows + * variant for the detailed rationale). */ + const char *event_file_path + = ctx->event_path[0] ? 
ctx->event_path : NULL; + sentry_path_t *run_folder = NULL; + if (event_file_path) { + sentry_path_t *ev_path = sentry__path_from_str(event_file_path); + if (ev_path) { + run_folder = sentry__path_dir(ev_path); + sentry__path_free(ev_path); + } + } + + bool ok = write_envelope_with_native_stacktrace(options, envelope_path, + ctx, event_file_path, /*minidump_path=*/NULL, run_folder, &kind); + + if (run_folder) { + sentry__path_free(run_folder); + } + + if (!ok) { + SENTRY_WARN("app-hang: failed to write envelope"); + return; + } + + /* Read envelope from disk and hand to transport. */ + sentry_path_t *env_path = sentry__path_from_str(envelope_path); + if (env_path) { + sentry_envelope_t *envelope = sentry__envelope_from_path(env_path); + if (envelope && options && options->transport) { + sentry__capture_envelope(options->transport, envelope, options); + } + sentry__path_remove(env_path); + sentry__path_free(env_path); + } +} +#endif /* SENTRY_PLATFORM_WINDOWS / SENTRY_PLATFORM_MACOS */ + /** * Manually write a Sentry envelope with event, minidump, and attachments. * Format matches what Crashpad's Envelope class does. @@ -2731,21 +3447,26 @@ write_envelope_with_minidump(const sentry_options_t *options, const char *event_msgpack_path, const char *minidump_path, sentry_path_t *run_folder) { - // Read event JSON data + // Read the base event, merge in the breadcrumbs from the ring files, + // re-serialize. 
size_t event_size = 0; char *event_json = NULL; char *event_id = NULL; sentry_path_t *ev_path = sentry__path_from_str(event_msgpack_path); if (ev_path) { - event_json = sentry__path_read_to_buffer(ev_path, &event_size); + size_t base_size = 0; + char *base_json = sentry__path_read_to_buffer(ev_path, &base_size); sentry__path_free(ev_path); - if (event_json && event_size > 0) { + if (base_json && base_size > 0) { sentry_value_t event - = sentry__value_from_json(event_json, event_size); + = sentry__value_from_json(base_json, base_size); + apply_breadcrumbs_from_ring_files(event, run_folder, ctx); event_id = sentry__string_clone(sentry_value_as_string( sentry_value_get_by_key(event, "event_id"))); + event_json = sentry__value_to_json(event, &event_size); sentry_value_decref(event); } + sentry_free(base_json); } // Open envelope file for writing @@ -3208,7 +3929,8 @@ sentry__process_crash(const sentry_options_t *options, sentry_crash_ipc_t *ipc) minidump_path[0] ? minidump_path : "NULL"); envelope_written = write_envelope_with_native_stacktrace(options, envelope_path, ctx, event_path, - minidump_path[0] ? minidump_path : NULL, run_folder); + minidump_path[0] ? minidump_path : NULL, run_folder, + &s_crash_kind); } else { // Mode 0 (MINIDUMP only) SENTRY_DEBUG("Writing envelope with minidump"); @@ -3644,12 +4366,129 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, SENTRY_DEBUG("Entering main loop"); +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) + /* Pre-populate crashed_pid so the app-hang path can reach the host + * out-of-process (OpenProcess on Windows, task_for_pid on macOS). On + * Windows this also feeds capture_modules_from_process and + * walk_stack_with_dbghelp. ctx->crashed_pid is otherwise only set by the + * host's crash handler; the crash handler re-sets it from the host context + * on a real crash — a no-op (same value). 
*/ + ipc->shmem->crashed_pid = (pid_t)app_pid; +#endif + // Daemon main loop bool crash_processed = false; + +#if defined(SENTRY_PLATFORM_WINDOWS) + /* App-hang detector state. Daemon-local; the daemon caches the timeout + * here so it does not race the host on subsequent shmem mutations. */ + const bool app_hang_enabled = ipc->shmem->app_hang_enabled; + const uint64_t app_hang_timeout_ms = ipc->shmem->app_hang_timeout_ms; + uint64_t last_fired_hb = 0; + int consecutive_stale_ticks = 0; + + HANDLE timer = NULL; + if (app_hang_enabled) { + timer = CreateWaitableTimer(NULL, FALSE, NULL); + if (!timer) { + SENTRY_WARNF("app-hang: CreateWaitableTimer failed: %lu", + GetLastError()); + } else { + /* Negative dueTime: relative; 100ns units; -5_000_000 = 500 ms. + * Period 500 ms. */ + LARGE_INTEGER due_time; + due_time.QuadPart = -5000000LL; + if (!SetWaitableTimer( + timer, &due_time, 500, NULL, NULL, FALSE)) { + SENTRY_WARNF("app-hang: SetWaitableTimer failed: %lu", + GetLastError()); + CloseHandle(timer); + timer = NULL; + } + } + } + + /* Wait set: index 0 = crash event, index 1 = timer (optional). */ + HANDLE wait_handles[2]; + DWORD wait_count = 1; + wait_handles[0] = ipc->event_handle; + if (timer) { + wait_handles[1] = timer; + wait_count = 2; + } + + while (true) { + DWORD result = WaitForMultipleObjects(wait_count, wait_handles, + FALSE, SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS); + + if (result == WAIT_OBJECT_0) { + /* Crash notification — identical logic to the cross-platform + * path below. 
*/ + SENTRY_DEBUG("Event signaled, checking crash state"); + long state = sentry__atomic_fetch(&ipc->shmem->state); + if (state == SENTRY_CRASH_STATE_CRASHED && !crash_processed) { + SENTRY_DEBUG("Crash notification received, processing"); + sentry__process_crash(options, ipc); + crash_processed = true; + SENTRY_DEBUG("Crash processed, daemon exiting"); + break; + } + SENTRY_DEBUG("Spurious notification or already processed"); + } else if (timer && result == WAIT_OBJECT_0 + 1) { + /* Timer tick — evaluate app-hang state with strike accumulation. */ + sentry_crash_context_t *shctx = ipc->shmem; + const uint64_t hb = shctx->app_hang_last_heartbeat_ms; + const uint64_t now = sentry__app_hang_now_ms(); + int new_strikes = 0; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + app_hang_enabled, hb, now, app_hang_timeout_ms, + last_fired_hb, consecutive_stale_ticks, &new_strikes); + consecutive_stale_ticks = new_strikes; + if (d == SENTRY_APP_HANG_FIRE) { + capture_and_send_app_hang(options, ipc, now - hb); + /* Always advance last_fired_hb, even if capture failed — + * prevents a retry storm against a wedged thread. The next + * heartbeat advance re-arms detection naturally. */ + last_fired_hb = hb; + } + } else if (result == WAIT_TIMEOUT) { + /* Fall through to parent-liveness check below. */ + } else { + SENTRY_WARNF("daemon wait failed: %lu err=%lu", result, + GetLastError()); + break; + } + + if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { + SENTRY_DEBUG("Parent process exited without crash"); + break; + } + } + + if (timer) { + CancelWaitableTimer(timer); + CloseHandle(timer); + } +#else +# if defined(SENTRY_PLATFORM_MACOS) + /* App-hang detector state. Daemon-local; the timeout is cached here so it + * does not race the host on subsequent shmem mutations. When enabled, the + * loop polls on a short cadence (so it can evaluate the heartbeat each + * tick) instead of the longer health-check interval. 
*/ + const bool app_hang_enabled = ipc->shmem->app_hang_enabled; + const uint64_t app_hang_timeout_ms = ipc->shmem->app_hang_timeout_ms; + uint64_t last_fired_hb = 0; + int consecutive_stale_ticks = 0; + const int wait_timeout_ms = app_hang_enabled + ? 500 + : SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; +# else + const int wait_timeout_ms = SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; +# endif + while (true) { // Wait for crash notification (with timeout to check parent health) - bool wait_result - = sentry__crash_ipc_wait(ipc, SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS); + bool wait_result = sentry__crash_ipc_wait(ipc, wait_timeout_ms); if (wait_result) { // Crash occurred! SENTRY_DEBUG("Event signaled, checking crash state"); @@ -3683,6 +4522,28 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, // If crash already processed, just ignore spurious notifications SENTRY_DEBUG("Spurious notification or already processed"); } +# if defined(SENTRY_PLATFORM_MACOS) + else if (app_hang_enabled && !crash_processed) { + /* No crash notification this wake (timeout or spurious) — evaluate + * the app-hang heartbeat with strike accumulation, mirroring the + * Windows timer tick. */ + sentry_crash_context_t *shctx = ipc->shmem; + const uint64_t hb = shctx->app_hang_last_heartbeat_ms; + const uint64_t now = sentry__app_hang_now_ms(); + int new_strikes = 0; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + app_hang_enabled, hb, now, app_hang_timeout_ms, + last_fired_hb, consecutive_stale_ticks, &new_strikes); + consecutive_stale_ticks = new_strikes; + if (d == SENTRY_APP_HANG_FIRE) { + capture_and_send_app_hang(options, ipc, now - hb); + /* Always advance last_fired_hb, even if capture failed — + * prevents a retry storm against a wedged thread. The next + * heartbeat advance re-arms detection naturally. 
*/ + last_fired_hb = hb; + } + } +# endif // Check if parent is still alive (only if no crash processed yet) if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { @@ -3690,6 +4551,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, break; } } +#endif SENTRY_DEBUG("Daemon exiting"); diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 7768fe030..bcabf415e 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -18,6 +18,7 @@ #include #include "sentry_alloc.h" +#include "sentry_app_hang.h" #include "sentry_backend.h" #include "sentry_core.h" #include "sentry_crash_context.h" @@ -309,9 +310,21 @@ native_backend_startup( ctx->http_retry = options->http_retry; ctx->shutdown_timeout = options->shutdown_timeout; ctx->transfer_timeout = options->transfer_timeout; + ctx->max_breadcrumbs = (uint32_t)options->max_breadcrumbs; sentry__atomic_store( &ctx->user_consent, sentry__atomic_fetch(&options->run->user_consent)); + /* App-hang detection configuration. + * + * NOTE: sentry__app_hang_set_shmem(ctx) is intentionally deferred until + * just before the function's successful `return 0;` below. If a later + * fallible call fails (e.g., daemon spawn) we free the IPC; registering + * the global pointer early would leave it dangling. */ + ctx->app_hang_enabled = options->app_hang_enabled; + ctx->app_hang_timeout_ms = options->app_hang_timeout_ms; + ctx->app_hang_target_tid = 0; + ctx->app_hang_last_heartbeat_ms = 0; + // Set up event and breadcrumb paths sentry_path_t *run_path = options->run->run_path; sentry_path_t *db_path = options->database_path; @@ -552,6 +565,14 @@ native_backend_startup( } #endif +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) + /* Make this shmem block visible to sentry_app_hang_heartbeat now that + * all fallible startup steps have succeeded. 
If any earlier step had + * failed we would have freed the IPC and returned without ever + * registering — keeping g_app_hang_shmem == NULL. */ + sentry__app_hang_set_shmem(ctx); +#endif + SENTRY_DEBUG("native backend started successfully"); return 0; } @@ -667,6 +688,11 @@ native_backend_shutdown(sentry_backend_t *backend) // Cleanup IPC if (state->ipc) { +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) + /* Clear the global heartbeat pointer before the shmem backing it goes + * away, so sentry_app_hang_heartbeat() cannot write to freed memory. */ + sentry__app_hang_set_shmem(NULL); +#endif sentry__crash_ipc_free(state->ipc); state->ipc = NULL; // Prevent use-after-free } @@ -817,7 +843,8 @@ native_backend_flush_scope( return; } - // Create event with current scope + // Create event with current scope. The daemon also reads this base event + // at app-hang time on Windows, so keep it current. sentry_value_t event = sentry_value_new_object(); sentry_value_set_by_key( event, "level", sentry__value_new_level(SENTRY_LEVEL_FATAL)); @@ -867,18 +894,20 @@ native_backend_add_breadcrumb(sentry_backend_t *backend, return; } - // Serialize to JSON (so it can be deserialized on next start) - size_t json_len = 0; - char *json_str = sentry__value_to_json(breadcrumb, &json_len); - if (!json_str) { + // Append as msgpack, matching the crashpad backend. msgpack values are + // self-delimiting, so the daemon can read the concatenated ring file back + // into a list via `sentry__value_from_msgpack`. + size_t mpack_size = 0; + char *mpack = sentry_value_to_msgpack(breadcrumb, &mpack_size); + if (!mpack) { return; } int rv = first_breadcrumb - ? sentry__path_write_buffer(breadcrumb_file, json_str, json_len) - : sentry__path_append_buffer(breadcrumb_file, json_str, json_len); + ? 
sentry__path_write_buffer(breadcrumb_file, mpack, mpack_size) + : sentry__path_append_buffer(breadcrumb_file, mpack, mpack_size); - sentry_free(json_str); + sentry_free(mpack); if (rv != 0) { SENTRY_WARN("failed to write breadcrumb"); diff --git a/src/sentry_app_hang.c b/src/sentry_app_hang.c new file mode 100644 index 000000000..d297457be --- /dev/null +++ b/src/sentry_app_hang.c @@ -0,0 +1,223 @@ +/* pthread_threadid_np() and CLOCK_UPTIME_RAW are Darwin extensions hidden when + * a strict POSIX feature macro (e.g. _XOPEN_SOURCE, set transitively by + * sentry_crash_context.h) is active. Re-expose them before any include. */ +#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE) +# define _DARWIN_C_SOURCE +#endif + +#include "sentry_app_hang.h" + +#include "sentry_options.h" + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) +# if defined(SENTRY_PLATFORM_WINDOWS) +# include +# elif defined(SENTRY_PLATFORM_MACOS) +# include +# include +# include +# endif +#endif + +sentry_app_hang_decision_t +sentry__app_hang_decide(bool enabled, uint64_t hb, uint64_t now, + uint64_t timeout_ms, uint64_t last_fired_hb, + int consecutive_stale_ticks, int *out_consecutive_stale_ticks) +{ + /* Fresh or disabled paths reset the counter. */ + if (!enabled || hb == 0) { + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if (now < hb) { + /* Torn shmem read (possible on x86 for a non-atomic 64-bit load). + * Treat as fresh — daemon will see the real value on the next tick. */ + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if ((now - hb) < timeout_ms) { + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + if (hb == last_fired_hb) { + /* Already fired for this freeze. Stay quiet and hold the counter at + * zero so we re-arm cleanly once the host heartbeats again. */ + *out_consecutive_stale_ticks = 0; + return SENTRY_APP_HANG_NO_ACTION; + } + /* Stale and not in cooldown — accumulate a strike. 
*/ + int new_count = consecutive_stale_ticks + 1; + *out_consecutive_stale_ticks = new_count; + if (new_count >= SENTRY_APP_HANG_STRIKES_REQUIRED) { + return SENTRY_APP_HANG_FIRE; + } + return SENTRY_APP_HANG_NO_ACTION; +} + +/* Public setters (always compiled, no platform guard — they only mutate the + * options struct). */ +void +sentry_options_set_app_hang_enabled(sentry_options_t *opts, int enabled) +{ + if (opts) { + opts->app_hang_enabled = !!enabled; + } +} + +void +sentry_options_set_app_hang_timeout_ms( + sentry_options_t *opts, uint64_t timeout_ms) +{ + if (opts) { + opts->app_hang_timeout_ms = timeout_ms; + } +} + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) + +static sentry_crash_context_t *volatile g_app_hang_shmem = NULL; + +void +sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) +{ + g_app_hang_shmem = ctx; +} + +# if defined(SENTRY_PLATFORM_WINDOWS) + +uint64_t +sentry__app_hang_now_ms(void) +{ + ULONGLONG ticks_100ns = 0; + /* QueryUnbiasedInterruptTime is documented signal/SEH/wait-free; the + * same source is read on both sides of the IPC. */ + if (!QueryUnbiasedInterruptTime(&ticks_100ns)) { + return 0; + } + return (uint64_t)(ticks_100ns / 10000ULL); +} + +void +sentry_app_hang_set_target_thread(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* CAS the current TID into the latch slot iff still unset. If another + * thread races and wins, our call is silently dropped — the API contract + * is "first caller wins, idempotent for that caller". CAS (rather than a + * plain store) prevents a late call from a different thread from + * silently overwriting a prior latch. 
*/ + DWORD current_tid = GetCurrentThreadId(); + InterlockedCompareExchange64((LONG64 volatile *)&ctx->app_hang_target_tid, + (LONG64)(uint64_t)current_tid, 0); +} + +void +sentry_app_hang_heartbeat(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* Refresh-only: requires a prior sentry_app_hang_set_target_thread() + * call from this thread. Drops the heartbeat if no target is latched, + * or if the latched thread is not us. The non-atomic read can tear on + * x86; in that case the compare fails and we drop a heartbeat, which + * the daemon's strike counter absorbs. */ + DWORD current_tid = GetCurrentThreadId(); + uint64_t latched = ctx->app_hang_target_tid; + if (latched == 0 || (DWORD)latched != current_tid) { + return; + } + + /* Relaxed 64-bit store. On x64 this is a single mov. On x86 the value + * may tear, but that is OK — see the comment in sentry_crash_context.h. */ + ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); +} + +# elif defined(SENTRY_PLATFORM_MACOS) + +uint64_t +sentry__app_hang_now_ms(void) +{ + /* CLOCK_UPTIME_RAW is the macOS analogue of Windows' + * QueryUnbiasedInterruptTime: a monotonic clock that excludes time the + * system was asleep, read identically by host and daemon. */ + struct timespec ts; + if (clock_gettime(CLOCK_UPTIME_RAW, &ts) != 0) { + return 0; + } + return (uint64_t)ts.tv_sec * 1000ULL + (uint64_t)ts.tv_nsec / 1000000ULL; +} + +void +sentry_app_hang_set_target_thread(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* Obtain the portable 64-bit Mach thread id of the current thread; this + * is the same value the daemon matches against via + * thread_info(THREAD_IDENTIFIER_INFO). 
*/ + uint64_t current_tid = 0; + if (pthread_threadid_np(NULL, ¤t_tid) != 0 || current_tid == 0) { + return; + } + + /* CAS the current TID into the latch slot iff still unset — first caller + * wins, idempotent for that caller. The shmem field is declared + * `volatile uint64_t`; view it as an atomic for the compare-exchange. */ + _Atomic uint64_t *slot + = (_Atomic uint64_t *)(void *)&ctx->app_hang_target_tid; + uint64_t expected = 0; + atomic_compare_exchange_strong(slot, &expected, current_tid); +} + +void +sentry_app_hang_heartbeat(void) +{ + sentry_crash_context_t *ctx = g_app_hang_shmem; + if (!ctx || !ctx->app_hang_enabled) { + return; + } + + /* Refresh-only: requires a prior sentry_app_hang_set_target_thread() + * call from this thread. Drops the heartbeat if no target is latched, or + * if the latched thread is not us. */ + uint64_t current_tid = 0; + if (pthread_threadid_np(NULL, ¤t_tid) != 0 || current_tid == 0) { + return; + } + uint64_t latched = ctx->app_hang_target_tid; + if (latched == 0 || latched != current_tid) { + return; + } + + /* Relaxed 64-bit store; aligned on a 64-bit target so it is atomic and + * cannot tear. The daemon reads it with a relaxed load. */ + ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); +} + +# endif + +#else /* host heartbeat not supported on this target */ + +void +sentry_app_hang_set_target_thread(void) +{ + /* No-op on non-Windows targets in this initial cut. */ +} + +void +sentry_app_hang_heartbeat(void) +{ + /* No-op on non-Windows targets in this initial cut. */ +} + +#endif diff --git a/src/sentry_app_hang.h b/src/sentry_app_hang.h new file mode 100644 index 000000000..079950861 --- /dev/null +++ b/src/sentry_app_hang.h @@ -0,0 +1,79 @@ +#ifndef SENTRY_APP_HANG_H_INCLUDED +#define SENTRY_APP_HANG_H_INCLUDED + +#include "sentry_boot.h" + +#include +#include + +/* The host-side heartbeat machinery (clock, latch, shmem registration) is + * available on the native backend on Windows (non-Xbox) and macOS. 
Linux and + * other targets fall back to no-op stubs. */ +#if (((defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX)) \ + || defined(SENTRY_PLATFORM_MACOS))) \ + && defined(SENTRY_BACKEND_NATIVE) +# define SENTRY_APP_HANG_HOST_SUPPORTED 1 +#endif + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) +# include "sentry_crash_context.h" +#endif + +/** + * Decision returned by the pure decision function. Kept tiny so it can be + * exercised in unit tests without involving the daemon or shared memory. + */ +typedef enum { + SENTRY_APP_HANG_NO_ACTION = 0, + SENTRY_APP_HANG_FIRE = 1, +} sentry_app_hang_decision_t; + +/* Number of consecutive timer ticks the daemon must observe a stale + * heartbeat before firing. Smooths over brief hiccups (GC pauses, swap, OS + * scheduler quanta) at the cost of ~SENTRY_APP_HANG_STRIKES_REQUIRED-1 + * extra poll periods of detection latency. */ +#define SENTRY_APP_HANG_STRIKES_REQUIRED 3 + +/** + * Pure function: should we fire an app-hang event right now? + * + * - `enabled`: the host has app-hang detection turned on. + * - `hb`: last heartbeat timestamp (host clock; 0 means + * "never heartbeated yet"). + * - `now`: daemon's current observation of the same clock. + * - `timeout_ms`: staleness threshold. + * - `last_fired_hb`: the `hb` value the daemon last fired for; used + * as cooldown so a sustained freeze fires once. + * - `consecutive_stale_ticks`: caller-tracked count of consecutive ticks on + * which the heartbeat was observed stale. + * - `out_consecutive_stale_ticks` (out): updated counter the caller should + * store. 0 if reset, otherwise incremented. + * + * Returns SENTRY_APP_HANG_FIRE iff: enabled, hb != 0, (now - hb) >= timeout_ms, + * hb != last_fired_hb, AND the updated stale-tick counter reaches + * SENTRY_APP_HANG_STRIKES_REQUIRED. 
+ */ +sentry_app_hang_decision_t sentry__app_hang_decide(bool enabled, uint64_t hb, + uint64_t now, uint64_t timeout_ms, uint64_t last_fired_hb, + int consecutive_stale_ticks, int *out_consecutive_stale_ticks); + +#if defined(SENTRY_APP_HANG_HOST_SUPPORTED) +/** + * Called from the native backend startup path. Stores `ctx` so that + * subsequent `sentry_app_hang_heartbeat()` calls have somewhere to write. + * Passing NULL clears the registration on backend shutdown. + * + * The pointer is stored in a `volatile` global; ordering with shmem field + * initialization is the caller's responsibility (the backend writes options + * into shmem before calling this). + */ +void sentry__app_hang_set_shmem(sentry_crash_context_t *ctx); + +/** + * Return a millisecond-resolution unbiased timestamp shared between host and + * daemon. Exposed for the daemon to call as well. + */ +uint64_t sentry__app_hang_now_ms(void); +#endif + +#endif diff --git a/src/sentry_options.c b/src/sentry_options.c index cb5bb936a..f43a82f11 100644 --- a/src/sentry_options.c +++ b/src/sentry_options.c @@ -67,6 +67,8 @@ sentry_options_new(void) opts->propagate_traceparent = false; opts->strict_trace_continuation = false; opts->crashpad_limit_stack_capture_to_sp = false; + opts->app_hang_enabled = false; + opts->app_hang_timeout_ms = 5000; opts->enable_metrics = true; opts->enable_logs = true; opts->cache_keep = SENTRY_CACHE_KEEP_NONE; diff --git a/src/sentry_options.h b/src/sentry_options.h index 6f64bba43..8696f0f05 100644 --- a/src/sentry_options.h +++ b/src/sentry_options.h @@ -51,6 +51,8 @@ struct sentry_options_s { bool propagate_traceparent; bool strict_trace_continuation; bool crashpad_limit_stack_capture_to_sp; + bool app_hang_enabled; + uint64_t app_hang_timeout_ms; sentry_cache_keep_t cache_keep; time_t cache_max_age; diff --git a/tests/test_integration_native.py b/tests/test_integration_native.py index a694d1848..18a86eb48 100644 --- a/tests/test_integration_native.py +++ 
@pytest.mark.skipif(
    sys.platform not in ("win32", "darwin"),
    reason="app-hang detection is implemented on Windows and macOS",
)
def test_native_app_hang(cmake, httpserver):
    """App hang detection emits an ApplicationNotResponding event.

    On macOS the daemon samples the hung thread out-of-process via
    ``task_for_pid``, which requires the example + daemon to be ad-hoc
    codesigned with the debugger entitlement (same setup as the SMART-mode
    heap test). If the capture still cannot acquire the task port in this
    environment the daemon degrades gracefully and ships nothing — the test
    skips rather than fails in that case.
    """
    is_macos = sys.platform == "darwin"

    # macOS hardened-runtime self-signing needs a static build so the example
    # can load itself without the dyld "different team IDs" check tripping on
    # ad-hoc-signed dylibs (mirrors the SMART-mode heap test).
    config = {"SENTRY_BACKEND": "native"}
    if is_macos:
        config["BUILD_SHARED_LIBS"] = "OFF"
    tmp_path = cmake(["sentry_example"], config)

    if is_macos:
        _codesign_for_task_for_pid(
            str(tmp_path / "sentry_example"),
            str(tmp_path / "sentry-crash"),
        )

    httpserver.expect_oneshot_request("/api/123456/envelope/").respond_with_data(
        "OK"
    )

    # `raise_assertions=False`: by default the wait context raises an
    # AssertionError itself when the timeout elapses, which would make the
    # macOS skip below unreachable. With it disabled, a timeout only sets
    # `waiting.result` to False and the explicit checks below decide between
    # skip (macOS) and failure (Windows).
    with httpserver.wait(timeout=20, raise_assertions=False) as waiting:
        # The example's app-hang mode heartbeats for 500 ms, then freezes for
        # 3000 ms (3x the 1000 ms timeout). The daemon polls every 500 ms.
        # `run` (not `run_crash`) because the example exits cleanly after the
        # hang demonstration — `run_crash` expects abnormal exit.
        run(
            tmp_path,
            "sentry_example",
            ["log", "app-hang"],
            env=dict(os.environ, SENTRY_DSN=make_dsn(httpserver)),
        )

    if is_macos and not waiting.result:
        pytest.skip(
            "no app-hang envelope received — task_for_pid is likely denied "
            "in this environment (hardened-runtime/SIP); capture degraded "
            "gracefully"
        )
    assert waiting.result

    envelope = Envelope.deserialize(httpserver.log[0][0].get_data())
    event = envelope.get_event()
    assert event is not None
    exc = event["exception"]["values"][0]
    assert exc["type"] == "ApplicationNotResponding"
    assert exc["mechanism"]["type"] == "AppHang"
    assert exc["mechanism"]["handled"] is True
    assert exc["mechanism"]["synthetic"] is True
    assert "stacktrace" in exc
    frames = exc["stacktrace"]["frames"]
    assert isinstance(frames, list)
    assert len(frames) > 0, "stacktrace is empty — capture path may be broken"
    # At least one frame should have a non-zero instruction address.
    assert any(
        int(f.get("instruction_addr", "0"), 16) > 0 for f in frames
    ), "no frame has a non-zero instruction_addr"
SENTRY_APP_HANG_NO_ACTION); + /* Disabled path resets the counter. */ + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_no_heartbeat_yet_returns_no_action) +{ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/0, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_fresh_heartbeat_returns_no_action_and_resets) +{ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9500, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + /* Fresh heartbeat resets the strike counter even mid-accumulation. */ + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_first_stale_tick_increments_does_not_fire) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 1); +} + +SENTRY_TEST(app_hang_decide_second_stale_tick_increments_does_not_fire) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/1, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 2); +} + +SENTRY_TEST(app_hang_decide_third_stale_tick_fires) +{ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + 
/*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(new_count, 3); +} + +SENTRY_TEST(app_hang_decide_brief_hiccup_resets_strike_count) +{ + /* Simulate: 2 stale ticks, then a fresh heartbeat (counter resets), + * then 1 stale tick → must NOT fire because we lost our accumulated + * strikes when the heartbeat refreshed. */ + int after_hiccup = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9800, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &after_hiccup); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_hiccup, 0); + + int after_one_stale = -1; + d = sentry__app_hang_decide(/*enabled=*/true, /*hb=*/9800, + /*now=*/11000, /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/after_hiccup, &after_one_stale); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_one_stale, 1); +} + +SENTRY_TEST(app_hang_decide_cooldown_holds_when_hb_unchanged) +{ + /* Already fired for hb=5000. Subsequent ticks must NOT re-fire even + * if 100 more stale ticks accumulate. Counter held at 0. */ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/5000, /*now=*/20000, + /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/0, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} + +SENTRY_TEST(app_hang_decide_re_arms_after_advance_then_stall) +{ + /* hb advanced past last_fired_hb → cooldown released; need 3 fresh + * strikes again. 
*/ + int after_strike1 = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/7000, /*now=*/12000, + /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/0, &after_strike1); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(after_strike1, 1); + + int after_strike3 = -1; + d = sentry__app_hang_decide(/*enabled=*/true, /*hb=*/7000, + /*now=*/12000, /*timeout_ms=*/1000, /*last_fired_hb=*/5000, + /*consecutive_stale_ticks=*/2, &after_strike3); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(after_strike3, 3); +} + +SENTRY_TEST(app_hang_decide_exact_timeout_boundary_with_third_strike_fires) +{ + /* now - hb == timeout_ms is still stale (>= semantics) AND the third + * strike has accumulated — fires. */ + int new_count = -1; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/9000, /*now=*/10000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_FIRE); + TEST_CHECK_INT_EQUAL(new_count, 3); +} + +SENTRY_TEST(app_hang_decide_torn_read_now_less_than_hb_resets) +{ + /* On x86 a non-atomic 64-bit load can tear, producing now < hb. The + * decision function treats this as fresh (no FIRE) and resets the + * strike counter so the next non-torn observation starts clean. 
*/ + int new_count = 99; + sentry_app_hang_decision_t d = sentry__app_hang_decide( + /*enabled=*/true, /*hb=*/10000, /*now=*/5000, + /*timeout_ms=*/1000, /*last_fired_hb=*/0, + /*consecutive_stale_ticks=*/2, &new_count); + TEST_CHECK_INT_EQUAL(d, SENTRY_APP_HANG_NO_ACTION); + TEST_CHECK_INT_EQUAL(new_count, 0); +} diff --git a/tests/unit/tests.inc b/tests/unit/tests.inc index 9770f857f..686f7e3e4 100644 --- a/tests/unit/tests.inc +++ b/tests/unit/tests.inc @@ -1,3 +1,14 @@ +XX(app_hang_decide_brief_hiccup_resets_strike_count) +XX(app_hang_decide_cooldown_holds_when_hb_unchanged) +XX(app_hang_decide_disabled_returns_no_action) +XX(app_hang_decide_exact_timeout_boundary_with_third_strike_fires) +XX(app_hang_decide_first_stale_tick_increments_does_not_fire) +XX(app_hang_decide_fresh_heartbeat_returns_no_action_and_resets) +XX(app_hang_decide_no_heartbeat_yet_returns_no_action) +XX(app_hang_decide_re_arms_after_advance_then_stall) +XX(app_hang_decide_second_stale_tick_increments_does_not_fire) +XX(app_hang_decide_third_stale_tick_fires) +XX(app_hang_decide_torn_read_now_less_than_hb_resets) XX(assert_sdk_name) XX(assert_sdk_user_agent) XX(assert_sdk_version)