From e3307778b1ce697b4d4bfad39fa27fdcb5609c2b Mon Sep 17 00:00:00 2001 From: Chris Portscheller Date: Tue, 30 Jun 2026 20:06:45 -0500 Subject: [PATCH 1/4] feat(detection): stealth-browser detection for botasaurus-class scrapers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add native-function lie/tampering detection (new 'stealth' category), AudioContext scoring, and broaden software-WebGL detection to match the client's suspiciousRenderer set. Targets botasaurus browser-mode evasions that strip navigator.webdriver but leave patched natives. - client: collect native-function integrity (_getLieDetection) - detection: analyzeLies (stealth category), analyzeAudioContext - weights: add stealth category (0.30) — 0 for clean browsers, no FP - harness: signal-profile measurement (packages/webdecoy/harness) Measured (synthetic fixtures): headful botasaurus 24% (allow) -> 52% (challenge), cloud 33% -> 57%; real Chrome unchanged at 6%. 58 detection tests pass. Fixtures are synthetic; live-botasaurus validation pending. 
--- packages/client/src/collectors/environment.ts | 54 +++++ packages/webdecoy/harness/fixtures.ts | 214 ++++++++++++++++++ packages/webdecoy/harness/measure.ts | 70 ++++++ .../src/detection/detectors/advanced.ts | 4 + .../webdecoy/src/detection/detectors/audio.ts | 44 ++++ .../src/detection/detectors/headless.ts | 11 +- .../webdecoy/src/detection/detectors/index.ts | 2 + .../webdecoy/src/detection/detectors/lies.ts | 35 +++ packages/webdecoy/src/detection/types.ts | 23 ++ packages/webdecoy/src/detection/weights.ts | 12 +- 10 files changed, 466 insertions(+), 3 deletions(-) create mode 100644 packages/webdecoy/harness/fixtures.ts create mode 100644 packages/webdecoy/harness/measure.ts create mode 100644 packages/webdecoy/src/detection/detectors/audio.ts create mode 100644 packages/webdecoy/src/detection/detectors/lies.ts diff --git a/packages/client/src/collectors/environment.ts b/packages/client/src/collectors/environment.ts index 3d6c795..69c82c4 100644 --- a/packages/client/src/collectors/environment.ts +++ b/packages/client/src/collectors/environment.ts @@ -29,6 +29,9 @@ export class EnvironmentalCollector { cssMediaQueries: this._getCSSMediaQueries(), permissionsInfo: this._getPermissionsInfo(), fontsInfo: this._getFontsInfo(), + + // Native-function integrity (stealth self-hiding leaves patched toStrings) + lieDetection: this._getLieDetection(), }; } @@ -277,6 +280,57 @@ export class EnvironmentalCollector { } } + /** + * Native-function integrity check. Stealth automation hides itself by + * overriding native functions; a patched native's `toString()` no longer + * reports `[native code]`. A genuine browser never patches its own natives, + * so any hit here is deliberate evasion (scored in the `stealth` category). 
+ */ + _getLieDetection(): Record { + try { + const w = window as any; + const patched: string[] = []; + + const isPatched = (fn: unknown): boolean => { + try { + return typeof fn === 'function' && (fn as { toString(): string }).toString().indexOf('[native code]') === -1; + } catch { + return false; + } + }; + const check = (fn: unknown, name: string): void => { + if (isPatched(fn)) patched.push(name); + }; + + // If toString itself is patched, every other check is unreliable — flag it. + check(Function.prototype.toString, 'Function.prototype.toString'); + check(navigator.permissions && navigator.permissions.query, 'navigator.permissions.query'); + check(w.Notification && w.Notification.requestPermission, 'Notification.requestPermission'); + check(w.HTMLCanvasElement && w.HTMLCanvasElement.prototype.toDataURL, 'HTMLCanvasElement.toDataURL'); + check( + w.WebGLRenderingContext && w.WebGLRenderingContext.prototype.getParameter, + 'WebGLRenderingContext.getParameter', + ); + check(navigator.mediaDevices && navigator.mediaDevices.enumerateDevices, 'mediaDevices.enumerateDevices'); + + // The navigator.webdriver getter is the most-patched native. + try { + const desc = + Object.getOwnPropertyDescriptor(Object.getPrototypeOf(navigator), 'webdriver') || + Object.getOwnPropertyDescriptor(navigator, 'webdriver'); + if (desc && typeof desc.get === 'function' && desc.get.toString().indexOf('[native code]') === -1) { + patched.push('navigator.webdriver getter'); + } + } catch { + // ignore + } + + return { supported: true, patched, patchedCount: patched.length }; + } catch (e) { + return { supported: false }; + } + } + _getHeadlessIndicators(): Record { const nav = navigator as any; return { diff --git a/packages/webdecoy/harness/fixtures.ts b/packages/webdecoy/harness/fixtures.ts new file mode 100644 index 0000000..572ad55 --- /dev/null +++ b/packages/webdecoy/harness/fixtures.ts @@ -0,0 +1,214 @@ +/** + * Signal fixtures for the stealth-detection harness. 
+ * + * IMPORTANT: these are *synthetic approximations* of what each client emits, + * used for fast, deterministic regression measurement. They encode our current + * model of botasaurus's output — they are NOT proof we catch the real tool. + * Ground truth comes from the live harness (server.ts + page.html) with a real + * botasaurus run. Treat fixture scores as "does the scoring logic fire on these + * signal shapes", not "we catch botasaurus". + */ + +import type { Signals } from '../src/detection/types'; + +/** A real Chrome-on-Windows User-Agent. */ +export const CHROME_UA = + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'; + +/** A real Chrome-on-Linux UA (botasaurus commonly runs on Linux servers). */ +export const CHROME_LINUX_UA = + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'; + +/** Identical human-like behavioral block shared by the browser profiles so + * behavioral scoring is held constant and only environmental signals vary. */ +const humanBehavioral = { + totalPoints: 120, + trajectoryLength: 3200, + approachPoints: 40, + velocityVariance: 0.8, + straightLineRatio: 0.34, + directionChanges: 42, + eventDeltaVariance: 15, + clickPrecision: 0.55, + approachDirectness: 0.5, + explorationRatio: 0.45, + mouseEventRate: 28, + interactionDuration: 8200, + microTremorScore: 0.6, + overshootCorrections: 3, + keyEvents: 24, + touchEvents: 0, +}; + +const humanTemporal = { pageLoadToFirstInteraction: 1300 }; + +/** Realistic headers a real browser sends. 
*/ +export const REAL_HEADERS: Record = { + accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'accept-language': 'en-US,en;q=0.9', + 'accept-encoding': 'gzip, deflate, br', +}; + +// --------------------------------------------------------------------------- +// Profile 1 — real Chrome (baseline; must score low / allow) +// --------------------------------------------------------------------------- +export const realChrome: Signals = { + behavioral: humanBehavioral, + temporal: humanTemporal, + environmental: { + webdriver: false, + automationFlags: { + plugins: 5, + languages: true, + chrome: true, + chromeRuntime: true, + platform: 'Win32', + maxTouchPoints: 0, + hardwareConcurrency: 8, + }, + navigator: { platform: 'Win32', maxTouchPoints: 0 }, + webglInfo: { + supported: true, + vendor: 'Google Inc. (NVIDIA)', + renderer: 'ANGLE (NVIDIA, NVIDIA GeForce RTX 3060 Direct3D11 vs_5_0 ps_5_0, D3D11)', + suspiciousRenderer: false, + }, + audioInfo: { supported: true, sampleRate: 48000, state: 'suspended', baseLatency: 0.005333 }, + headlessIndicators: { + hasOuterDimensions: true, + innerEqualsOuter: false, + notificationPermission: 'default', + }, + cdp: { detected: false, signals: [] }, + playwright: { detected: false, signals: [] }, + workerConsistency: { supported: true, consistent: true, mismatchCount: 0, mismatches: [] }, + permissionsInfo: { + supported: true, + hasPermissionsAPI: true, + hasClipboard: true, + hasCredentials: true, + hasGeolocation: true, + hasUsb: true, + }, + fontsInfo: { supported: true, count: 24, hasSegoeUI: true }, + canvasHash: { hash: '3f2a9c', supported: true }, + lieDetection: { supported: true, patched: [], patchedCount: 0 }, + }, +}; + +// --------------------------------------------------------------------------- +// Profile 2a — botasaurus browser mode, CLOUD/headless (no GPU) +// The catchable case: software WebGL + mocked audio + worker mismatch + +// patched webdriver getter + 
patched native functions. +// --------------------------------------------------------------------------- +export const botasaurusBrowserCloud: Signals = { + behavioral: humanBehavioral, // botasaurus does "humanized" movement — held constant + temporal: humanTemporal, + environmental: { + webdriver: false, // stripped + automationFlags: { + plugins: 3, + languages: true, + chrome: true, + chromeRuntime: false, // headless tell + platform: 'Linux x86_64', + maxTouchPoints: 0, + hardwareConcurrency: 4, + }, + navigator: { platform: 'Linux x86_64', maxTouchPoints: 0 }, + webglInfo: { + supported: true, + vendor: 'Google Inc.', + renderer: 'Google SwiftShader', + suspiciousRenderer: true, + }, + audioInfo: { supported: true, sampleRate: 44100, state: 'suspended' /* baseLatency missing */ }, + headlessIndicators: { + hasOuterDimensions: true, + innerEqualsOuter: false, + notificationPermission: 'default', + }, + cdp: { detected: true, signals: ['webdriver_getter_modified'] }, + playwright: { detected: true, signals: ['webdriver_configurable'] }, + workerConsistency: { + supported: true, + consistent: false, + mismatchCount: 2, + mismatches: ['userAgent', 'hardwareConcurrency'], + }, + permissionsInfo: { supported: true, hasPermissionsAPI: true, hasClipboard: true, hasGeolocation: true }, + fontsInfo: { supported: true, count: 8, hasSegoeUI: false }, + canvasHash: { hash: 'a91b02', supported: true }, + lieDetection: { + supported: true, + patched: ['Function.prototype.toString', 'navigator.permissions.query'], + patchedCount: 2, + }, + }, +}; + +// --------------------------------------------------------------------------- +// Profile 2b — botasaurus browser mode, DESKTOP/headful (real GPU) +// The HARD case: real WebGL + real audio + consistent workers. The ONLY +// surviving tells are the patched webdriver getter and patched native +// functions (lie detection). Shows why the lie detector matters. 
+// --------------------------------------------------------------------------- +export const botasaurusBrowserDesktop: Signals = { + behavioral: humanBehavioral, + temporal: humanTemporal, + environmental: { + webdriver: false, + automationFlags: { + plugins: 5, + languages: true, + chrome: true, + chromeRuntime: true, + platform: 'Win32', + maxTouchPoints: 0, + hardwareConcurrency: 8, + }, + navigator: { platform: 'Win32', maxTouchPoints: 0 }, + webglInfo: { + supported: true, + vendor: 'Google Inc. (NVIDIA)', + renderer: 'ANGLE (NVIDIA, NVIDIA GeForce RTX 3060 Direct3D11 vs_5_0 ps_5_0, D3D11)', + suspiciousRenderer: false, + }, + audioInfo: { supported: true, sampleRate: 48000, state: 'suspended', baseLatency: 0.005333 }, + headlessIndicators: { + hasOuterDimensions: true, + innerEqualsOuter: false, + notificationPermission: 'default', + }, + // The stealth patches that survive even on real hardware: + cdp: { detected: true, signals: ['webdriver_getter_modified'] }, + playwright: { detected: true, signals: ['webdriver_configurable'] }, + workerConsistency: { supported: true, consistent: true, mismatchCount: 0, mismatches: [] }, + permissionsInfo: { supported: true, hasPermissionsAPI: true, hasClipboard: true, hasGeolocation: true, hasUsb: true }, + fontsInfo: { supported: true, count: 22, hasSegoeUI: true }, + canvasHash: { hash: 'c40dd1', supported: true }, + lieDetection: { + supported: true, + patched: ['Function.prototype.toString', 'navigator.permissions.query', 'WebGLRenderingContext.getParameter'], + patchedCount: 3, + }, + }, +}; + +// --------------------------------------------------------------------------- +// Profile 3 — botasaurus request mode (@request): no JS executed. +// No environmental/behavioral signals at all. This is the F2 territory +// (TLS + beacon-absence). Included to show what F1 alone does with it. 
+// --------------------------------------------------------------------------- +export const botasaurusRequest: Signals = { + // No behavioral, no environmental — the collector never ran. + meta: {}, +}; + +export const PROFILES: Array<{ name: string; signals: Signals; ua: string; headers: Record }> = [ + { name: 'real-chrome (baseline)', signals: realChrome, ua: CHROME_UA, headers: REAL_HEADERS }, + { name: 'botasaurus-browser-cloud', signals: botasaurusBrowserCloud, ua: CHROME_LINUX_UA, headers: REAL_HEADERS }, + { name: 'botasaurus-browser-desktop', signals: botasaurusBrowserDesktop, ua: CHROME_UA, headers: REAL_HEADERS }, + // request mode ships browser-like headers too; only accept-language sometimes differs. + { name: 'botasaurus-request (no JS)', signals: botasaurusRequest, ua: CHROME_UA, headers: REAL_HEADERS }, +]; diff --git a/packages/webdecoy/harness/measure.ts b/packages/webdecoy/harness/measure.ts new file mode 100644 index 0000000..54388cf --- /dev/null +++ b/packages/webdecoy/harness/measure.ts @@ -0,0 +1,70 @@ +/** + * Stealth-detection measurement harness. + * + * Runs each signal profile in `fixtures.ts` through the REAL DetectionEngine + * and prints score, recommendation, and every triggered detection. Use it to + * establish a baseline before changes and to measure the delta after. + * + * npx tsx harness/measure.ts + * + * `requirePoW` is disabled so we score raw signals (the PoW hard-fail would + * otherwise dominate and mask the fingerprint signal we're measuring). 
+ */ + +import { DetectionEngine } from '../src/detection/engine'; +import type { Detection } from '../src/detection/types'; +import { PROFILES } from './fixtures'; + +const engine = new DetectionEngine({ requirePoW: false }); + +function fmtPct(n: number): string { + return (n * 100).toFixed(0).padStart(3) + '%'; +} + +function recBadge(rec: string): string { + if (rec === 'block') return 'BLOCK'; + if (rec === 'challenge') return 'CHALLENGE'; + return 'allow'; +} + +const summary: Array<{ name: string; score: number; rec: string; hits: number }> = []; + +for (const profile of PROFILES) { + // A controlled, non-datacenter IP so IP reputation doesn't confound the run. + const verdict = engine.score(profile.signals, { + ip: '203.0.113.10', + siteKey: 'harness', + userAgent: profile.ua, + headers: profile.headers, + }); + + const triggered = verdict.detections.filter((d: Detection) => d.score > 0); + triggered.sort((a, b) => b.score * b.confidence - a.score * a.confidence); + + console.log('\n' + '='.repeat(78)); + console.log(`PROFILE: ${profile.name}`); + console.log( + ` score=${fmtPct(verdict.score)} -> ${recBadge(verdict.recommendation)} (block>=60%, challenge>=30%)`, + ); + console.log(' triggered detections:'); + if (triggered.length === 0) { + console.log(' (none)'); + } + for (const d of triggered) { + console.log( + ` [${d.category.padEnd(12)}] score=${fmtPct(d.score)} conf=${fmtPct(d.confidence)} ${d.reason}`, + ); + } + + summary.push({ name: profile.name, score: verdict.score, rec: verdict.recommendation, hits: triggered.length }); +} + +console.log('\n' + '='.repeat(78)); +console.log('SUMMARY'); +console.log(' ' + 'profile'.padEnd(32) + 'score verdict detections'); +for (const s of summary) { + console.log( + ' ' + s.name.padEnd(32) + fmtPct(s.score) + ' ' + recBadge(s.rec).padEnd(11) + ' ' + s.hits, + ); +} +console.log(''); diff --git a/packages/webdecoy/src/detection/detectors/advanced.ts b/packages/webdecoy/src/detection/detectors/advanced.ts index 
afd3a50..73276e0 100644 --- a/packages/webdecoy/src/detection/detectors/advanced.ts +++ b/packages/webdecoy/src/detection/detectors/advanced.ts @@ -14,6 +14,8 @@ import type { PermissionsInfo, DOMRectInfo, } from '../types'; +import { analyzeAudioContext } from './audio'; +import { analyzeLies } from './lies'; export function analyzeWebRTC(webrtcInfo?: WebRTCInfo): Detection[] { if (!webrtcInfo || !webrtcInfo.supported) return []; @@ -248,6 +250,8 @@ export function analyzeAdvancedSignals(signals: Signals, userAgent: string): Det if (env.fontsInfo) detections.push(...analyzeFonts(env.fontsInfo, userAgent)); if (env.permissionsInfo) detections.push(...analyzePermissions(env.permissionsInfo)); if (env.domRectFingerprint) detections.push(...analyzeDOMRect(env.domRectFingerprint)); + if (env.audioInfo) detections.push(...analyzeAudioContext(env.audioInfo)); + if (env.lieDetection) detections.push(...analyzeLies(env.lieDetection)); return detections; } diff --git a/packages/webdecoy/src/detection/detectors/audio.ts b/packages/webdecoy/src/detection/detectors/audio.ts new file mode 100644 index 0000000..c904afc --- /dev/null +++ b/packages/webdecoy/src/detection/detectors/audio.ts @@ -0,0 +1,44 @@ +/** AudioContext fingerprint analysis: headless/mocked audio-stack detection. */ + +import type { AudioContextInfo, Detection } from '../types'; + +export function analyzeAudioContext(audioInfo?: AudioContextInfo): Detection[] { + if (!audioInfo) return []; + const detections: Detection[] = []; + + // No AudioContext at all — common in headless/sandboxed browsers. + if (audioInfo.supported === false) { + detections.push({ + category: 'headless', + score: 0.6, + confidence: 0.6, + reason: 'AudioContext unavailable (headless or blocked audio stack)', + }); + return detections; + } + + // Real browsers expose `baseLatency`; stealth tools that stub AudioContext + // routinely omit it. 
+ if (audioInfo.baseLatency === undefined || audioInfo.baseLatency === null) { + detections.push({ + category: 'headless', + score: 0.55, + confidence: 0.6, + reason: 'AudioContext missing baseLatency (mocked/headless audio)', + }); + } + + // Conventional sample rates are 44100 or 48000. Anything else is unusual for + // a genuine consumer device. + const sr = audioInfo.sampleRate; + if (sr !== undefined && sr !== 44100 && sr !== 48000) { + detections.push({ + category: 'headless', + score: 0.4, + confidence: 0.5, + reason: `Unusual AudioContext sample rate (${sr})`, + }); + } + + return detections; +} diff --git a/packages/webdecoy/src/detection/detectors/headless.ts b/packages/webdecoy/src/detection/detectors/headless.ts index a8d45c9..cf89aa7 100644 --- a/packages/webdecoy/src/detection/detectors/headless.ts +++ b/packages/webdecoy/src/detection/detectors/headless.ts @@ -82,13 +82,20 @@ export function detectHeadless(signals: Signals, userAgent: string): Detection[] } } + // Software / virtualized WebGL renderer — matches the client's own + // `suspiciousRenderer` set (swiftshader, llvmpipe, softpipe, virtualbox, + // vmware). Honor the collected flag when present, else re-derive from the + // renderer string. const renderer = (env.webglInfo?.renderer ?? 
'').toLowerCase(); - if (renderer.includes('swiftshader') || renderer.includes('llvmpipe')) { + const softwareRenderer = + env.webglInfo?.suspiciousRenderer === true || + ['swiftshader', 'llvmpipe', 'softpipe', 'virtualbox', 'vmware'].some((r) => renderer.includes(r)); + if (softwareRenderer) { detections.push({ category: 'headless', score: 0.8, confidence: 0.8, - reason: 'Software WebGL renderer detected', + reason: 'Software/virtualized WebGL renderer detected', }); } diff --git a/packages/webdecoy/src/detection/detectors/index.ts b/packages/webdecoy/src/detection/detectors/index.ts index 3f4deda..25b90af 100644 --- a/packages/webdecoy/src/detection/detectors/index.ts +++ b/packages/webdecoy/src/detection/detectors/index.ts @@ -8,6 +8,8 @@ export { detectBehavioral } from './behavioral'; export { detectTouchAuthenticity, detectSensorEntropy, detectTouchKinematics } from './mobile'; export { detectFingerprint } from './fingerprint'; export { detectRateAbuse } from './rate'; +export { analyzeAudioContext } from './audio'; +export { analyzeLies } from './lies'; export { analyzeHeaders } from './headers'; export { checkBrowserConsistency } from './consistency'; export { diff --git a/packages/webdecoy/src/detection/detectors/lies.ts b/packages/webdecoy/src/detection/detectors/lies.ts new file mode 100644 index 0000000..110faa3 --- /dev/null +++ b/packages/webdecoy/src/detection/detectors/lies.ts @@ -0,0 +1,35 @@ +/** + * Native-function lie / tampering detection. + * + * Stealth automation (undetected-chromedriver, botasaurus, puppeteer-extra- + * stealth) hides itself by overriding native functions — `navigator.webdriver`, + * `Function.prototype.toString`, `navigator.permissions.query`, + * `WebGLRenderingContext.getParameter`, etc. A patched native no longer reports + * `[native code]` from `toString()`. 
A genuine browser never does this, so any + * confirmed patch is high-confidence, deliberate evasion — the single strongest + * "this is a stealth bot" signal, scored in its own `stealth` category. + */ + +import type { Detection, LieDetectionInfo } from '../types'; + +export function analyzeLies(lie?: LieDetectionInfo): Detection[] { + if (!lie || !lie.supported) return []; + + const patched = lie.patched ?? []; + const count = lie.patchedCount ?? patched.length; + if (count <= 0) return []; + + // One patch is already damning; each additional independent patch raises + // certainty toward a ceiling. + const score = Math.min(0.97, 0.7 + (count - 1) * 0.12); + + return [ + { + category: 'stealth', + score, + confidence: 0.9, + reason: `Native function(s) patched to hide automation: ${patched.join(', ') || `${count} function(s)`}`, + details: { patched, patchedCount: count }, + }, + ]; +} diff --git a/packages/webdecoy/src/detection/types.ts b/packages/webdecoy/src/detection/types.ts index 7810d65..fc54c2a 100644 --- a/packages/webdecoy/src/detection/types.ts +++ b/packages/webdecoy/src/detection/types.ts @@ -121,6 +121,8 @@ export interface EnvironmentalSignals { fontsInfo?: FontsInfo; permissionsInfo?: PermissionsInfo; domRectFingerprint?: DOMRectInfo; + audioInfo?: AudioContextInfo; + lieDetection?: LieDetectionInfo; [key: string]: unknown; } @@ -182,6 +184,27 @@ export interface DOMRectInfo { [key: string]: unknown; } +/** AudioContext fingerprint. A real audio stack exposes `baseLatency` and a + * conventional sample rate; headless/mocked stacks omit or fake these. */ +export interface AudioContextInfo { + supported?: boolean; + sampleRate?: number; + state?: string; + baseLatency?: number | null; + [key: string]: unknown; +} + +/** Native-function integrity check. `patched` lists native functions whose + * `toString()` no longer reports `[native code]` (or whose prototype was + * tampered with) — the signature of stealth automation hiding itself. 
A real + * browser never patches its own natives. */ +export interface LieDetectionInfo { + supported?: boolean; + patched?: string[]; + patchedCount?: number; + [key: string]: unknown; +} + /** Per-field keystroke statistics for one textarea. */ export interface TextareaKeyboardStats { keyCount?: number; diff --git a/packages/webdecoy/src/detection/weights.ts b/packages/webdecoy/src/detection/weights.ts index 324a5b0..4a8d772 100644 --- a/packages/webdecoy/src/detection/weights.ts +++ b/packages/webdecoy/src/detection/weights.ts @@ -5,7 +5,16 @@ * complete variant of the engine and the one the reference test suite exercises. */ -/** Default per-category weights used to compute the final score. */ +/** + * Default per-category weights used to compute the final score. + * + * `stealth` carries deliberate anti-detection tampering (native functions + * patched to hide automation). This is the least-ambiguous evidence of a bot — + * a genuine browser never patches its own natives — so it is weighted highest. + * Because weights are a simple weighted sum (not required to total 1.0), and + * `stealth` is 0 for any clean browser, this raises stealth-bot scores without + * affecting legitimate traffic. + */ export const DEFAULT_WEIGHTS: Record = { vision_ai: 0.15, headless: 0.15, @@ -17,6 +26,7 @@ export const DEFAULT_WEIGHTS: Record = { datacenter: 0.07, tor_vpn: 0.01, bot: 0.15, + stealth: 0.3, }; /** User-Agent substrings that indicate a known automation framework. */ From 62f8b61658e8752ace441444dd709b79f4ad3740 Mon Sep 17 00:00:00 2001 From: Chris Portscheller Date: Tue, 30 Jun 2026 20:16:52 -0500 Subject: [PATCH 2/4] fix(detection): drop playwright heuristics that false-positive on real Chrome Live-harness testing against real headful Chrome showed webdriver_configurable and chrome_runtime_missing fire on genuine browsers (Chrome's webdriver descriptor is configurable; chrome.runtime is absent on ordinary pages), scoring a real user at 36% (challenge). 
Removing them: real Chrome 36%->26% (allow), zero environmental detections; the new stealth detectors stayed clean. Adds the live harness (server.ts + page.html + botasaurus_test.py) used to find this. --- packages/client/src/collectors/environment.ts | 14 +-- packages/webdecoy/harness/botasaurus_test.py | 65 ++++++++++++ packages/webdecoy/harness/page.html | 52 ++++++++++ packages/webdecoy/harness/server.ts | 98 +++++++++++++++++++ .../src/detection/detectors/headless.ts | 5 +- 5 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 packages/webdecoy/harness/botasaurus_test.py create mode 100644 packages/webdecoy/harness/page.html create mode 100644 packages/webdecoy/harness/server.ts diff --git a/packages/client/src/collectors/environment.ts b/packages/client/src/collectors/environment.ts index 69c82c4..7b47632 100644 --- a/packages/client/src/collectors/environment.ts +++ b/packages/client/src/collectors/environment.ts @@ -127,20 +127,14 @@ export class EnvironmentalCollector { ); if (pwKeys.length > 0) signals.push('playwright_globals'); - // Check if navigator.webdriver was deleted or reconfigured + // navigator.webdriver *deleted* from the prototype is a genuine automation + // tell. NOTE: a merely *configurable* descriptor is normal in real Chrome, + // and chrome.runtime is absent on ordinary pages in real Chrome — both + // fired on a real browser in live testing, so neither is a signal here. 
const proto = Object.getPrototypeOf(navigator); const desc = Object.getOwnPropertyDescriptor(proto, 'webdriver'); if (!desc) { - // Property was deleted from prototype — browsers always have it signals.push('webdriver_deleted'); - } else if (desc.configurable !== false) { - signals.push('webdriver_configurable'); - } - - // Check for missing chrome.runtime in Chrome UA - const isChrome = /Chrome\//.test(navigator.userAgent) && !/Edg\//.test(navigator.userAgent); - if (isChrome && w.chrome && !w.chrome.runtime) { - signals.push('chrome_runtime_missing'); } return { detected: signals.length > 0, signals }; diff --git a/packages/webdecoy/harness/botasaurus_test.py b/packages/webdecoy/harness/botasaurus_test.py new file mode 100644 index 0000000..f9d8db0 --- /dev/null +++ b/packages/webdecoy/harness/botasaurus_test.py @@ -0,0 +1,65 @@ +""" +Live botasaurus test against the WebDecoy stealth harness. + +Prereqs: + pip install botasaurus + npx tsx harness/server.ts # in another terminal (serves :8787) + +Then: + python3 harness/botasaurus_test.py + +It drives botasaurus in BOTH modes against the harness and prints the real +DetectionEngine verdict the server returns: + - browser mode -> loads the page, the page collects real signals & scores + - request mode -> GET /probe (no JS; scored on UA + headers only) + +API note: botasaurus's surface shifts across versions; if `driver.select`/`.get` +differ in your version, adjust the two marked lines. The verdict also prints on +the SERVER stdout regardless, so you can read results there too. 
+""" + +HARNESS = "http://localhost:8787" + + +def run_browser_mode() -> None: + try: + from botasaurus.browser import browser, Driver + except Exception as e: # noqa: BLE001 + print(f"[browser] botasaurus import failed: {e}") + return + + @browser(headless=True, block_images=True) + def scrape(driver: "Driver", data): # noqa: ANN001 + driver.get(f"{HARNESS}/") # <- adjust if your botasaurus differs + driver.sleep(3) # let the page collect + POST /score + try: + text = driver.select("#result").text # <- adjust selector API if needed + except Exception: # noqa: BLE001 + text = driver.run_js("return document.getElementById('result').textContent") + print("\n===== BOTASAURUS BROWSER MODE — server verdict =====") + print(text) + return text + + scrape() + + +def run_request_mode() -> None: + try: + from botasaurus.request import request, Request + except Exception as e: # noqa: BLE001 + print(f"[request] botasaurus import failed: {e}") + return + + @request + def fetch(req: "Request", data): # noqa: ANN001 + r = req.get(f"{HARNESS}/probe") + print("\n===== BOTASAURUS REQUEST MODE (@request) — server verdict =====") + print(r.text) + return r.text + + fetch() + + +if __name__ == "__main__": + run_browser_mode() + run_request_mode() diff --git a/packages/webdecoy/harness/page.html b/packages/webdecoy/harness/page.html new file mode 100644 index 0000000..16573b4 --- /dev/null +++ b/packages/webdecoy/harness/page.html @@ -0,0 +1,52 @@ + + + + + + WebDecoy stealth-detection harness + + + +

WebDecoy stealth-detection harness

+

Loads the real @webdecoy/client bundle, collects signals in this browser, + and scores them with the real DetectionEngine. Open in a normal browser (baseline) + or point botasaurus at it (target).

+

collecting…

+
+ + + + + diff --git a/packages/webdecoy/harness/server.ts b/packages/webdecoy/harness/server.ts new file mode 100644 index 0000000..81b0033 --- /dev/null +++ b/packages/webdecoy/harness/server.ts @@ -0,0 +1,98 @@ +/** + * Live stealth-detection harness server. + * + * Serves a page that loads the REAL built @webdecoy/client bundle, collects + * signals in whatever browser (or bot) loads it, and scores them with the REAL + * DetectionEngine. Unlike fixtures.ts, the signals here come from an actual + * client — this is the ground-truth path for a real botasaurus run. + * + * npx tsx harness/server.ts # then open http://localhost:8787 + * + * GET / -> page.html (browser mode: runs the collector) + * POST /score -> score posted signals, return verdict JSON (+ log) + * GET /probe -> request-mode path: score UA+headers only (no JS) + */ + +import { createServer, type IncomingHttpHeaders } from 'http'; +import { readFileSync } from 'fs'; +import { join } from 'path'; +import { DetectionEngine } from '../src/detection/engine'; +import type { Signals } from '../src/detection/types'; + +const PORT = Number(process.env.PORT ?? 8787); +const HERE = __dirname; +const BUNDLE = join(HERE, '../../client/dist/webdecoy.global.js'); +const engine = new DetectionEngine({ requirePoW: false }); + +function lower(h: IncomingHttpHeaders): Record { + const o: Record = {}; + for (const k of Object.keys(h)) o[k.toLowerCase()] = Array.isArray(h[k]) ? (h[k] as string[]).join(',') : String(h[k] ?? ''); + return o; +} + +function score(signals: Signals, userAgent: string, headers: Record, label: string) { + const v = engine.score(signals, { ip: '203.0.113.10', siteKey: 'harness', userAgent, headers }); + const triggered = v.detections + .filter((d) => d.score > 0) + .sort((a, b) => b.score * b.confidence - a.score * a.confidence); + const env = (signals.environmental ?? 
{}) as Record; + const stealthHits = triggered.filter((d) => ['stealth', 'headless', 'cdp', 'automation', 'fingerprint'].includes(d.category)); + + console.log(`\n${'-'.repeat(70)}`); + console.log(`[${label}] score=${(v.score * 100).toFixed(0)}% -> ${v.recommendation.toUpperCase()}`); + console.log(` UA: ${userAgent.slice(0, 90)}`); + console.log(` environment/stealth detections:`); + if (stealthHits.length === 0) console.log(' (none — looks like a genuine browser)'); + for (const d of stealthHits) console.log(` [${d.category.padEnd(11)}] ${(d.score * 100).toFixed(0)}% ${d.reason}`); + if (env.lieDetection) console.log(` lieDetection raw:`, JSON.stringify(env.lieDetection)); + return { score: v.score, recommendation: v.recommendation, detections: triggered }; +} + +const server = createServer((req, res) => { + const url = req.url ?? '/'; + const method = req.method ?? 'GET'; + + if (method === 'GET' && (url === '/' || url.startsWith('/?'))) { + res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); + res.end(readFileSync(join(HERE, 'page.html'))); + return; + } + if (method === 'GET' && url === '/webdecoy.global.js') { + res.writeHead(200, { 'content-type': 'application/javascript' }); + res.end(readFileSync(BUNDLE)); + return; + } + if (method === 'GET' && url.startsWith('/probe')) { + const headers = lower(req.headers); + const result = score({ meta: {} }, headers['user-agent'] ?? '', headers, 'REQUEST MODE (/probe, no JS)'); + res.writeHead(200, { 'content-type': 'application/json' }); + res.end(JSON.stringify(result, null, 2)); + return; + } + if (method === 'POST' && url === '/score') { + let body = ''; + req.on('data', (c) => (body += c)); + req.on('end', () => { + try { + const payload = JSON.parse(body || '{}'); + const headers = lower(req.headers); + const ua = payload.userAgent || headers['user-agent'] || ''; + const result = score(payload.signals ?? {}, ua, headers, payload.label ?? 
'BROWSER MODE (/score)'); + res.writeHead(200, { 'content-type': 'application/json' }); + res.end(JSON.stringify(result, null, 2)); + } catch (e) { + res.writeHead(400); + res.end(String(e)); + } + }); + return; + } + res.writeHead(404); + res.end('not found'); +}); + +server.listen(PORT, () => { + console.log(`\nWebDecoy stealth harness listening on http://localhost:${PORT}`); + console.log(` browser mode: open http://localhost:${PORT}/ in a browser (or botasaurus)`); + console.log(` request mode: GET http://localhost:${PORT}/probe`); +}); diff --git a/packages/webdecoy/src/detection/detectors/headless.ts b/packages/webdecoy/src/detection/detectors/headless.ts index cf89aa7..d063651 100644 --- a/packages/webdecoy/src/detection/detectors/headless.ts +++ b/packages/webdecoy/src/detection/detectors/headless.ts @@ -3,11 +3,12 @@ import type { Detection, Signals } from '../types'; import { AUTOMATION_UA_PATTERNS } from '../weights'; +// `webdriver_configurable` and `chrome_runtime_missing` were removed: both fire +// on genuine Chrome (configurable webdriver descriptor; no chrome.runtime on +// ordinary pages) and caused false positives confirmed via the live harness. 
const PLAYWRIGHT_SCORE_MAP: Record<string, number> = { playwright_globals: 0.95, webdriver_deleted: 0.8, - webdriver_configurable: 0.7, - chrome_runtime_missing: 0.6, }; export function detectHeadless(signals: Signals, userAgent: string): Detection[] { From 082052f65163a8c5aac568d52ca89fa0d33192dd Mon Sep 17 00:00:00 2001 From: Chris Portscheller Date: Tue, 30 Jun 2026 20:18:12 -0500 Subject: [PATCH 3/4] =?UTF-8?q?chore(release):=200.4.0=20=E2=80=94=20steal?= =?UTF-8?q?th-browser=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Native-function lie detection (new 'stealth' category) + AudioContext scoring - Broadened software/virtualized WebGL detection to the client's suspiciousRenderer set - Fix: drop playwright heuristics (webdriver_configurable, chrome_runtime_missing) that false-positived on real Chrome - Live harness (server + page + botasaurus script) for real-browser validation Live-validated: real Chrome -> allow (env layer clean), request-mode -> challenge. 64 tests pass. 
--- packages/client/package.json | 2 +- packages/express/package.json | 4 ++-- packages/fastify/package.json | 4 ++-- packages/nextjs/package.json | 4 ++-- packages/webdecoy/package.json | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/client/package.json b/packages/client/package.json index 6af850f..b37a66d 100644 --- a/packages/client/package.json +++ b/packages/client/package.json @@ -1,6 +1,6 @@ { "name": "@webdecoy/client", - "version": "0.3.0", + "version": "0.4.0", "description": "Web Decoy browser widget - signal collection, proof-of-work, and captcha UI", "main": "./dist/index.js", "module": "./dist/index.mjs", diff --git a/packages/express/package.json b/packages/express/package.json index e591a97..467f770 100644 --- a/packages/express/package.json +++ b/packages/express/package.json @@ -1,6 +1,6 @@ { "name": "@webdecoy/express", - "version": "0.3.0", + "version": "0.4.0", "description": "Web Decoy middleware for Express.js", "main": "./dist/index.js", "types": "./dist/index.d.ts", @@ -40,7 +40,7 @@ "url": "https://github.com/WebDecoy/node-sdk/issues" }, "dependencies": { - "@webdecoy/node": "^0.3.0" + "@webdecoy/node": "^0.4.0" }, "peerDependencies": { "express": "^4.18.0 || ^5.0.0" diff --git a/packages/fastify/package.json b/packages/fastify/package.json index 78e3779..53481bd 100644 --- a/packages/fastify/package.json +++ b/packages/fastify/package.json @@ -1,6 +1,6 @@ { "name": "@webdecoy/fastify", - "version": "0.3.0", + "version": "0.4.0", "description": "Web Decoy plugin for Fastify", "main": "./dist/index.js", "types": "./dist/index.d.ts", @@ -40,7 +40,7 @@ "url": "https://github.com/WebDecoy/node-sdk/issues" }, "dependencies": { - "@webdecoy/node": "^0.3.0", + "@webdecoy/node": "^0.4.0", "fastify-plugin": "^4.5.1" }, "peerDependencies": { diff --git a/packages/nextjs/package.json b/packages/nextjs/package.json index 7defc95..c122a8b 100644 --- a/packages/nextjs/package.json +++ b/packages/nextjs/package.json @@ -1,6 
+1,6 @@ { "name": "@webdecoy/nextjs", - "version": "0.3.0", + "version": "0.4.0", "description": "Web Decoy middleware for Next.js", "main": "./dist/index.js", "types": "./dist/index.d.ts", @@ -41,7 +41,7 @@ "url": "https://github.com/WebDecoy/node-sdk/issues" }, "dependencies": { - "@webdecoy/node": "^0.3.0" + "@webdecoy/node": "^0.4.0" }, "peerDependencies": { "next": ">=13.0.0" diff --git a/packages/webdecoy/package.json b/packages/webdecoy/package.json index d76b29f..f1c93fd 100644 --- a/packages/webdecoy/package.json +++ b/packages/webdecoy/package.json @@ -1,6 +1,6 @@ { "name": "@webdecoy/node", - "version": "0.3.0", + "version": "0.4.0", "description": "Web Decoy SDK for Node.js - Bot detection with TLS fingerprinting", "main": "./dist/index.js", "types": "./dist/index.d.ts", From db6360b9dd93f0c1eda9f0eeb93915f6ee162cd1 Mon Sep 17 00:00:00 2001 From: Chris Portscheller Date: Tue, 30 Jun 2026 20:30:03 -0500 Subject: [PATCH 4/4] feat(rules): F4 tripwire rule + honeytoken (deterministic zero-FP deception) Adds tripwire({paths,prefixes,patterns,includeDefaults}) and honeytoken() (a hidden decoy link + its tripwire path). Any request for a honeypot path is automated by construction, so it DENYs through the existing rule pipeline (403 + violation report) with no middleware changes. Detects intent, not fingerprint, so stealth tools like botasaurus cannot evade it. Live-validated against real botasaurus: the crawler that walked past F1 fingerprinting (browser mode -> allow) is BLOCKED by the tripwire (GET /__wd/... -> 403). Real Chrome loading the same page never requests the hidden link -> zero false positive. 
- rules/tripwire-rule.ts, rules/honeytoken.ts + exports; rules/tripwire.test.ts (13 tests) - harness: tripwire wired into server, honeytoken injected into page, botasaurus_crawl_test.py --- .../webdecoy/harness/botasaurus_crawl_test.py | 48 ++++++++++ packages/webdecoy/harness/page.html | 6 ++ packages/webdecoy/harness/server.ts | 26 +++++- packages/webdecoy/src/index.ts | 15 ++- packages/webdecoy/src/rules/honeytoken.ts | 50 ++++++++++ packages/webdecoy/src/rules/index.ts | 31 ++++++- packages/webdecoy/src/rules/tripwire-rule.ts | 85 +++++++++++++++++ packages/webdecoy/src/rules/tripwire.test.ts | 92 +++++++++++++++++++ packages/webdecoy/src/rules/types.ts | 18 ++++ 9 files changed, 368 insertions(+), 3 deletions(-) create mode 100644 packages/webdecoy/harness/botasaurus_crawl_test.py create mode 100644 packages/webdecoy/src/rules/honeytoken.ts create mode 100644 packages/webdecoy/src/rules/tripwire-rule.ts create mode 100644 packages/webdecoy/src/rules/tripwire.test.ts diff --git a/packages/webdecoy/harness/botasaurus_crawl_test.py b/packages/webdecoy/harness/botasaurus_crawl_test.py new file mode 100644 index 0000000..aff8204 --- /dev/null +++ b/packages/webdecoy/harness/botasaurus_crawl_test.py @@ -0,0 +1,48 @@ +""" +F4 tripwire validation — the test botasaurus (or any fingerprint-stealth tool) +CANNOT pass. + +A scraper that *crawls* — fetches a page, extracts links, follows them — will +request the hidden honeytoken decoy link and trip the tripwire. Unlike F1 +fingerprinting (which botasaurus defeats), this catches *intent* (going where a +human can't), which stealth cannot spoof away. + + /bin/python harness/botasaurus_crawl_test.py + +Run request mode (the exact mode that evaded F1). A real human/browser loads the +page but never follows the invisible link, so it is never flagged. 
+""" + +import re + +HARNESS = "http://localhost:8787" + + +def crawl() -> None: + from botasaurus.request import request, Request + + @request + def run(req: "Request", data): # noqa: ANN001 + page = req.get(f"{HARNESS}/").text + hrefs = re.findall(r'href="([^"]+)"', page) + print(f"scraper extracted {len(hrefs)} link(s): {hrefs}") + + tripped = [] + for h in hrefs: + url = h if h.startswith("http") else HARNESS + h + r = req.get(url) + flag = "" + if r.status_code == 403: + flag = " <-- TRIPWIRE: BLOCKED" + tripped.append(h) + print(f" GET {h} -> {r.status_code}{flag}") + + print(f"\nRESULT: {'CAUGHT (tripwire blocked the scraper)' if tripped else 'evaded'}" + f" — tripwires hit: {tripped}") + return tripped + + run() + + +if __name__ == "__main__": + crawl() diff --git a/packages/webdecoy/harness/page.html b/packages/webdecoy/harness/page.html index 16573b4..c9d41d3 100644 --- a/packages/webdecoy/harness/page.html +++ b/packages/webdecoy/harness/page.html @@ -20,6 +20,12 @@

WebDecoy stealth-detection harness

collecting…

+ +

Home

+ + +