@@ -11,7 +11,9 @@ import {
1111 useState ,
1212} from 'react'
1313import { createLogger } from '@sim/logger'
14+ import { getErrorMessage } from '@sim/utils/errors'
1415import { generateId } from '@sim/utils/id'
16+ import { backoffWithJitter } from '@sim/utils/retry'
1517import { useParams } from 'next/navigation'
1618import type { Socket } from 'socket.io-client'
1719import { getSocketUrl } from '@/lib/core/utils/urls'
@@ -162,6 +164,16 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
162164 const explicitWorkflowIdRef = useRef < string | null > ( explicitWorkflowId )
163165 const joinControllerRef = useRef ( new SocketJoinController ( ) )
164166 const joinRetryTimeoutRef = useRef < ReturnType < typeof setTimeout > | null > ( null )
167+ /**
168+ * Why the most recent socket-token mint failed. A `transient` failure (rate
169+ * limit, 5xx, network) passes a null token like a true auth failure does, but
170+ * must keep Socket.IO reconnecting rather than latch `authFailed` — the server
171+ * rejects both with the same "Authentication required" message, so the client
172+ * can only tell them apart by remembering why the token was null.
173+ */
174+ const tokenFailureModeRef = useRef < 'none' | 'auth' | 'transient' > ( 'none' )
175+ const authRetryTimeoutRef = useRef < ReturnType < typeof setTimeout > | null > ( null )
176+ const authRetryAttemptRef = useRef ( 0 )
165177
166178 const params = useParams ( )
167179 const urlWorkflowId = params ?. workflowId as string | undefined
@@ -361,21 +373,29 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
361373 auth : async ( cb ) => {
362374 try {
363375 const freshToken = await generateSocketToken ( )
376+ tokenFailureModeRef . current = 'none'
364377 cb ( { token : freshToken } )
365378 } catch ( error ) {
366- logger . error ( 'Failed to generate fresh token for connection:' , error )
367379 if ( error instanceof Error && error . message === 'Authentication required' ) {
368- // True auth failure - pass null token, server will reject with "Authentication required"
369- cb ( { token : null } )
380+ tokenFailureModeRef . current = 'auth'
381+ logger . error ( 'Failed to generate fresh token for connection:' , error )
382+ } else {
383+ tokenFailureModeRef . current = 'transient'
384+ logger . warn ( 'Transient socket token failure, will retry connection' , {
385+ error : getErrorMessage ( error ) ,
386+ } )
370387 }
371- // For server errors, don't call cb - connection will timeout and Socket.IO will retry
388+ cb ( { token : null } )
372389 }
373390 } ,
374391 } )
375392
376393 socketInstance . on ( 'connect' , ( ) => {
377394 setIsConnected ( true )
378395 setIsConnecting ( false )
396+ setIsReconnecting ( false )
397+ tokenFailureModeRef . current = 'none'
398+ authRetryAttemptRef . current = 0
379399 setCurrentSocketId ( socketInstance . id ?? null )
380400 logger . info ( 'Socket connected successfully' , {
381401 socketId : socketInstance . id ,
@@ -406,11 +426,11 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
406426 setIsConnecting ( false )
407427 logger . error ( 'Socket connection error:' , { message : error . message } )
408428
409- // Check if this is an authentication failure
410429 const isAuthError =
411- error . message ?. includes ( 'Token validation failed' ) ||
412- error . message ?. includes ( 'Authentication failed' ) ||
413- error . message ?. includes ( 'Authentication required' )
430+ tokenFailureModeRef . current !== 'transient' &&
431+ ( error . message ?. includes ( 'Token validation failed' ) ||
432+ error . message ?. includes ( 'Authentication failed' ) ||
433+ error . message ?. includes ( 'Authentication required' ) )
414434
415435 if ( isAuthError ) {
416436 logger . warn (
@@ -737,6 +757,35 @@ export function SocketProvider({ children, user }: SocketProviderProps) {
737757 }
738758 } , [ user ?. id , authFailed ] )
739759
760+ /**
761+ * Auto-recover from an auth failure. The token mint can 401 transiently while a
762+ * session is mid-rotation (e.g. right after switching active organization on
763+ * invite accept); without this the socket stays dead until a manual page reload,
764+ * since no other caller invokes {@link retryConnection}. Retries with backoff so a
765+ * genuine logged-out session re-mints lazily rather than hammering the endpoint.
766+ */
767+ useEffect ( ( ) => {
768+ if ( ! authFailed ) {
769+ return
770+ }
771+
772+ const attempt = authRetryAttemptRef . current
773+ const delay = backoffWithJitter ( attempt + 1 , null , { baseMs : 1000 , maxMs : 30000 } )
774+ authRetryTimeoutRef . current = setTimeout ( ( ) => {
775+ authRetryTimeoutRef . current = null
776+ authRetryAttemptRef . current = attempt + 1
777+ logger . info ( 'Auto-retrying socket connection after auth failure' , { attempt } )
778+ setAuthFailed ( false )
779+ } , delay )
780+
781+ return ( ) => {
782+ if ( authRetryTimeoutRef . current !== null ) {
783+ clearTimeout ( authRetryTimeoutRef . current )
784+ authRetryTimeoutRef . current = null
785+ }
786+ }
787+ } , [ authFailed ] )
788+
740789 const hydrationPhase = useWorkflowRegistryStore ( ( s ) => s . hydration . phase )
741790
742791 useEffect ( ( ) => {
0 commit comments