From 8d609a41f35f1490f1bc4882fa5fdda0ffa4ca4e Mon Sep 17 00:00:00 2001 From: liobrasil Date: Tue, 10 Mar 2026 22:19:08 -0400 Subject: [PATCH 01/13] test: tighten supervisor recovery regressions --- cadence/tests/scheduled_supervisor_test.cdc | 105 ++++++- ...duler_mixed_population_regression_test.cdc | 292 ++++++++++++++++++ 2 files changed, 388 insertions(+), 9 deletions(-) create mode 100644 cadence/tests/scheduler_mixed_population_regression_test.cdc diff --git a/cadence/tests/scheduled_supervisor_test.cdc b/cadence/tests/scheduled_supervisor_test.cdc index 32438057..bb96543a 100644 --- a/cadence/tests/scheduled_supervisor_test.cdc +++ b/cadence/tests/scheduled_supervisor_test.cdc @@ -599,6 +599,25 @@ fun testStuckYieldVaultDetectionLogic() { log("PASS: Stuck yield vault detection correctly identifies healthy yield vaults") } +/// Returns per-yield-vault recovery event counts from YieldVaultRecovered events. +/// +/// This is used by stress tests to distinguish "all vaults recovered at least once" +/// from "lots of recovery events happened", which can otherwise hide duplicate +/// recovery churn for the same vault IDs. +/// +/// Example: 240 recovery events for 200 vaults can look healthy if the test only +/// checks `events.length >= 200`, even though 40 of those events may be repeats. +access(all) +fun getRecoveredYieldVaultCounts(): {UInt64: Int} { + let counts: {UInt64: Int} = {} + let recoveredEvents = Test.eventsOfType(Type()) + for recoveredAny in recoveredEvents { + let recovered = recoveredAny as! FlowYieldVaultsSchedulerV1.YieldVaultRecovered + counts[recovered.yieldVaultID] = (counts[recovered.yieldVaultID] ?? 0) + 1 + } + return counts +} + /// COMPREHENSIVE TEST: Insufficient Funds -> Failure -> Recovery /// /// This test validates the COMPLETE failure and recovery cycle: @@ -939,9 +958,21 @@ fun testInsufficientFundsAndRecovery() { /// /// Flow: create 200 yield vaults, run 2 scheduling rounds, drain FLOW so executions fail, /// wait for vaults to be marked stuck, refund FLOW, schedule the supervisor, then advance -/// time for ceil(200/MAX_BATCH_SIZE)+10 supervisor ticks. Asserts all 200 vaults are -/// recovered (YieldVaultRecovered events), none still stuck, and all have active schedules. -/// The +10 extra ticks are a buffer so every vault is processed despite scheduler timing. +/// time for enough supervisor ticks to recover all unique vault IDs, plus a short +/// stabilization window. This asserts: +/// - every one of the 200 vault IDs is recovered at least once, +/// - no recovery failures occur, +/// - no vault emits more than one recovery event, +/// - once all vaults are recovered and healthy, extra supervisor ticks do not emit +/// additional recovery events, +/// - none remain stuck, and all have active schedules. +/// +/// Why this was tightened: +/// the earlier version only checked `YieldVaultRecovered.length >= n` after +/// `ceil(n / MAX_BATCH_SIZE) + 10` supervisor ticks. That allowed the test to pass +/// even when some vaults recovered more than once while others had not yet been +/// uniquely validated. This version keeps the same tick budget, but uses it as a +/// timeout ceiling instead of treating it as proof that the recovery set is clean. access(all) fun testSupervisorHandlesManyStuckVaults() { let n = 200 @@ -1024,19 +1055,75 @@ fun testSupervisorHandlesManyStuckVaults() { ) Test.expect(schedSupRes, Test.beSucceeded()) - // 7. Advance time for supervisor ticks (ceil(n/MAX_BATCH_SIZE)+10); each tick processes a batch + // 7. Advance time until every target vault has emitted at least one recovery event. + // + // We still compute ceil(n / MAX_BATCH_SIZE) + 10, but it now acts as a maximum + // allowed budget for supervisor ticks. The loop stops early once all 200 vault IDs + // have been seen at least once. This makes the assertion sensitive to duplicate + // recoveries: repeated events for the same vault no longer help the test finish. + // + // After all unique IDs are observed, run a short stabilization window to check + // that a healthy supervisor does not continue to emit recovery events. let supervisorRunsNeeded = (UInt(n) + UInt(maxBatchSize) - 1) / UInt(maxBatchSize) + let maxSupervisorTicks = supervisorRunsNeeded + 10 var run = 0 as UInt - while run < supervisorRunsNeeded + 10 { + var recoveredCounts = getRecoveredYieldVaultCounts() + while run < maxSupervisorTicks && recoveredCounts.length < n { Test.moveTime(by: 60.0 * 10.0 + 10.0) Test.commitBlock() run = run + 1 + recoveredCounts = getRecoveredYieldVaultCounts() + } + log("testSupervisorHandlesManyStuckVaults: ran \(run.toString()) supervisor ticks to reach \(recoveredCounts.length.toString()) unique recovered vaults") + + let recoveryFailures = Test.eventsOfType(Type()) + Test.assertEqual(0, recoveryFailures.length) + + // Split the validation into: + // - missing recoveries: some vaults never emitted YieldVaultRecovered at all + // - duplicate recoveries: some vaults emitted YieldVaultRecovered more than once + // + // Both matter. Missing recoveries means the supervisor did not cover the whole set. + // Duplicate recoveries means event volume was inflated by churn, which the old + // `recoveredEvents.length >= n` assertion could not distinguish from success. + var missingRecoveries = 0 + var duplicatedRecoveries = 0 + var duplicatedVaults = 0 + for yieldVaultID in yieldVaultIDs { + let recoveryCount = recoveredCounts[yieldVaultID] ?? 0 + if recoveryCount == 0 { + missingRecoveries = missingRecoveries + 1 + } else if recoveryCount > 1 { + duplicatedRecoveries = duplicatedRecoveries + (recoveryCount - 1) + duplicatedVaults = duplicatedVaults + 1 + } } - log("testSupervisorHandlesManyStuckVaults: ran \((supervisorRunsNeeded + 10).toString()) supervisor ticks") + Test.assert( + missingRecoveries == 0, + message: "expected every vault to recover at least once, but \(missingRecoveries.toString()) vaults emitted no YieldVaultRecovered event" + ) + Test.assert( + duplicatedRecoveries == 0, + message: "expected exactly one recovery per vault, but saw \(duplicatedRecoveries.toString()) duplicate recoveries across \(duplicatedVaults.toString()) vaults" + ) - let recoveredEvents = Test.eventsOfType(Type()) - Test.assert(recoveredEvents.length >= n, message: "expected at least \(n.toString()) recovered, got \(recoveredEvents.length.toString())") - log("testSupervisorHandlesManyStuckVaults: recovered \(recoveredEvents.length.toString()) vaults") + // This second guard catches late churn that may start only after the full unique + // set has already been recovered. The duplicate-per-vault check above inspects the + // state up to this point; the stabilization window verifies the system stays quiet + // once recovery should be complete. + let recoveredEventsBeforeStabilization = Test.eventsOfType(Type()).length + var stabilizationTick = 0 + while stabilizationTick < 2 { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + stabilizationTick = stabilizationTick + 1 + } + let recoveredEventsAfterStabilization = Test.eventsOfType(Type()).length + Test.assert( + recoveredEventsAfterStabilization == recoveredEventsBeforeStabilization, + message: "expected no additional recovery churn after all vaults were recovered; before stabilization: \(recoveredEventsBeforeStabilization.toString()), after: \(recoveredEventsAfterStabilization.toString())" + ) + log("testSupervisorHandlesManyStuckVaults: stable recovery set of \(recoveredCounts.length.toString()) unique vaults with \(recoveredEventsAfterStabilization.toString()) total recovery events") // 8. Health check: none stuck, all have active schedules var stillStuck = 0 diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc new file mode 100644 index 00000000..036758ae --- /dev/null +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -0,0 +1,292 @@ +/// Mixed-population regression tests for the scheduler stuck-scan. +/// +/// WHY THIS FILE EXISTS +/// -------------------- +/// The Supervisor does not scan the full registry for stuck vaults. Each run only asks the +/// registry for up to MAX_BATCH_SIZE candidates from the tail of the scan order, then checks +/// those candidates one by one. +/// +/// That optimization breaks down when the scan order contains vaults that can never become +/// stuck. Today, non-recurring vaults are still registered into the same ordering even though +/// isStuckYieldVault() immediately returns false for them. +/// +/// This creates a liveness risk: +/// - more than MAX_BATCH_SIZE non-recurring vaults can occupy the tail, +/// - the Supervisor can keep rescanning those same ineligible IDs, +/// - and a real stuck recurring vault further up the list is never detected or recovered. +/// +/// This file exists to lock that failure mode down as a regression. The main test below +/// intentionally builds that mixed population and asserts the Supervisor should still find +/// the real stuck vault. On the current implementation, that assertion fails, which is the +/// exact bug this test is meant to expose until the scheduler logic is fixed. +import Test +import BlockchainHelpers + +import "test_helpers.cdc" + +import "FlowToken" +import "MOET" +import "YieldToken" +import "MockStrategies" +import "FlowYieldVaultsAutoBalancers" +import "FlowYieldVaultsSchedulerV1" +import "FlowYieldVaultsSchedulerRegistry" + +access(all) let protocolAccount = Test.getAccount(0x0000000000000008) +access(all) let flowYieldVaultsAccount = Test.getAccount(0x0000000000000009) +access(all) let yieldTokenAccount = Test.getAccount(0x0000000000000010) + +access(all) var strategyIdentifier = Type<@MockStrategies.TracerStrategy>().identifier +access(all) var flowTokenIdentifier = Type<@FlowToken.Vault>().identifier +access(all) var yieldTokenIdentifier = Type<@YieldToken.Vault>().identifier +access(all) var moetTokenIdentifier = Type<@MOET.Vault>().identifier +access(all) var snapshot: UInt64 = 0 + +access(all) +fun setup() { + log("Setting up mixed-population scheduler regression test...") + + deployContracts() + mintFlow(to: flowYieldVaultsAccount, amount: 1000.0) + + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.0) + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 1.0) + + let reserveAmount = 100_000_00.0 + setupMoetVault(protocolAccount, beFailed: false) + setupYieldVault(protocolAccount, beFailed: false) + mintFlow(to: protocolAccount, amount: reserveAmount) + mintMoet(signer: protocolAccount, to: protocolAccount.address, amount: reserveAmount, beFailed: false) + mintYield(signer: yieldTokenAccount, to: protocolAccount.address, amount: reserveAmount, beFailed: false) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: MOET.VaultStoragePath) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: YieldToken.VaultStoragePath) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: /storage/flowTokenVault) + + createAndStorePool(signer: protocolAccount, defaultTokenIdentifier: moetTokenIdentifier, beFailed: false) + addSupportedTokenFixedRateInterestCurve( + signer: protocolAccount, + tokenTypeIdentifier: flowTokenIdentifier, + collateralFactor: 0.8, + borrowFactor: 1.0, + yearlyRate: UFix128(0.1), + depositRate: 1_000_000.0, + depositCapacityCap: 1_000_000.0 + ) + + let openRes = _executeTransaction( + "../../lib/FlowALP/cadence/transactions/flow-alp/position/create_position.cdc", + [reserveAmount / 2.0, /storage/flowTokenVault, true], + protocolAccount + ) + Test.expect(openRes, Test.beSucceeded()) + + addStrategyComposer( + signer: flowYieldVaultsAccount, + strategyIdentifier: strategyIdentifier, + composerIdentifier: Type<@MockStrategies.TracerStrategyComposer>().identifier, + issuerStoragePath: MockStrategies.IssuerStoragePath, + beFailed: false + ) + + snapshot = getCurrentBlockHeight() + log("Setup complete") +} + +access(all) +fun cancelSchedulesAndRemoveRecurringConfig(yieldVaultID: UInt64) { + let storagePath = FlowYieldVaultsAutoBalancers.deriveAutoBalancerPath( + id: yieldVaultID, + storage: true + ) as! StoragePath + let res = _executeTransaction( + "../transactions/flow-yield-vaults/admin/cancel_all_scheduled_transactions_and_remove_recurring_config.cdc", + [storagePath], + flowYieldVaultsAccount + ) + Test.expect(res, Test.beSucceeded()) +} + +access(all) +fun isStuckYieldVault(_ yieldVaultID: UInt64): Bool { + let res = _executeScript("../scripts/flow-yield-vaults/is_stuck_yield_vault.cdc", [yieldVaultID]) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! Bool +} + +access(all) +fun hasActiveSchedule(_ yieldVaultID: UInt64): Bool { + let res = _executeScript("../scripts/flow-yield-vaults/has_active_schedule.cdc", [yieldVaultID]) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! Bool +} + +access(all) +fun getFlowYieldVaultsFlowBalance(): UFix64 { + let res = _executeScript( + "../scripts/flow-yield-vaults/get_flow_balance.cdc", + [flowYieldVaultsAccount.address] + ) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! UFix64 +} + +/// Regression test: more than MAX_BATCH_SIZE non-recurring registry entries must not +/// permanently starve stuck recurring vault detection. +/// +/// Setup: +/// 1. Create MAX_BATCH_SIZE + 1 mock vaults and strip their recurring config/schedules. +/// 2. Create one real recurring vault after them so it sits behind those tail entries. +/// 3. Drain FLOW so that recurring vault executes once but fails to reschedule, becoming stuck. +/// 4. Fund and run the Supervisor for several ticks. +/// +/// Expected behavior: +/// - The recurring vault is eventually detected and recovered. +/// - The non-recurring tail entries do not block recovery forever. +access(all) +fun testSupervisorScansPastNonRecurringTailEntries() { + if snapshot != getCurrentBlockHeight() { + Test.reset(to: snapshot) + } + log("\n[TEST] Supervisor scans past non-recurring tail entries...") + + let blockerCount = FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE + 1 + let user = Test.createAccount() + mintFlow(to: user, amount: 2000.0) + grantBeta(flowYieldVaultsAccount, user) + + // Step 1: create more than one full scan batch of normal recurring mock vaults. + // We will convert these into permanently ineligible "blockers" without removing them + // from the registry, so they keep occupying the tail of the scan order. + var idx = 0 + while idx < blockerCount { + let createRes = _executeTransaction( + "../transactions/flow-yield-vaults/create_yield_vault.cdc", + [strategyIdentifier, flowTokenIdentifier, 25.0], + user + ) + Test.expect(createRes, Test.beSucceeded()) + idx = idx + 1 + } + + let blockerIDs = getYieldVaultIDs(address: user.address)! + Test.assertEqual(blockerCount, blockerIDs.length) + + // Step 2: strip schedules and recurring config from those vaults. They stay registered, + // but they can no longer self-schedule and they can never satisfy isStuckYieldVault(). + for blockerID in blockerIDs { + cancelSchedulesAndRemoveRecurringConfig(yieldVaultID: blockerID) + Test.assertEqual(false, hasActiveSchedule(blockerID)) + Test.assertEqual(false, isStuckYieldVault(blockerID)) + } + log("Prepared \(blockerCount.toString()) non-recurring registry entries at the tail") + + // Step 3: create one real recurring vault after the blockers. This vault is the one we + // will intentionally push into a stuck state and expect the Supervisor to recover. + let targetRes = _executeTransaction( + "../transactions/flow-yield-vaults/create_yield_vault.cdc", + [strategyIdentifier, flowTokenIdentifier, 25.0], + user + ) + Test.expect(targetRes, Test.beSucceeded()) + + let allYieldVaultIDs = getYieldVaultIDs(address: user.address)! + var recurringYieldVaultID: UInt64 = 0 + var foundRecurringTarget = false + for yieldVaultID in allYieldVaultIDs { + if !blockerIDs.contains(yieldVaultID) { + recurringYieldVaultID = yieldVaultID + foundRecurringTarget = true + } + } + Test.assert(foundRecurringTarget, message: "Failed to identify the recurring target yield vault") + Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) + + // Sanity check the test setup: the first scan batch should contain only blocker IDs, + // proving the real recurring target starts behind the tail window the Supervisor reads. + let initialCandidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( + limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE) + ) + Test.assert( + !initialCandidates.contains(recurringYieldVaultID), + message: "Setup failure: target recurring vault should sit behind the first stuck-scan batch" + ) + for candidate in initialCandidates { + Test.assert( + blockerIDs.contains(candidate), + message: "Setup failure: initial tail scan should contain only non-recurring blockers" + ) + } + + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 2.0) + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.5) + + // Step 4: drain the shared FLOW fee vault to the minimum possible residual balance. + // The target vault already has its first schedule funded, so it should execute once and + // then eventually stop self-scheduling, making it a genuine stuck recurring vault. + let balanceBeforeDrain = getFlowYieldVaultsFlowBalance() + let residualBalance = 0.00000001 + if balanceBeforeDrain > residualBalance { + let drainRes = _executeTransaction( + "../transactions/flow-yield-vaults/drain_flow.cdc", + [balanceBeforeDrain - residualBalance], + flowYieldVaultsAccount + ) + Test.expect(drainRes, Test.beSucceeded()) + } + + // Give the already-funded first execution time to run, then keep advancing until the + // vault becomes overdue with no active schedule. A nearly-empty fee vault can still be + // enough for one extra scheduling attempt, so this waits several intervals. + idx = 0 + while idx < 6 && !isStuckYieldVault(recurringYieldVaultID) { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + idx = idx + 1 + } + + Test.assertEqual(true, isStuckYieldVault(recurringYieldVaultID)) + Test.assertEqual(false, hasActiveSchedule(recurringYieldVaultID)) + + // Step 5: fund the account again and start the Supervisor. A correct implementation + // should eventually scan past the blockers, detect the real stuck recurring vault, and + // recover it. + mintFlow(to: flowYieldVaultsAccount, amount: 200.0) + Test.commitBlock() + + let recoveredEventsBefore = Test.eventsOfType(Type()).length + let detectedEventsBefore = Test.eventsOfType(Type()).length + + let scheduleSupervisorRes = _executeTransaction( + "../transactions/flow-yield-vaults/admin/schedule_supervisor.cdc", + [60.0 * 10.0, UInt8(1), UInt64(800), true], + flowYieldVaultsAccount + ) + Test.expect(scheduleSupervisorRes, Test.beSucceeded()) + + let supervisorTicks = 3 + idx = 0 + while idx < supervisorTicks { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + idx = idx + 1 + } + + let recoveredEvents = Test.eventsOfType(Type()) + let detectedEvents = Test.eventsOfType(Type()) + log("Recovered events after supervisor ticks: \(recoveredEvents.length.toString())") + log("Detected events after supervisor ticks: \(detectedEvents.length.toString())") + + // These are the core regression assertions. On the current implementation they fail, + // because the Supervisor keeps rescanning the same non-recurring tail entries and never + // reaches the real stuck recurring vault behind them. + Test.assert( + detectedEvents.length > detectedEventsBefore, + message: "Supervisor should eventually detect the stuck recurring vault instead of rescanning the same non-recurring tail entries forever" + ) + Test.assert( + recoveredEvents.length > recoveredEventsBefore, + message: "Supervisor should eventually recover the stuck recurring vault even when more than MAX_BATCH_SIZE non-recurring entries occupy the tail" + ) + Test.assertEqual(false, isStuckYieldVault(recurringYieldVaultID)) + Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) +} From 8447480f21cdb9926f150b7ec5e59356c47d7451 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Wed, 11 Mar 2026 00:21:09 -0400 Subject: [PATCH 02/13] fix: avoid duplicate supervisor recoveries --- .../FlowYieldVaultsAutoBalancers.cdc | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index c9325872..770d4d25 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -68,11 +68,20 @@ access(all) contract FlowYieldVaultsAutoBalancers { return nil } - /// Checks if an AutoBalancer has at least one active (Scheduled) transaction. + /// Checks if an AutoBalancer has at least one active internally-managed transaction. /// Used by Supervisor to detect stuck yield vaults that need recovery. /// + /// A transaction is considered active when it is: + /// - still `Scheduled`, or + /// - already marked `Executed` by FlowTransactionScheduler, but the AutoBalancer has not + /// yet advanced its last rebalance timestamp past that transaction's scheduled time. + /// + /// The second case matters because FlowTransactionScheduler flips status to `Executed` + /// before the handler actually runs. Without treating that in-flight window as active, + /// the Supervisor can falsely classify healthy vaults as stuck and recover them twice. + /// /// @param id: The yield vault/AutoBalancer ID - /// @return Bool: true if there's at least one Scheduled transaction, false otherwise + /// @return Bool: true if there's at least one active internally-managed transaction, false otherwise /// access(all) fun hasActiveSchedule(id: UInt64): Bool { let autoBalancer = self.borrowAutoBalancer(id: id) @@ -80,10 +89,20 @@ access(all) contract FlowYieldVaultsAutoBalancers { return false } + let lastRebalanceTimestamp = autoBalancer!.getLastRebalanceTimestamp() let txnIDs = autoBalancer!.getScheduledTransactionIDs() for txnID in txnIDs { - if autoBalancer!.borrowScheduledTransaction(id: txnID)?.status() == FlowTransactionScheduler.Status.Scheduled { - return true + if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) { + if let status = scheduledTxn.status() { + if status == FlowTransactionScheduler.Status.Scheduled { + return true + } + + if status == FlowTransactionScheduler.Status.Executed + && scheduledTxn.timestamp > lastRebalanceTimestamp { + return true + } + } } } return false @@ -110,7 +129,7 @@ access(all) contract FlowYieldVaultsAutoBalancers { return false // Not configured for recurring, can't be "stuck" } - // Check if there's an active schedule + // Check if there's an active schedule or an in-flight due execution if self.hasActiveSchedule(id: id) { return false // Has active schedule, not stuck } From 66ccb02dc8ccab717307af6546374cbc31ce6fa6 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Wed, 11 Mar 2026 00:55:32 -0400 Subject: [PATCH 03/13] fix: restrict supervisor stuck scan to recurring vaults --- .../FlowYieldVaultsAutoBalancers.cdc | 10 +++- .../FlowYieldVaultsSchedulerRegistry.cdc | 53 +++++++++++++------ .../contracts/FlowYieldVaultsSchedulerV1.cdc | 5 +- docs/IMPLEMENTATION_SUMMARY.md | 8 +-- docs/SCHEDULED_REBALANCING_GUIDE.md | 4 +- ...autobalancer-restart-recurring-proposal.md | 2 +- docs/rebalancing_architecture.md | 11 ++-- 7 files changed, 60 insertions(+), 33 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index 770d4d25..bf857ba2 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -232,8 +232,14 @@ access(all) contract FlowYieldVaultsAutoBalancers { let scheduleCap = self.account.capabilities.storage .issue(storagePath) - // Register yield vault in registry for global mapping of live yield vault IDs - FlowYieldVaultsSchedulerRegistry.register(yieldVaultID: uniqueID.id, handlerCap: handlerCap, scheduleCap: scheduleCap) + // Register yield vault in registry for global mapping of live yield vault IDs. + // Only recurring vaults participate in stuck-scan ordering. + FlowYieldVaultsSchedulerRegistry.register( + yieldVaultID: uniqueID.id, + handlerCap: handlerCap, + scheduleCap: scheduleCap, + participatesInStuckScan: recurringConfig != nil + ) // Start the native AutoBalancer self-scheduling chain if recurringConfig was provided // This schedules the first rebalance; subsequent ones are scheduled automatically diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index a8bdbe97..6c26b201 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -16,6 +16,7 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /* --- TYPES --- */ /// Node in the simulated doubly-linked list used for O(1) stuck-scan ordering. + /// Only recurring, scan-eligible vaults participate in this list. /// `prev` points toward the head (most recently executed); `next` points toward the tail (oldest/least recently executed). access(all) struct ListNode { access(all) var prev: UInt64? @@ -79,10 +80,11 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /// Stored as a dictionary for O(1) add/remove; iteration gives the pending set access(self) var pendingQueue: {UInt64: Bool} - /// Simulated doubly-linked list for O(1) stuck-scan ordering. - /// listHead = most recently executed vault ID (or nil if empty). - /// listTail = least recently executed vault ID — getStuckScanCandidates walks from here. - /// On reportExecution a vault is snipped from its current position and moved to head in O(1). + /// Simulated doubly-linked list for O(1) stuck-scan ordering across recurring scan participants. + /// listHead = most recently executed recurring vault ID (or nil if empty). + /// listTail = least recently executed recurring vault ID — getStuckScanCandidates walks from here. + /// On reportExecution a recurring participant is snipped from its current position and moved to head in O(1). + /// If a vault later disables recurring config, its stale list entry is pruned lazily during candidate walks. access(self) var listNodes: {UInt64: ListNode} access(self) var listHead: UInt64? access(self) var listTail: UInt64? @@ -136,10 +138,12 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /* --- ACCOUNT-LEVEL FUNCTIONS --- */ /// Register a YieldVault and store its handler and schedule capabilities (idempotent) + /// `participatesInStuckScan` should be true only for vaults that currently have recurring config. access(account) fun register( yieldVaultID: UInt64, handlerCap: Capability, - scheduleCap: Capability + scheduleCap: Capability, + participatesInStuckScan: Bool ) { pre { handlerCap.check(): "Invalid handler capability provided for yieldVaultID \(yieldVaultID)" @@ -148,20 +152,23 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { self.yieldVaultRegistry[yieldVaultID] = true self.handlerCaps[yieldVaultID] = handlerCap self.scheduleCaps[yieldVaultID] = scheduleCap - // New vaults go to the head; they haven't executed yet but are freshly registered. + + // Only recurring vaults participate in stuck-scan ordering. // If already in the list (idempotent re-register), remove first to avoid duplicates. if self.listNodes[yieldVaultID] != nil { self._listRemove(id: yieldVaultID) } - self._listInsertAtHead(id: yieldVaultID) + if participatesInStuckScan { + self._listInsertAtHead(id: yieldVaultID) + } emit YieldVaultRegistered(yieldVaultID: yieldVaultID) } - /// Called on every execution. Moves yieldVaultID to the head (most recently executed) - /// so the Supervisor scans from the tail (least recently executed) for stuck detection — O(1). - /// If the list entry is unexpectedly missing, reinsert it to restore the ordering structure. + /// Called on every execution. Moves scan-participating yieldVaultID to the head + /// (most recently executed) so the Supervisor scans recurring participants from the tail + /// (least recently executed) for stuck detection — O(1). access(account) fun reportExecution(yieldVaultID: UInt64) { - if !(self.yieldVaultRegistry[yieldVaultID] ?? false) { + if !(self.yieldVaultRegistry[yieldVaultID] ?? false) || self.listNodes[yieldVaultID] == nil { return } let _ = self._listRemove(id: yieldVaultID) @@ -270,18 +277,30 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { return self.pendingQueue.length } - /// Returns up to `limit` vault IDs starting from the tail (least recently executed). + /// Returns up to `limit` recurring scan participants starting from the tail + /// (least recently executed among recurring participants). + /// Stale entries whose recurring config has been removed are pruned lazily as the walk proceeds. /// Supervisor should only scan these for stuck detection instead of all registered vaults. /// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE) access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] { var result: [UInt64] = [] var current = self.listTail - var count: UInt = 0 - while count < limit { + while UInt(result.length) < limit { if let id = current { - result.append(id) - current = self.listNodes[id]?.prev - count = count + 1 + let previous = self.listNodes[id]?.prev + let scheduleCap = self.scheduleCaps[id] + let isRecurringParticipant = + scheduleCap != nil + && scheduleCap!.check() + && scheduleCap!.borrow()?.getRecurringConfig() != nil + + if isRecurringParticipant { + result.append(id) + } else { + self.dequeuePending(yieldVaultID: id) + let _ = self._listRemove(id: id) + } + current = previous } else { break } diff --git a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc index 27aceb70..ff17ee04 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc @@ -160,7 +160,8 @@ access(all) contract FlowYieldVaultsSchedulerV1 { /// Detects and recovers stuck yield vaults by directly calling their scheduleNextRebalance(). /// /// Detection methods: - /// 1. State-based: Scans for registered yield vaults with no active schedule that are overdue + /// 1. State-based: Scans recurring yield vaults in stuck-scan order for candidates with + /// no active schedule that are overdue /// /// Recovery method: /// - Uses Schedule capability to call AutoBalancer.scheduleNextRebalance() directly @@ -172,7 +173,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 { /// "priority": UInt8 (0=High,1=Medium,2=Low) - for Supervisor self-rescheduling /// "executionEffort": UInt64 - for Supervisor self-rescheduling /// "recurringInterval": UFix64 (for Supervisor self-rescheduling) - /// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed vaults for stuck ones) + /// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed recurring scan participants for stuck ones) /// } access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) { let cfg = data as? {String: AnyStruct} ?? {} diff --git a/docs/IMPLEMENTATION_SUMMARY.md b/docs/IMPLEMENTATION_SUMMARY.md index b6a73b31..c81ab1e7 100644 --- a/docs/IMPLEMENTATION_SUMMARY.md +++ b/docs/IMPLEMENTATION_SUMMARY.md @@ -22,7 +22,7 @@ This document reflects the current scheduler architecture in this repository. 2. Direct AutoBalancer capabilities, with no scheduling wrapper layer. 3. Native self-scheduling for healthy recurring AutoBalancers. 4. Recovery-only Supervisor with bounded scanning and bounded pending-queue processing. -5. LRU stuck-scan ordering, so the longest-idle vaults are checked first. +5. LRU stuck-scan ordering across recurring scan participants, so the longest-idle recurring vaults are checked first. ### Main Components @@ -40,7 +40,7 @@ FlowYieldVaults Account | +-- handlerCaps | +-- scheduleCaps | +-- pendingQueue - | +-- listNodes / listHead / listTail (LRU stuck-scan order) + | +-- listNodes / listHead / listTail (LRU stuck-scan order for recurring participants) | +-- supervisorCap | +-- FlowYieldVaultsSchedulerV1 @@ -75,7 +75,7 @@ FlowYieldVaults Account Each Supervisor run has two bounded steps: 1. Stuck detection: - - reads up to `MAX_BATCH_SIZE` least-recently-executed vault IDs from `getStuckScanCandidates(...)` + - reads up to `MAX_BATCH_SIZE` least-recently-executed recurring-participant vault IDs from `getStuckScanCandidates(...)` - checks whether each candidate is overdue and lacks an active schedule - enqueues stuck vaults into `pendingQueue` @@ -98,7 +98,7 @@ If the Supervisor itself is configured with a recurring interval, it self-resche - registered vault tracking - pending queue - handler/schedule capability storage - - LRU stuck-scan ordering + - LRU stuck-scan ordering for recurring participants - `FlowYieldVaultsSchedulerV1.cdc` - Supervisor recovery handler diff --git a/docs/SCHEDULED_REBALANCING_GUIDE.md b/docs/SCHEDULED_REBALANCING_GUIDE.md index 687f176b..c47fb5a4 100644 --- a/docs/SCHEDULED_REBALANCING_GUIDE.md +++ b/docs/SCHEDULED_REBALANCING_GUIDE.md @@ -162,10 +162,10 @@ flow scripts execute cadence/scripts/flow-yield-vaults/get_pending_count.cdc ### What It Does The Supervisor handles two recovery scenarios per run: -1. **Stuck detection**: Scans up to `MAX_BATCH_SIZE` vault candidates using `getStuckScanCandidates()`, which returns vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. +1. **Stuck detection**: Scans up to `MAX_BATCH_SIZE` recurring scan participants using `getStuckScanCandidates()`, which returns recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. 2. **Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run). When scheduled with a recurring interval, the Supervisor keeps self-rescheduling even if a given run finds no work. -Each AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed tail, the Supervisor still prioritises the longest-idle vaults first. +Each recurring AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed recurring tail, the Supervisor prioritises the longest-idle recurring vaults first. ### When It's Needed diff --git a/docs/autobalancer-restart-recurring-proposal.md b/docs/autobalancer-restart-recurring-proposal.md index 6aa42dcb..111b5fed 100644 --- a/docs/autobalancer-restart-recurring-proposal.md +++ b/docs/autobalancer-restart-recurring-proposal.md @@ -2,7 +2,7 @@ > Historical note: this proposal describes the recovery design that was later implemented. > Current code names are `FlowYieldVaultsSchedulerV1` and `FlowYieldVaultsSchedulerRegistry`. -> Current stuck detection scans up to `MAX_BATCH_SIZE` least-recently-executed vaults from +> Current stuck detection scans up to `MAX_BATCH_SIZE` least-recently-executed recurring scan participants from > the registry's LRU ordering, not the full registered set. ## Problem Statement diff --git a/docs/rebalancing_architecture.md b/docs/rebalancing_architecture.md index 5d2acd8d..81972ed6 100644 --- a/docs/rebalancing_architecture.md +++ b/docs/rebalancing_architecture.md @@ -45,7 +45,7 @@ - `yieldVaultRegistry`: registered yield vault IDs - `handlerCaps`: direct capabilities to AutoBalancers (no wrapper) - `pendingQueue`: yield vaults needing (re)seeding; processing is bounded by `MAX_BATCH_SIZE = 5` per Supervisor run - - `stuckScanOrder`: LRU-ordered list of vault IDs for stuck detection; vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor always scans the longest-idle vaults first + - `stuckScanOrder`: LRU-ordered list of recurring scan participants for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor scans the longest-idle recurring vaults first - `supervisorCap`: capability for Supervisor self-scheduling - **FlowYieldVaultsSchedulerV1** provides: - `Supervisor`: recovery handler for failed schedules @@ -125,7 +125,8 @@ When `_initNewAutoBalancer()` is called: FlowYieldVaultsSchedulerRegistry.register( yieldVaultID: uniqueID.id, handlerCap: handlerCap, - scheduleCap: scheduleCap + scheduleCap: scheduleCap, + participatesInStuckScan: recurringConfig != nil ) autoBalancerRef.scheduleNextRebalance(whileExecuting: nil) ``` @@ -162,7 +163,7 @@ fun executeTransaction(id: UInt64, data: AnyStruct?) { The Supervisor runs two steps per execution: **Step 1 – Stuck detection** (when `scanForStuck == true`): -Fetches up to `MAX_BATCH_SIZE` candidates from `getStuckScanCandidates(limit:)`, which returns vault IDs starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. +Fetches up to `MAX_BATCH_SIZE` candidates from `getStuckScanCandidates(limit:)`, which returns recurring scan participants starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. **Step 2 – Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPendingYieldVaultIDsPaginated(page: 0, size: UInt(MAX_BATCH_SIZE))`). @@ -170,7 +171,7 @@ Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPending ```cadence access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) { - // STEP 1: scan least-recently-executed vaults for stuck detection + // STEP 1: scan least-recently-executed recurring participants for stuck detection let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE)) for yieldVaultID in candidates { @@ -195,7 +196,7 @@ fun executeTransaction(id: UInt64, data: AnyStruct?) { } ``` -Each AutoBalancer sets a shared `RegistryReportCallback` capability at creation time. On every execution it calls `FlowYieldVaultsSchedulerRegistry.reportExecution(yieldVaultID:)`, which moves the vault to the head of `stuckScanOrder` so the least-recently-executed tail remains the next stuck-scan priority. +Each AutoBalancer sets a shared `RegistryReportCallback` capability at creation time. On every execution, recurring scan participants call `FlowYieldVaultsSchedulerRegistry.reportExecution(yieldVaultID:)`, which moves the vault to the head of `stuckScanOrder` so the least-recently-executed recurring tail remains the next stuck-scan priority. --- From 9284c7eb273856a5c4c381e4c601c953f3641c8f Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 12:12:12 -0400 Subject: [PATCH 04/13] docs: clarify scheduler registry semantics --- cadence/contracts/FlowYieldVaultsAutoBalancers.cdc | 4 ++-- .../contracts/FlowYieldVaultsSchedulerRegistry.cdc | 11 +++++++---- docs/IMPLEMENTATION_SUMMARY.md | 4 ++-- docs/rebalancing_architecture.md | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index 2b427380..780eefc4 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -245,8 +245,8 @@ access(all) contract FlowYieldVaultsAutoBalancers { let scheduleCap = self.account.capabilities.storage .issue(storagePath) - // Register yield vault in registry for global mapping of live yield vault IDs. - // Only recurring vaults participate in stuck-scan ordering. + // Register the yield vault in the global scheduler registry. + // Only recurring vaults participate in the Supervisor's stuck-scan ordering. FlowYieldVaultsSchedulerRegistry.register( yieldVaultID: uniqueID.id, handlerCap: handlerCap, diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index 2f8960b2..73feee5f 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -5,11 +5,12 @@ import "UInt64LinkedList" /// FlowYieldVaultsSchedulerRegistry /// -/// Stores registry of YieldVault IDs and their handler capabilities for scheduling. +/// Stores the global registry of live YieldVault IDs and their scheduling capabilities. /// This contract maintains: -/// - A registry of all yield vault IDs that participate in scheduled rebalancing +/// - A registry of all live yield vault IDs known to the scheduler infrastructure /// - Handler capabilities (AutoBalancer capabilities) for each yield vault /// - A pending queue for yield vaults that need initial seeding or re-seeding +/// - A recurring-only stuck-scan ordering used by the Supervisor /// - The global Supervisor capability for recovery operations /// access(all) contract FlowYieldVaultsSchedulerRegistry { @@ -48,7 +49,8 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /* --- STATE --- */ - /// Registry of all yield vault IDs that participate in scheduling + /// Registry of all live yield vault IDs known to the scheduler infrastructure. + /// This is broader than the recurring-only stuck-scan ordering. access(self) var yieldVaultRegistry: {UInt64: Bool} /// Handler capabilities (AutoBalancer) for each yield vault - keyed by yield vault ID @@ -90,7 +92,8 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { self.handlerCaps[yieldVaultID] = handlerCap self.scheduleCaps[yieldVaultID] = scheduleCap - // Only recurring vaults participate in stuck-scan ordering. + // The registry tracks all live yield vaults, but only recurring vaults + // participate in the Supervisor's stuck-scan ordering. // If already in the list (idempotent re-register), remove first to avoid duplicates. let list = self._list() if list.contains(id: yieldVaultID) { diff --git a/docs/IMPLEMENTATION_SUMMARY.md b/docs/IMPLEMENTATION_SUMMARY.md index c81ab1e7..72f8a395 100644 --- a/docs/IMPLEMENTATION_SUMMARY.md +++ b/docs/IMPLEMENTATION_SUMMARY.md @@ -36,11 +36,11 @@ FlowYieldVaults Account | +-- Starts first native schedule | +-- FlowYieldVaultsSchedulerRegistry - | +-- yieldVaultRegistry: {UInt64: Bool} + | +-- yieldVaultRegistry: {UInt64: Bool} (all live yield vault IDs) | +-- handlerCaps | +-- scheduleCaps | +-- pendingQueue - | +-- listNodes / listHead / listTail (LRU stuck-scan order for recurring participants) + | +-- listNodes / listHead / listTail (LRU recurring-only stuck-scan order) | +-- supervisorCap | +-- FlowYieldVaultsSchedulerV1 diff --git a/docs/rebalancing_architecture.md b/docs/rebalancing_architecture.md index 81972ed6..d0ddf69f 100644 --- a/docs/rebalancing_architecture.md +++ b/docs/rebalancing_architecture.md @@ -42,10 +42,10 @@ ### FlowYieldVaultsSchedulerV1 + FlowYieldVaultsSchedulerRegistry - **FlowYieldVaultsSchedulerRegistry** stores: - - `yieldVaultRegistry`: registered yield vault IDs + - `yieldVaultRegistry`: all live yield vault IDs known to scheduler infrastructure - `handlerCaps`: direct capabilities to AutoBalancers (no wrapper) - `pendingQueue`: yield vaults needing (re)seeding; processing is bounded by `MAX_BATCH_SIZE = 5` per Supervisor run - - `stuckScanOrder`: LRU-ordered list of recurring scan participants for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor scans the longest-idle recurring vaults first + - `stuckScanOrder`: LRU-ordered recurring-only subset used for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor scans the longest-idle recurring vaults first - `supervisorCap`: capability for Supervisor self-scheduling - **FlowYieldVaultsSchedulerV1** provides: - `Supervisor`: recovery handler for failed schedules From 999cd1dceedbdf7ea899df5a37f92590ec221bf8 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 12:28:37 -0400 Subject: [PATCH 05/13] fix: bound optimistic execution recovery window --- cadence/contracts/FlowYieldVaultsAutoBalancers.cdc | 12 ++++++++---- .../flow-yield-vaults/has_active_schedule.cdc | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index 780eefc4..86d0c9a5 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -86,12 +86,15 @@ access(all) contract FlowYieldVaultsAutoBalancers { /// /// A transaction is considered active when it is: /// - still `Scheduled`, or - /// - already marked `Executed` by FlowTransactionScheduler, but the AutoBalancer has not - /// yet advanced its last rebalance timestamp past that transaction's scheduled time. + /// - already marked `Executed` by FlowTransactionScheduler and still within a bounded + /// grace period after its scheduled timestamp. /// /// The second case matters because FlowTransactionScheduler flips status to `Executed` /// before the handler actually runs. Without treating that in-flight window as active, /// the Supervisor can falsely classify healthy vaults as stuck and recover them twice. + /// But that window must be bounded: if the handler panics after the optimistic status + /// update, the vault must eventually become recoverable instead of remaining "active" + /// forever. /// /// @param id: The yield vault/AutoBalancer ID /// @return Bool: true if there's at least one active internally-managed transaction, false otherwise @@ -102,7 +105,8 @@ access(all) contract FlowYieldVaultsAutoBalancers { return false } - let lastRebalanceTimestamp = autoBalancer!.getLastRebalanceTimestamp() + let currentTimestamp = getCurrentBlock().timestamp + let optimisticExecutionGracePeriod: UFix64 = 5.0 let txnIDs = autoBalancer!.getScheduledTransactionIDs() for txnID in txnIDs { if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) { @@ -112,7 +116,7 @@ access(all) contract FlowYieldVaultsAutoBalancers { } if status == FlowTransactionScheduler.Status.Executed - && scheduledTxn.timestamp > lastRebalanceTimestamp { + && currentTimestamp <= scheduledTxn.timestamp + optimisticExecutionGracePeriod { return true } } diff --git a/cadence/scripts/flow-yield-vaults/has_active_schedule.cdc b/cadence/scripts/flow-yield-vaults/has_active_schedule.cdc index f8cb77ed..d13b3276 100644 --- a/cadence/scripts/flow-yield-vaults/has_active_schedule.cdc +++ b/cadence/scripts/flow-yield-vaults/has_active_schedule.cdc @@ -1,12 +1,12 @@ import "FlowYieldVaultsAutoBalancers" -/// Returns true if the yield vault/AutoBalancer has at least one active (Scheduled) transaction. -/// Used to verify that healthy yield vaults maintain their scheduling chain. +/// Returns true if the yield vault/AutoBalancer has at least one active internally-managed +/// transaction. Active includes `Scheduled`, plus a recently `Executed` transaction still +/// within the optimistic-execution grace period. /// /// @param yieldVaultID: The YieldVault/AutoBalancer ID -/// @return Bool: true if there's at least one Scheduled transaction, false otherwise +/// @return Bool: true if there's at least one active internally-managed transaction, false otherwise /// access(all) fun main(yieldVaultID: UInt64): Bool { return FlowYieldVaultsAutoBalancers.hasActiveSchedule(id: yieldVaultID) } - From 5959f87c5bc58b0cd74320261864f9bf902c2800 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 14:26:41 -0400 Subject: [PATCH 06/13] fix: bound supervisor stuck-scan pruning work --- .../contracts/FlowYieldVaultsSchedulerRegistry.cdc | 13 ++++++++----- .../scheduler_mixed_population_regression_test.cdc | 13 ++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index 73feee5f..4b13f5cd 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -219,17 +219,20 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { return self.pendingQueue.length } - /// Returns up to `limit` recurring scan participants starting from the tail - /// (least recently executed among recurring participants). - /// Stale entries whose recurring config has been removed are pruned lazily as the walk proceeds. + /// Inspects up to `limit` tail entries from the recurring stuck-scan ordering and returns + /// the recurring participants encountered. + /// Stale entries whose recurring config has been removed are pruned lazily as the bounded + /// walk proceeds, so repeated calls keep making forward progress without unbounded work. /// Supervisor should only scan these for stuck detection instead of all registered vaults. - /// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE) + /// @param limit: Maximum number of tail entries to inspect in this call access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] { let list = self._list() var result: [UInt64] = [] + var inspected: UInt = 0 var current = list.tail - while UInt(result.length) < limit { + while inspected < limit { if let id = current { + inspected = inspected + 1 let previous = list.nodes[id]?.prev let scheduleCap = self.scheduleCaps[id] let isRecurringParticipant = diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc index 036758ae..a69a35f2 100644 --- a/cadence/tests/scheduler_mixed_population_regression_test.cdc +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -17,8 +17,7 @@ /// /// This file exists to lock that failure mode down as a regression. The main test below /// intentionally builds that mixed population and asserts the Supervisor should still find -/// the real stuck vault. On the current implementation, that assertion fails, which is the -/// exact bug this test is meant to expose until the scheduler logic is fixed. +/// the real stuck vault after bounded lazy pruning advances through the stale tail entries. import Test import BlockchainHelpers @@ -201,8 +200,8 @@ fun testSupervisorScansPastNonRecurringTailEntries() { Test.assert(foundRecurringTarget, message: "Failed to identify the recurring target yield vault") Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) - // Sanity check the test setup: the first scan batch should contain only blocker IDs, - // proving the real recurring target starts behind the tail window the Supervisor reads. + // Sanity check the test setup: one bounded scan should not reach the recurring target yet, + // because its inspection budget is spent pruning the non-recurring blockers at the tail. let initialCandidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE) ) @@ -276,9 +275,9 @@ fun testSupervisorScansPastNonRecurringTailEntries() { log("Recovered events after supervisor ticks: \(recoveredEvents.length.toString())") log("Detected events after supervisor ticks: \(detectedEvents.length.toString())") - // These are the core regression assertions. On the current implementation they fail, - // because the Supervisor keeps rescanning the same non-recurring tail entries and never - // reaches the real stuck recurring vault behind them. + // These are the core regression assertions. A correct implementation should prune through + // the stale non-recurring tail over repeated bounded scans, then detect and recover the + // real recurring vault behind it. Test.assert( detectedEvents.length > detectedEventsBefore, message: "Supervisor should eventually detect the stuck recurring vault instead of rescanning the same non-recurring tail entries forever" From d773f55bf5e466e7796a196b4702dc297f11e35e Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 14:29:22 -0400 Subject: [PATCH 07/13] test: fix mixed-population supervisor regression --- ...duler_mixed_population_regression_test.cdc | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc index a69a35f2..c711e0c4 100644 --- a/cadence/tests/scheduler_mixed_population_regression_test.cdc +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -200,22 +200,6 @@ fun testSupervisorScansPastNonRecurringTailEntries() { Test.assert(foundRecurringTarget, message: "Failed to identify the recurring target yield vault") Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) - // Sanity check the test setup: one bounded scan should not reach the recurring target yet, - // because its inspection budget is spent pruning the non-recurring blockers at the tail. - let initialCandidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( - limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE) - ) - Test.assert( - !initialCandidates.contains(recurringYieldVaultID), - message: "Setup failure: target recurring vault should sit behind the first stuck-scan batch" - ) - for candidate in initialCandidates { - Test.assert( - blockerIDs.contains(candidate), - message: "Setup failure: initial tail scan should contain only non-recurring blockers" - ) - } - setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 2.0) setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.5) @@ -262,16 +246,38 @@ fun testSupervisorScansPastNonRecurringTailEntries() { ) Test.expect(scheduleSupervisorRes, Test.beSucceeded()) - let supervisorTicks = 3 + // First Supervisor tick should spend its bounded inspection budget pruning the non-recurring + // blocker tail. It should not reach the stuck recurring target yet. + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + + var recoveredEvents = Test.eventsOfType(Type()) + var detectedEvents = Test.eventsOfType(Type()) + Test.assertEqual( + detectedEventsBefore, + detectedEvents.length, + message: "First Supervisor tick should only prune blocker tail entries, not detect the recurring target yet" + ) + Test.assertEqual( + recoveredEventsBefore, + recoveredEvents.length, + message: "First Supervisor tick should not recover the recurring target yet" + ) + + // Subsequent bounded scans should make forward progress and eventually reach the real + // stuck recurring vault behind the stale tail. + let remainingSupervisorTicks = 2 idx = 0 - while idx < supervisorTicks { + while idx < remainingSupervisorTicks + && detectedEvents.length == detectedEventsBefore + && recoveredEvents.length == recoveredEventsBefore { Test.moveTime(by: 60.0 * 10.0 + 10.0) Test.commitBlock() + recoveredEvents = Test.eventsOfType(Type()) + detectedEvents = Test.eventsOfType(Type()) idx = idx + 1 } - let recoveredEvents = Test.eventsOfType(Type()) - let detectedEvents = Test.eventsOfType(Type()) log("Recovered events after supervisor ticks: \(recoveredEvents.length.toString())") log("Detected events after supervisor ticks: \(detectedEvents.length.toString())") From 82cdec2144b5b7fd65c2b5b5f22676e7d73dfb05 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 16:32:04 -0400 Subject: [PATCH 08/13] fix: restore manual deferred redeem claim retry --- cadence/contracts/PMStrategiesV1.cdc | 15 +++++++++++++++ .../transactions/pm-strategies/claim_redeem.cdc | 12 ++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 cadence/tests/transactions/pm-strategies/claim_redeem.cdc diff --git a/cadence/contracts/PMStrategiesV1.cdc b/cadence/contracts/PMStrategiesV1.cdc index eb045d41..7f23581f 100644 --- a/cadence/contracts/PMStrategiesV1.cdc +++ b/cadence/contracts/PMStrategiesV1.cdc @@ -1046,6 +1046,21 @@ access(all) contract PMStrategiesV1 { ) } + /// Completes a pending deferred redemption. Permissionless - called automatically by + /// PendingRedeemHandler.executeTransaction when the timelock expires, or manually by + /// anyone to retry if the scheduled execution failed or was missed. + access(all) fun claimRedeem(yieldVaultID: UInt64) { + let handler = self._borrowHandler() + ?? panic("PendingRedeemHandler not initialized") + let scheduledClaim = handler.getScheduledClaim(id: yieldVaultID) + ?? panic("No scheduled claim for vault \(yieldVaultID)") + assert( + getCurrentBlock().timestamp >= scheduledClaim.timestamp, + message: "Timelock has not expired yet (claimable after \(scheduledClaim.timestamp))" + ) + self._claimRedeem(yieldVaultID: yieldVaultID) + } + /// Called by PendingRedeemHandler.executeTransaction when the timelock has expired. /// Redeems shares via service COA, converts underlying ERC-20 to Cadence, deposits to user's wallet. access(self) fun _claimRedeem(yieldVaultID: UInt64) { diff --git a/cadence/tests/transactions/pm-strategies/claim_redeem.cdc b/cadence/tests/transactions/pm-strategies/claim_redeem.cdc new file mode 100644 index 00000000..73554707 --- /dev/null +++ b/cadence/tests/transactions/pm-strategies/claim_redeem.cdc @@ -0,0 +1,12 @@ +import "PMStrategiesV1" + +/// Test transaction: calls the permissionless claimRedeem to complete +/// a pending deferred redemption. +/// +/// @param yieldVaultID: The yield vault ID with a pending redeem +/// +transaction(yieldVaultID: UInt64) { + execute { + PMStrategiesV1.claimRedeem(yieldVaultID: yieldVaultID) + } +} From f4c8e16fdc296b456a6c00cea6d99816d284b4a5 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 16:32:24 -0400 Subject: [PATCH 09/13] docs: align scheduler docs with scan semantics --- docs/SCHEDULED_REBALANCING_GUIDE.md | 2 +- docs/autobalancer-restart-recurring-proposal.md | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/SCHEDULED_REBALANCING_GUIDE.md b/docs/SCHEDULED_REBALANCING_GUIDE.md index c47fb5a4..e7566ac7 100644 --- a/docs/SCHEDULED_REBALANCING_GUIDE.md +++ b/docs/SCHEDULED_REBALANCING_GUIDE.md @@ -162,7 +162,7 @@ flow scripts execute cadence/scripts/flow-yield-vaults/get_pending_count.cdc ### What It Does The Supervisor handles two recovery scenarios per run: -1. **Stuck detection**: Scans up to `MAX_BATCH_SIZE` recurring scan participants using `getStuckScanCandidates()`, which returns recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. +1. **Stuck detection**: Inspects up to `MAX_BATCH_SIZE` entries from the recurring scan tail using `getStuckScanCandidates()`. The scan lazily prunes stale non-recurring entries and returns up to `MAX_BATCH_SIZE` recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. 2. **Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run). When scheduled with a recurring interval, the Supervisor keeps self-rescheduling even if a given run finds no work. Each recurring AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed recurring tail, the Supervisor prioritises the longest-idle recurring vaults first. diff --git a/docs/autobalancer-restart-recurring-proposal.md b/docs/autobalancer-restart-recurring-proposal.md index 111b5fed..937f30b5 100644 --- a/docs/autobalancer-restart-recurring-proposal.md +++ b/docs/autobalancer-restart-recurring-proposal.md @@ -42,7 +42,12 @@ Instead of modifying DeFiActions to add a `restartRecurring` flag, we use the ex let scheduleCap = self.account.capabilities.storage .issue(storagePath) - FlowYieldVaultsSchedulerRegistry.register(yieldVaultID: uniqueID.id, handlerCap: handlerCap, scheduleCap: scheduleCap) + FlowYieldVaultsSchedulerRegistry.register( + yieldVaultID: uniqueID.id, + handlerCap: handlerCap, + scheduleCap: scheduleCap, + participatesInStuckScan: recurringConfig != nil + ) ``` 2. **Supervisor Recovery** @@ -154,7 +159,8 @@ access(self) var scheduleCaps: {UInt64: Capability, - scheduleCap: Capability + scheduleCap: Capability, + participatesInStuckScan: Bool ) access(account) view fun getScheduleCap(yieldVaultID: UInt64): Capability? From 6ef8cc8c43024a459db31d9b77ad5afab9d60dca Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 16:57:48 -0400 Subject: [PATCH 10/13] test: align mixed-population regression comments --- .../tests/scheduler_mixed_population_regression_test.cdc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc index c711e0c4..e2f41ab1 100644 --- a/cadence/tests/scheduler_mixed_population_regression_test.cdc +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -6,9 +6,10 @@ /// registry for up to MAX_BATCH_SIZE candidates from the tail of the scan order, then checks /// those candidates one by one. /// -/// That optimization breaks down when the scan order contains vaults that can never become -/// stuck. Today, non-recurring vaults are still registered into the same ordering even though -/// isStuckYieldVault() immediately returns false for them. +/// That optimization breaks down when the scan order contains stale entries that can never +/// become stuck. In particular, vaults that were once recurring can remain in the ordering +/// after their recurring config is removed, even though isStuckYieldVault() immediately +/// returns false for them. /// /// This creates a liveness risk: /// - more than MAX_BATCH_SIZE non-recurring vaults can occupy the tail, From 66d5eb82020e6fc34e36f7e6405302f1b126da99 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Thu, 19 Mar 2026 17:23:53 -0400 Subject: [PATCH 11/13] fix: clarify supervisor stuck-scan mutation semantics --- .../contracts/FlowYieldVaultsSchedulerRegistry.cdc | 13 +++++++------ cadence/contracts/FlowYieldVaultsSchedulerV1.cdc | 4 +++- .../scheduler_mixed_population_regression_test.cdc | 10 ++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index 4b13f5cd..bad2fd3a 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -219,13 +219,12 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { return self.pendingQueue.length } - /// Inspects up to `limit` tail entries from the recurring stuck-scan ordering and returns - /// the recurring participants encountered. - /// Stale entries whose recurring config has been removed are pruned lazily as the bounded - /// walk proceeds, so repeated calls keep making forward progress without unbounded work. - /// Supervisor should only scan these for stuck detection instead of all registered vaults. + /// Inspects up to `limit` tail entries from the recurring stuck-scan ordering, lazily prunes + /// stale non-recurring entries, and returns the recurring participants encountered. + /// Repeated calls keep making forward progress without unbounded work. + /// This is intentionally account-restricted because it mutates registry state as it prunes. /// @param limit: Maximum number of tail entries to inspect in this call - access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] { + access(account) fun pruneAndGetStuckScanCandidates(limit: UInt): [UInt64] { let list = self._list() var result: [UInt64] = [] var inspected: UInt = 0 @@ -243,6 +242,8 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { if isRecurringParticipant { result.append(id) } else { + // Protocol invariant: once recurring config is removed from a vault, it is not + // re-enabled later. Pruned entries therefore do not need a rejoin path. self.dequeuePending(yieldVaultID: id) let _ = list.remove(id: id) } diff --git a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc index ff17ee04..4af728ed 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc @@ -187,7 +187,9 @@ access(all) contract FlowYieldVaultsSchedulerV1 { // STEP 1: State-based detection - scan for stuck yield vaults if scanForStuck { - let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates(limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE)) + let candidates = FlowYieldVaultsSchedulerRegistry.pruneAndGetStuckScanCandidates( + limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE) + ) for yieldVaultID in candidates { if FlowYieldVaultsAutoBalancers.isStuckYieldVault(id: yieldVaultID) { FlowYieldVaultsSchedulerRegistry.enqueuePending(yieldVaultID: yieldVaultID) diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc index e2f41ab1..460b5025 100644 --- a/cadence/tests/scheduler_mixed_population_regression_test.cdc +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -254,14 +254,12 @@ fun testSupervisorScansPastNonRecurringTailEntries() { var recoveredEvents = Test.eventsOfType(Type()) var detectedEvents = Test.eventsOfType(Type()) - Test.assertEqual( - detectedEventsBefore, - detectedEvents.length, + Test.assert( + detectedEvents.length == detectedEventsBefore, message: "First Supervisor tick should only prune blocker tail entries, not detect the recurring target yet" ) - Test.assertEqual( - recoveredEventsBefore, - recoveredEvents.length, + Test.assert( + recoveredEvents.length == recoveredEventsBefore, message: "First Supervisor tick should not recover the recurring target yet" ) From 8ffc8c70ac5fed9bcae4fd5e9aaab6222a937e5f Mon Sep 17 00:00:00 2001 From: liobrasil Date: Fri, 20 Mar 2026 01:05:06 -0400 Subject: [PATCH 12/13] Tighten scheduler recovery grace and docs --- cadence/contracts/FlowYieldVaultsAutoBalancers.cdc | 2 +- docs/IMPLEMENTATION_SUMMARY.md | 4 ++-- docs/SCHEDULED_REBALANCING_GUIDE.md | 4 ++-- docs/rebalancing_architecture.md | 6 +++--- docs/scheduled_rebalancing_comprehensive_analysis.md | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index 39e6a707..c2c28d2a 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -106,7 +106,7 @@ access(all) contract FlowYieldVaultsAutoBalancers { } let currentTimestamp = getCurrentBlock().timestamp - let optimisticExecutionGracePeriod: UFix64 = 5.0 + let optimisticExecutionGracePeriod: UFix64 = 15.0 let txnIDs = autoBalancer!.getScheduledTransactionIDs() for txnID in txnIDs { if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) { diff --git a/docs/IMPLEMENTATION_SUMMARY.md b/docs/IMPLEMENTATION_SUMMARY.md index 72f8a395..dd6ebec8 100644 --- a/docs/IMPLEMENTATION_SUMMARY.md +++ b/docs/IMPLEMENTATION_SUMMARY.md @@ -68,14 +68,14 @@ FlowYieldVaults Account 2. The AutoBalancer rebalances. 3. If recurring scheduling is configured, the AutoBalancer schedules its next run. 4. The shared execution callback reports success to the registry. -5. `reportExecution()` moves that vault to the most-recently-executed end of the LRU list. +5. `reportExecution()` moves that vault to the head of the LRU list (most recently executed). ### Recovery Operation Each Supervisor run has two bounded steps: 1. Stuck detection: - - reads up to `MAX_BATCH_SIZE` least-recently-executed recurring-participant vault IDs from `getStuckScanCandidates(...)` + - reads up to `MAX_BATCH_SIZE` least-recently-executed recurring-participant vault IDs from `pruneAndGetStuckScanCandidates(...)` - checks whether each candidate is overdue and lacks an active schedule - enqueues stuck vaults into `pendingQueue` diff --git a/docs/SCHEDULED_REBALANCING_GUIDE.md b/docs/SCHEDULED_REBALANCING_GUIDE.md index e7566ac7..7bf59fae 100644 --- a/docs/SCHEDULED_REBALANCING_GUIDE.md +++ b/docs/SCHEDULED_REBALANCING_GUIDE.md @@ -162,10 +162,10 @@ flow scripts execute cadence/scripts/flow-yield-vaults/get_pending_count.cdc ### What It Does The Supervisor handles two recovery scenarios per run: -1. **Stuck detection**: Inspects up to `MAX_BATCH_SIZE` entries from the recurring scan tail using `getStuckScanCandidates()`. The scan lazily prunes stale non-recurring entries and returns up to `MAX_BATCH_SIZE` recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. +1. **Stuck detection**: Inspects up to `MAX_BATCH_SIZE` entries from the recurring scan tail using `pruneAndGetStuckScanCandidates()`. The scan lazily prunes stale non-recurring entries and returns up to `MAX_BATCH_SIZE` recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. 2. **Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run). When scheduled with a recurring interval, the Supervisor keeps self-rescheduling even if a given run finds no work. -Each recurring AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed recurring tail, the Supervisor prioritises the longest-idle recurring vaults first. +Each recurring AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the head of the internal list (most recently executed). Because stuck scanning starts from the least-recently-executed recurring tail, the Supervisor prioritises the longest-idle recurring vaults first. ### When It's Needed diff --git a/docs/rebalancing_architecture.md b/docs/rebalancing_architecture.md index d0ddf69f..b5d9776e 100644 --- a/docs/rebalancing_architecture.md +++ b/docs/rebalancing_architecture.md @@ -45,7 +45,7 @@ - `yieldVaultRegistry`: all live yield vault IDs known to scheduler infrastructure - `handlerCaps`: direct capabilities to AutoBalancers (no wrapper) - `pendingQueue`: yield vaults needing (re)seeding; processing is bounded by `MAX_BATCH_SIZE = 5` per Supervisor run - - `stuckScanOrder`: LRU-ordered recurring-only subset used for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor scans the longest-idle recurring vaults first + - `stuckScanOrder`: LRU-ordered recurring-only subset used for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the head (most recently executed), so the Supervisor scans the longest-idle recurring vaults first - `supervisorCap`: capability for Supervisor self-scheduling - **FlowYieldVaultsSchedulerV1** provides: - `Supervisor`: recovery handler for failed schedules @@ -163,7 +163,7 @@ fun executeTransaction(id: UInt64, data: AnyStruct?) { The Supervisor runs two steps per execution: **Step 1 – Stuck detection** (when `scanForStuck == true`): -Fetches up to `MAX_BATCH_SIZE` candidates from `getStuckScanCandidates(limit:)`, which returns recurring scan participants starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. +Fetches up to `MAX_BATCH_SIZE` candidates from `pruneAndGetStuckScanCandidates(limit:)`, which lazily prunes stale non-recurring entries and returns recurring scan participants starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. **Step 2 – Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPendingYieldVaultIDsPaginated(page: 0, size: UInt(MAX_BATCH_SIZE))`). @@ -172,7 +172,7 @@ Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPending access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) { // STEP 1: scan least-recently-executed recurring participants for stuck detection - let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( + let candidates = FlowYieldVaultsSchedulerRegistry.pruneAndGetStuckScanCandidates( limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE)) for yieldVaultID in candidates { if FlowYieldVaultsAutoBalancers.isStuckYieldVault(id: yieldVaultID) { diff --git a/docs/scheduled_rebalancing_comprehensive_analysis.md b/docs/scheduled_rebalancing_comprehensive_analysis.md index f3e39da7..b817779e 100644 --- a/docs/scheduled_rebalancing_comprehensive_analysis.md +++ b/docs/scheduled_rebalancing_comprehensive_analysis.md @@ -7,7 +7,7 @@ > Current implementation summary: > - scheduler contract: `FlowYieldVaultsSchedulerV1` > - registry batch size: `MAX_BATCH_SIZE = 5` -> - stuck detection: bounded LRU scan via `getStuckScanCandidates(...)` +> - stuck detection: bounded LRU scan via `pruneAndGetStuckScanCandidates(...)` > - recovery: direct `Schedule` capability calls to `scheduleNextRebalance(...)` > > For current behavior and architecture, see: From 7626b37d798a42b8eb617420275fccc9a6b9aa06 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Mon, 23 Mar 2026 23:20:31 -0400 Subject: [PATCH 13/13] docs: note recurring re-enable follow-up --- cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index bad2fd3a..b6d4578b 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -222,6 +222,8 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /// Inspects up to `limit` tail entries from the recurring stuck-scan ordering, lazily prunes /// stale non-recurring entries, and returns the recurring participants encountered. /// Repeated calls keep making forward progress without unbounded work. + /// NOTE: re-enabling recurring scheduling for a previously pruned vault is not handled here. + /// TODO: add an explicit rejoin path for recurring off -> on in a follow-up PR. /// This is intentionally account-restricted because it mutates registry state as it prunes. /// @param limit: Maximum number of tail entries to inspect in this call access(account) fun pruneAndGetStuckScanCandidates(limit: UInt): [UInt64] { @@ -242,8 +244,9 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { if isRecurringParticipant { result.append(id) } else { - // Protocol invariant: once recurring config is removed from a vault, it is not - // re-enabled later. Pruned entries therefore do not need a rejoin path. + // Current behavior: removing recurring config prunes the vault from the + // stuck-scan ordering. Re-enabling recurring later still needs an explicit + // rejoin path and is intentionally left to a follow-up PR. self.dequeuePending(yieldVaultID: id) let _ = list.remove(id: id) }