diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..872c89b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @temporalio/ai-sdk diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml new file mode 100644 index 0000000..f2b89e3 --- /dev/null +++ b/.github/workflows/package-skill.yml @@ -0,0 +1,222 @@ +# ABOUTME: Packages the skill on every push to main (as a ZIP artifact) and, if the version in SKILL.md +# ABOUTME: has been bumped, creates a GitHub Release and syncs the skill contents to three plugin repos +# ABOUTME: (cursor-temporal-plugin, codex-temporal-plugin, claude-temporal-plugin) via PRs. +# ABOUTME: Required secrets (used only by the sync job for cross-repo PRs): +# ABOUTME: SKILL_T_DEV_APP_ID — the GitHub App's ID +# ABOUTME: SKILL_T_DEV_KEY — the GitHub App's private key +# ABOUTME: The app must be installed on the three plugin repos with Contents (write) and Pull Requests (write). + +name: Package and Sync Skill + +on: + push: + branches: [main] + workflow_dispatch: + +jobs: + package: + runs-on: ubuntu-latest + permissions: + contents: write + outputs: + version: ${{ steps.version.outputs.version }} + tag: ${{ steps.version.outputs.tag }} + released: ${{ steps.tag_check.outputs.exists == 'false' }} + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Read version from SKILL.md + id: version + run: | + version=$(grep '^version:' SKILL.md | sed 's/version:[[:space:]]*//') + echo "version=$version" >> "$GITHUB_OUTPUT" + echo "tag=v$version" >> "$GITHUB_OUTPUT" + + - name: Check if tag exists + id: tag_check + run: | + if git rev-parse "refs/tags/${{ steps.version.outputs.tag }}" >/dev/null 2>&1; then + echo "exists=true" >> "$GITHUB_OUTPUT" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + fi + + - name: Package skill + run: | + zip -r temporal-developer-skill.zip \ + SKILL.md \ + references/ \ + -x '*.DS_Store' + + - name: Upload 
artifact + uses: actions/upload-artifact@v7 + with: + name: temporal-developer-skill + path: temporal-developer-skill.zip + + - name: Create release + if: steps.tag_check.outputs.exists == 'false' + uses: softprops/action-gh-release@v3 + with: + tag_name: ${{ steps.version.outputs.tag }} + name: ${{ steps.version.outputs.tag }} + files: temporal-developer-skill.zip + generate_release_notes: true + + sync: + needs: package + if: needs.package.outputs.released == 'true' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + # contents: write is required by the POST /releases/generate-notes endpoint, + # even though it only returns text and doesn't actually write anything. + contents: write + strategy: + fail-fast: false + matrix: + include: + - repo: temporalio/cursor-temporal-plugin + target_path: skills/temporal-developer + - repo: temporalio/codex-temporal-plugin + target_path: plugins/temporal-developer/skills/temporal-developer + - repo: temporalio/claude-temporal-plugin + target_path: skills/temporal-developer + + steps: + - name: Generate token from GitHub App + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} + private-key: ${{ secrets.SKILL_T_DEV_KEY }} + owner: ${{ github.repository_owner }} + + - name: Checkout source + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Checkout target repo + uses: actions/checkout@v6 + with: + repository: ${{ matrix.repo }} + token: ${{ steps.app-token.outputs.token }} + path: target-repo + + - name: Sync skill contents + working-directory: target-repo + run: | + BRANCH="sync/temporal-developer-skill" + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Create or reset the sync branch based on current main. + # -B ensures the branch always starts from main's tip, even if a + # stale remote branch exists from a previously merged PR. 
+ git checkout -B "$BRANCH" origin/main + + # Remove old contents and copy current + rm -rf "${{ matrix.target_path }}/SKILL.md" \ + "${{ matrix.target_path }}/references" + cp ../SKILL.md "${{ matrix.target_path }}/" + cp -r ../references "${{ matrix.target_path }}/" + + # Check for changes against main + git add "${{ matrix.target_path }}" + if git diff --cached --quiet; then + echo "no_changes=true" >> "$GITHUB_ENV" + echo "No changes to sync" + else + echo "no_changes=false" >> "$GITHUB_ENV" + version="${{ needs.package.outputs.tag }}" + git commit -m "sync temporal-developer skill ${version} from source repo" + git push --force origin "$BRANCH" + fi + + - name: Build changelog + if: env.no_changes == 'false' + env: + GH_TOKEN: ${{ github.token }} + run: | + current_tag="${{ needs.package.outputs.tag }}" + + # Determine the base for the changelog: the version currently on the + # target repo's main branch. This represents what was last merged, so + # the changelog spans every release since then — correctly accumulating + # unmerged versions if a prior sync PR is still open. + # + # Read the old SKILL.md from git (it's been overwritten on disk by the + # sync step) via `git show origin/main:...`. + target_version=$(git -C target-repo show "origin/main:${{ matrix.target_path }}/SKILL.md" 2>/dev/null \ + | grep '^version:' | sed 's/version:[[:space:]]*//' || echo "") + + if [ -n "$target_version" ]; then + base_tag="v${target_version}" + else + base_tag="" + fi + + # Prefer GitHub's auto-generated notes for the range (nicely formatted + # with PR links and contributors). Fall back to git log if unavailable. 
+ echo "Base tag: ${base_tag:-} / Current tag: ${current_tag}" + if [ -n "$base_tag" ]; then + if notes=$(gh api \ + --method POST \ + "/repos/${{ github.repository }}/releases/generate-notes" \ + -f tag_name="${current_tag}" \ + -f previous_tag_name="${base_tag}" \ + --jq '.body') && [ -n "$notes" ]; then + echo "Using auto-generated release notes" + echo "$notes" > /tmp/changelog.md + else + echo "generate-notes API call failed or empty; falling back to git log" + git log --oneline "${base_tag}..HEAD" > /tmp/changelog.md + fi + else + echo "No base tag found; using last 20 commits" + git log --oneline -20 > /tmp/changelog.md + fi + + - name: Create or update PR + if: env.no_changes == 'false' + working-directory: target-repo + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + BRANCH="sync/temporal-developer-skill" + version="${{ needs.package.outputs.tag }}" + changelog=$(cat /tmp/changelog.md) + + # Check if a PR already exists from this branch + existing_pr=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number') + + if [ -n "$existing_pr" ]; then + echo "PR #${existing_pr} already exists — updated by the force-push" + gh pr edit "$existing_pr" \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + + This PR was updated automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). + + ## Changelog + ${changelog}" + gh pr comment "$existing_pr" --body "Updated to ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }})." 
+ pr_url=$(gh pr view "$existing_pr" --json url --jq '.url') + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Updated [PR #${existing_pr}](${pr_url})" >> "$GITHUB_STEP_SUMMARY" + else + pr_url=$(gh pr create \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + + This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). + + ## Changelog + ${changelog}") + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Created ${pr_url}" >> "$GITHUB_STEP_SUMMARY" + fi diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7092ef5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Temporal Technologies Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 9e496b5..e607dcf 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,44 @@ # Temporal Development Skill -A comprehensive skill for building Temporal applications. +A comprehensive skill for developers to use when building [Temporal](https://temporal.io/) applications. + +> [!WARNING] +> This Skill is currently in Public Preview, and will continue to evolve and improve. +> We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY) ## Installation -### As a Claude Code Plugin +### As a Plugin + +This skill is packaged as a plugin for major coding agents, which provides a simple way to install and receive future updates: + +- **Claude Code**: [temporalio/claude-temporal-plugin](https://github.com/temporalio/claude-temporal-plugin) +- **Cursor**: [temporalio/cursor-temporal-plugin](https://github.com/temporalio/cursor-temporal-plugin) +- **OpenAI Codex**: [temporalio/codex-temporal-plugin](https://github.com/temporalio/codex-temporal-plugin) + +See each repo's README for installation instructions. -1. Run `/plugin marketplace add temporalio/agent-skills` -2. Run `/plugin` to open the plugin manager -3. Select **Marketplaces** -4. Choose `temporal-marketplace` from the list -5. Select **Enable auto-update** or **Disable auto-update** -6. run `/plugin install temporal-developer@temporalio-agent-skills` -7. Restart Claude Code +### Standalone Installation -### Via `npx skills` - supports all major coding agents +If you prefer to install the skill directly without the plugin wrapper: + +#### Via `npx skills` — supports all major coding agents 1. `npx skills add temporalio/skill-temporal-developer` 2. Follow prompts -### Via manually cloning the skill repo: +#### Via manually cloning the skill repo 1. 
`mkdir -p ~/.claude/skills && git clone https://github.com/temporalio/skill-temporal-developer ~/.claude/skills/temporal-developer` -Appropriately adjust the installation directory based on your coding agent. \ No newline at end of file +Appropriately adjust the installation directory based on your coding agent. + +## Currently Supported Temporal SDK Languages + +- [x] Python ✅ +- [x] TypeScript ✅ +- [x] Go ✅ +- [x] Java ✅ +- [x] .NET ✅ +- [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) +- [x] PHP ✅ diff --git a/SKILL.md b/SKILL.md index eb8d590..0a88049 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,42 +1,29 @@ --- name: temporal-developer -description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal PHP", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. -version: 1.0.0 +description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, Java, .NET and PHP. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. +version: 0.3.2 --- # Skill: temporal-developer ## Overview -Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, and PHP. 
+Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, Go, Java, .NET and PHP. ## Core Architecture -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Temporal Cluster │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌────────────────┐ │ -│ │ Event History │ │ Task Queues │ │ Visibility │ │ -│ │ (Durable Log) │ │ (Work Router) │ │ (Search) │ │ -│ └─────────────────┘ └─────────────────┘ └────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ - ▲ - │ Poll / Complete - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Worker │ -│ ┌─────────────────────────┐ ┌──────────────────────────────┐ │ -│ │ Workflow Definitions │ │ Activity Implementations │ │ -│ │ (Deterministic) │ │ (Non-deterministic OK) │ │ -│ └─────────────────────────┘ └──────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Components:** -- **Workflows** - Durable, deterministic functions that orchestrate activities -- **Activities** - Non-deterministic operations (API calls, I/O) that can fail and retry -- **Workers** - Long-running processes that poll task queues and execute code -- **Task Queues** - Named queues connecting clients to workers +The **Temporal Cluster** is the central orchestration backend. It maintains three key subsystems: the **Event History** (a durable log of all workflow state), **Task Queues** (which route work to the right workers), and a **Visibility** store (for searching and listing workflows). There are three ways to run a Cluster: + +- **Temporal CLI dev server** — a local, single-process server started with `temporal server start-dev`. Suitable for development and testing only, not production. 
+- **Self-hosted** — you deploy and manage the Temporal server and its dependencies (e.g., database) in your own infrastructure for production use. +- **Temporal Cloud** — a fully managed production service operated by Temporal. No cluster infrastructure to manage. + +**Workers** are long-running processes that you run and manage. They poll Task Queues for work and execute your code. You might run a single Worker process on one machine during development, or run many Worker processes across a large fleet of machines in production. Each Worker hosts two types of code: + +- **Workflow Definitions** — durable, deterministic functions that orchestrate work. These must not have side effects. +- **Activity Implementations** — non-deterministic operations (API calls, file I/O, etc.) that can fail and be retried. + +Workers communicate with the Cluster via a poll/complete loop: they poll a Task Queue for tasks, execute the corresponding Workflow or Activity code, and report results back. ## History Replay: Why Determinism Matters @@ -61,69 +48,56 @@ See `references/core/determinism.md` for detailed explanation. ### Ensure Temporal CLI is installed -Check if `temporal` CLI is installed. 
If not, follow these instructions: - -#### macOS - -``` -brew install temporal -``` - -#### Linux - -Check your machine's architecture and download the appropriate archive: - -- [Linux amd64](https://temporal.download/cli/archive/latest?platform=linux&arch=amd64) -- [Linux arm64](https://temporal.download/cli/archive/latest?platform=linux&arch=arm64) - -Once you've downloaded the file, extract the downloaded archive and add the temporal binary to your PATH by copying it to a directory like /usr/local/bin - -#### Windows - -Check your machine's architecture and download the appropriate archive: - -- [Windows amd64](https://temporal.download/cli/archive/latest?platform=windows&arch=amd64) -- [Windows arm64](https://temporal.download/cli/archive/latest?platform=windows&arch=arm64) - -Once you've downloaded the file, extract the downloaded archive and add the temporal.exe binary to your PATH. +Check if `temporal` CLI is installed. If not, follow the instructions at `references/core/install_cli.md` to install it for your platform. ### Read All Relevant References 1. First, read the getting started guide for the language you are working in: - - Python -> read `references/python/python.md` - - TypeScript -> read `references/typescript/typescript.md` - - PHP -> read `references/php/php.md` + - Python -> read `references/python/python.md` + - TypeScript -> read `references/typescript/typescript.md` + - Go -> read `references/go/go.md` + - Java -> read `references/java/java.md` + - .NET (C#) -> read `references/dotnet/dotnet.md` + - PHP -> read `references/php/php.md` 2. Second, read appropriate `core` and language-specific references for the task at hand. 
- ## Primary References + - **`references/core/determinism.md`** - Why determinism matters, replay mechanics, basic concepts of activities - + Language-specific info at `references/{your_language}/determinism.md` + - Language-specific info at `references/{your_language}/determinism.md` - **`references/core/patterns.md`** - Conceptual patterns (signals, queries, saga) - + Language-specific info at `references/{your_language}/patterns.md` + - Language-specific info at `references/{your_language}/patterns.md` - **`references/core/gotchas.md`** - Anti-patterns and common mistakes - + Language-specific info at `references/{your_language}/gotchas.md` + - Language-specific info at `references/{your_language}/gotchas.md` - **`references/core/versioning.md`** - Versioning strategies and concepts - how to safely change workflow code while workflows are running - + Language-specific info at `references/{your_language}/versioning.md` + - Language-specific info at `references/{your_language}/versioning.md` - **`references/core/troubleshooting.md`** - Decision trees, recovery procedures - **`references/core/error-reference.md`** - Common error types, workflow status reference - **`references/core/interactive-workflows.md`** - Testing signals, updates, queries - **`references/core/dev-management.md`** - Dev cycle & management of server and workers - **`references/core/ai-patterns.md`** - AI/LLM pattern concepts - + Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. + - Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. + +## Task Queue Priority and Fairness + +If the developer is building a **multi-tenant application**, proactively recommend Task Queue Fairness. Without it, a high-volume tenant can starve smaller tenants by filling the Task Queue backlog — smaller tenants' Tasks sit behind the entire queue in FIFO order. 
Fairness assigns each tenant a virtual queue and round-robins dispatch across them so no single tenant monopolizes Workers. + +Priority and Fairness also apply to tiered workloads (batch vs. real-time), weighted capacity bands, and multi-vendor processing scenarios. + +- **`references/core/priority-fairness.md`** - Priority keys, fairness keys and weights, rate limiting, SDK examples, and limitations ## Additional Topics + - **`references/{your_language}/observability.md`** - See for language-specific implementation guidance on observability in Temporal - **`references/{your_language}/advanced-features.md`** - See for language-specific guidance on advanced Temporal features and language-specific features - ## Feedback ### ALWAYS PROMPT USER TO PROVIDE FEEDBACK AT STARTUP When you have **first** loaded this skill, output this message to the user: -"Thank you for trying out the prerelease of Temporal's development skill! We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY)." +"Thank you for trying out the public preview of the Temporal development skill! We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY)." Do not output this message multiple times in the same conversation. 
diff --git a/references/core/ai-patterns.md b/references/core/ai-patterns.md index 071b9f0..d680bec 100644 --- a/references/core/ai-patterns.md +++ b/references/core/ai-patterns.md @@ -32,6 +32,7 @@ The remainder of this document describes general principles to follow when build - returns model response, as a typed structured output **Benefits**: + - Single activity handles multiple use cases - Consistent retry handling - Centralized configuration @@ -48,6 +49,7 @@ Workflow: ``` **Benefits**: + - Independent retry for each step - Clear audit trail in history - Easier testing and mocking @@ -69,17 +71,17 @@ Workflow: Disable retries in LLM client libraries, let Temporal handle retries. - LLM Client Config: - - max_retries = 0 ← Disable client retries at the LLM client level + - max_retries = 0 ← Disable client retries at the LLM client level Use either the default activity retry policy, or customize it as needed for the situation. **Why**: + - Temporal retries are durable (survive crashes) - Single retry configuration point - Better visibility into retry attempts - Consistent backoff behavior - ### Pattern 5: Multi-Agent Orchestration Complex pipelines with multiple specialized agents: @@ -114,6 +116,7 @@ Deep Research Example: | Document processing | 60-120 seconds | **Rationale**: + - Reasoning models need time for complex computation - Web searches may hit rate limits requiring backoff - Fast timeouts catch stuck operations @@ -128,7 +131,6 @@ Parse rate limit info from API responses: - Response Headers: - Retry-After: 30 - X-RateLimit-Remaining: 0 - - Activity: - If rate limited: - Raise retryable error with a next retry delay @@ -137,12 +139,14 @@ Parse rate limit info from API responses: ## Error Handling ### Retryable Errors + - Rate limits (429) - Timeouts - Temporary server errors (500, 502, 503) - Network errors ### Non-Retryable Errors + - Invalid API key (401) - Invalid input/prompt - Content policy violations @@ -161,6 +165,6 @@ Parse rate limit info 
from API responses: ## Observability See `references/{your_language}/observability.md` for the language you are working in for documentation on implementing observability in Temporal. It is generally recommended to add observability for: + - Token usage, via activity logging - any else to help track LLM usage and debug agentic flows, within moderation. - diff --git a/references/core/determinism.md b/references/core/determinism.md index 5ebb54d..be931d9 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -50,22 +50,27 @@ Result: Commands don't match history → NondeterminismError ## Sources of Non-Determinism ### Time-Based Operations + - `datetime.now()`, `time.time()`, `Date.now()` - Different value on each execution ### Random Values + - `random.random()`, `Math.random()`, `uuid.uuid4()` - Different value on each execution ### External State + - Reading files, environment variables, databases, networking / HTTP calls - State may change between executions ### Non-Deterministic Iteration + - Map/dict iteration order (in some languages) - Set iteration order ### Threading/Concurrency + - Race conditions produce different outcomes - Non-deterministic ordering @@ -76,16 +81,22 @@ In Temporal, activities are the primary mechanism for making non-deterministic c For a few simple cases, like timestamps, random values, UUIDs, etc. the Temporal SDK in your language may provide durable variants that are simple to use. See `references/{your_language}/determinism.md` for the language you are working in for more info. ## SDK Protection Mechanisms -Each Temporal SDK language provides a protection mechanism to make it easier to catch non-determinism errors earlier in development: -- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls at runtime. 
+Each Temporal SDK language provides a different level of protection against non-determinism: + +- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. +- Java: The Java SDK has no sandbox. Determinism is enforced by developer conventions — the SDK provides `Workflow.*` APIs as safe alternatives (e.g., `Workflow.sleep()` instead of `Thread.sleep()`), and non-determinism is only detected at replay time via `NonDeterministicException`. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time. Cooperative threading under a global lock eliminates the need for synchronization. +- Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately apparent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. +- .NET: The .NET SDK has no sandbox. It uses a custom TaskScheduler and a runtime EventListener to detect invalid task scheduling. Developers must use `Workflow.*` safe alternatives (e.g., Workflow.DelayAsync instead of Task.Delay) and avoid non-deterministic .NET Task APIs. - PHP: The PHP SDK performs runtime checks that detect adding, removing, or reordering of commands (activity calls, timers, child workflows, etc.). It does NOT have a sandbox — developers must be disciplined about avoiding non-deterministic operations in workflow code. +Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. 
## Detecting Non-Determinism ### During Execution + - `NondeterminismError` raised when Commands don't match Events - Workflow becomes blocked until code is fixed @@ -96,13 +107,17 @@ Replay tests verify that workflows follow identical code paths when re-run, by a ## Recovery from Non-Determinism ### Accidental Change + If you accidentally introduced non-determinism: + 1. Revert code to match what's in history 2. Restart worker 3. Workflow auto-recovers ### Intentional Change + If you need to change workflow logic: + 1. Use the **Patching API** to support both old and new code paths 2. Or terminate old workflows and start new ones with updated code diff --git a/references/core/dev-management.md b/references/core/dev-management.md index 01faed0..45385d3 100644 --- a/references/core/dev-management.md +++ b/references/core/dev-management.md @@ -20,7 +20,6 @@ When you need a new worker, you should start it in the background (and preferrab **Best practice**: As far as local development goes, run only ONE worker instance with the latest code. Don't keep stale workers (running old code) around. - ### Cleanup **Always kill workers when done.** Don't leave workers running. diff --git a/references/core/error-reference.md b/references/core/error-reference.md index a0f905b..29a40b7 100644 --- a/references/core/error-reference.md +++ b/references/core/error-reference.md @@ -1,19 +1,19 @@ # Common Error Types Reference -| Error Type | Error identifier (if any) | Where to Find | What Happened | Recovery | Link to additional info (if any) +| Error Type | Error identifier (if any) | Where to Find | What Happened | Recovery | Link to additional info (if any) | |------------|---------------|---------------|---------------|----------|----------| | **Non-determinism** | TMPRL1100 | `WorkflowTaskFailed` in history | Replay doesn't match history | Analyze error first. **If accidental**: fix code to match history → restart worker. 
**If intentional v2 change**: terminate → start fresh workflow. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1100.md | | **Deadlock** | TMPRL1101 | `WorkflowTaskFailed` in history, worker logs | Workflow blocked too long (deadlock detected) | Remove blocking operations from workflow code (no I/O, no sleep, no threading locks). Use Temporal primitives instead. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1101.md | | **Unfinished handlers** | TMPRL1102 | `WorkflowTaskFailed` in history | Workflow completed while update/signal handlers still running | Ensure all handlers complete before workflow finishes. Use `workflow.wait_condition()` to wait for handler completion. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1102.md | | **Payload overflow** | TMPRL1103 | `WorkflowTaskFailed` or `ActivityTaskFailed` in history | Payload size limit exceeded (default 2MB) | Reduce payload size. Use external storage (S3, database) for large data and pass references instead. 
| https://github.com/temporalio/rules/blob/main/rules/TMPRL1103.md | -| **Workflow code bug** | | `WorkflowTaskFailed` in history | Bug in workflow logic | Fix code → Restart worker → Workflow auto-resumes | | -| **Missing workflow** | | Worker logs | Workflow not registered | Add to worker.py → Restart worker | | -| **Missing activity** | | Worker logs | Activity not registered | Add to worker.py → Restart worker | | -| **Activity bug** | | `ActivityTaskFailed` in history | Bug in activity code | Fix code → Restart worker → Auto-retries | | -| **Activity retries** | | `ActivityTaskFailed` (count >2) | Repeated failures | Fix code → Restart worker → Auto-retries | | -| **Sandbox violation** | | Worker logs | Bad imports in workflow | Fix workflow.py imports → Restart worker | | -| **Task queue mismatch** | | Workflow never starts | Different queues in starter/worker | Align task queue names | | -| **Timeout** | | Status = TIMED_OUT | Operation too slow | Increase timeout config | | +| **Workflow code bug** | | `WorkflowTaskFailed` in history | Bug in workflow logic | Fix code → Restart worker → Workflow auto-resumes | | +| **Missing workflow** | | Worker logs | Workflow not registered | Add to worker.py → Restart worker | | +| **Missing activity** | | Worker logs | Activity not registered | Add to worker.py → Restart worker | | +| **Activity bug** | | `ActivityTaskFailed` in history | Bug in activity code | Fix code → Restart worker → Auto-retries | | +| **Activity retries** | | `ActivityTaskFailed` (count >2) | Repeated failures | Fix code → Restart worker → Auto-retries | | +| **Sandbox violation** | | Worker logs | Bad imports in workflow | Fix workflow.py imports → Restart worker | | +| **Task queue mismatch** | | Workflow never starts | Different queues in starter/worker | Align task queue names | | +| **Timeout** | | Status = TIMED_OUT | Operation too slow | Increase timeout config | | ## Workflow Status Reference diff --git a/references/core/gotchas.md 
b/references/core/gotchas.md index 55b6ddb..677362f 100644 --- a/references/core/gotchas.md +++ b/references/core/gotchas.md @@ -9,6 +9,7 @@ This document provides a general overview of conceptual-level gotchas in Tempora **The Problem**: Activities may execute more than once due to retries or Worker failures. If an activity calls an external service without an idempotency key, you may charge a customer twice, send duplicate emails, or create duplicate records. **Symptoms**: + - Duplicate side effects (double charges, duplicate notifications) - Data inconsistencies after retries @@ -21,6 +22,7 @@ This document provides a general overview of conceptual-level gotchas in Tempora **The Problem**: Code in workflow functions runs on first execution AND on every replay. Any side effect (logging, notifications, metrics, etc.) will happen multiple times and non-deterministic code (IO, current time, random numbers, threading, etc.) won't replay correctly. **Symptoms**: + - Non-determinism errors - Sandbox violations, depending on SDK language - Duplicate log entries @@ -28,11 +30,12 @@ This document provides a general overview of conceptual-level gotchas in Tempora - Inflated metrics **The Fix**: + - Use Temporal replay-aware managed side effects for common, non-business logic cases: - - Temporal workflow logging - - Temporal date time - - Temporal UUID generation - - Temporal random number generation + - Temporal workflow logging + - Temporal date time + - Temporal UUID generation + - Temporal random number generation - Put all other side effects in Activities See `references/core/determinism.md` for more info. @@ -42,10 +45,12 @@ See `references/core/determinism.md` for more info. **The Problem**: If Worker A runs part of a workflow with code v1, then Worker B (with code v2) picks it up, replay may produce different Commands. 
**Symptoms**: + - Non-determinism errors after deploying new code - Errors mentioning "command mismatch" or "unexpected command" **The Fix**: + - Use Worker Versioning for production deployments - Use patching APIs - During development: kill old workers before starting new ones @@ -60,6 +65,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Using aggressive activity retry policies that give up too easily. **Symptoms**: + - Workflows failing on transient errors - Unnecessary workflow failures during brief outages @@ -72,6 +78,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Queries and update validators are read-only. Modifying state causes non-determinism on replay, and must strictly be avoided. **Symptoms**: + - State inconsistencies after workflow replay - Non-determinism errors @@ -82,6 +89,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Queries and update validators must return immediately. They cannot await activities, child workflows, timers, or conditions. **Symptoms**: + - Query / update validators timeouts - Deadlocks @@ -110,6 +118,7 @@ See language-specific gotchas for details. **The Problem**: Not testing what happens when things go wrong. **Questions to answer**: + - What happens when an Activity exhausts all retries? - What happens when a workflow is cancelled mid-execution? - What happens during a Worker restart? @@ -121,6 +130,7 @@ See language-specific gotchas for details. **The Problem**: Changing workflow code without verifying existing workflows can still replay. **Symptoms**: + - Non-determinism errors after deployment - Stuck workflows that can't make progress @@ -133,6 +143,7 @@ See language-specific gotchas for details. **The Problem**: Catching errors without proper handling hides failures. **Symptoms**: + - Silent failures - Workflows completing "successfully" despite errors - Difficult debugging @@ -144,10 +155,12 @@ See language-specific gotchas for details. 
**The Problem**: Marking transient errors as non-retryable, or permanent errors as retryable. **Symptoms**: + - Workflows failing on temporary network issues (if marked non-retryable) - Infinite retries on invalid input (if marked retryable) **The Fix**: + - **Retryable**: Network errors, timeouts, rate limits, temporary unavailability - **Non-retryable**: Invalid input, authentication failures, business rule violations, resource not found @@ -158,6 +171,7 @@ See language-specific gotchas for details. **The Problem**: When a workflow is cancelled, cleanup code after the cancellation point doesn't run unless explicitly protected. **Symptoms**: + - Resources not released after cancellation - Incomplete compensation/rollback - Leaked state @@ -169,10 +183,12 @@ See language-specific gotchas for details. **The Problem**: Activities must opt in to receive cancellation. Without proper handling, a cancelled activity continues running to completion, wasting resources. **Requirements for activity cancellation**: + 1. **Heartbeating** - Cancellation is delivered via heartbeat. Activities that don't heartbeat won't know they've been cancelled. 2. **Checking for cancellation** - Activity must explicitly check for cancellation or await a cancellation signal. **Symptoms**: + - Cancelled activities running to completion - Wasted compute on work that will be discarded - Delayed workflow cancellation @@ -184,11 +200,13 @@ See language-specific gotchas for details. **The Problem**: Temporal has built-in limits on payload sizes. Exceeding them causes workflows to fail. 
**Limits**: + - Max 2MB per individual payload - Max 4MB per gRPC message -- Max 50MB for complete workflow history (aim for <10MB in practice) +- Max 50MB for complete workflow history (aim for < 10MB in practice) **Symptoms**: + - Payload too large errors - gRPC message size exceeded errors - Workflow history growing unboundedly diff --git a/references/core/install_cli.md b/references/core/install_cli.md new file mode 100644 index 0000000..4421172 --- /dev/null +++ b/references/core/install_cli.md @@ -0,0 +1,25 @@ +# How to install Temporal CLI + +## macOS + +``` +brew install temporal +``` + +## Linux + +Check your machine's architecture and download the appropriate archive: + +- [Linux amd64](https://temporal.download/cli/archive/latest?platform=linux&arch=amd64) +- [Linux arm64](https://temporal.download/cli/archive/latest?platform=linux&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal binary to your PATH by copying it to a directory like /usr/local/bin + +## Windows + +Check your machine's architecture and download the appropriate archive: + +- [Windows amd64](https://temporal.download/cli/archive/latest?platform=windows&arch=amd64) +- [Windows arm64](https://temporal.download/cli/archive/latest?platform=windows&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal.exe binary to your PATH. \ No newline at end of file diff --git a/references/core/patterns.md b/references/core/patterns.md index 93f774d..7e7c7a3 100644 --- a/references/core/patterns.md +++ b/references/core/patterns.md @@ -2,8 +2,9 @@ ## Overview -Common patterns for building robust Temporal workflows. +Common patterns for building robust Temporal workflows. 
See the language-specific references for the language you are working in: + - `references/{language}/{language}.md` for the root level documentation for that language - `references/{language}/patterns.md` for language-specific example code of the patterns in this file. @@ -12,18 +13,21 @@ See the language-specific references for the language you are working in: **Purpose**: Send data to a running workflow asynchronously (fire-and-forget). **When to Use**: + - Human approval workflows - Adding items to a workflow's queue - Notifying workflow of external events - Live configuration updates **Characteristics**: + - Asynchronous - sender doesn't wait for response - Can mutate workflow state - Durable - signals are persisted in history - Can be sent before workflow starts (signal-with-start) **Example Flow**: + ``` Client Workflow │ │ @@ -41,12 +45,14 @@ you want the external process to Heartbeat or receive Cancellation. If this may **Purpose**: Read workflow state synchronously without modifying it. **When to Use**: + - Building dashboards showing workflow progress - Health checks and monitoring - Debugging workflow state - Exposing current status to external systems **Characteristics**: + - Synchronous - caller waits for response - Read-only - must not modify state - Not recorded in history @@ -54,6 +60,7 @@ you want the external process to Heartbeat or receive Cancellation. If this may - Can run even on completed workflows **Example Flow**: + ``` Client Workflow │ │ @@ -67,18 +74,22 @@ Client Workflow **Purpose**: Modify workflow state and receive a response synchronously. 
**When to Use**: + - Operations that need confirmation (add item, return count) - Validation before accepting changes - Replace signal+query combinations - Request-response patterns within workflow **Characteristics**: + - Synchronous - caller waits for completion - Can mutate state AND return values - Supports validators to reject invalid updates before they even get persisted into history +- **Validators must NOT mutate workflow state or block** (no activities, sleeps, or commands) — they are read-only, similar to query handlers - Recorded in history **Example Flow**: + ``` Client Workflow │ │ @@ -90,34 +101,39 @@ Client Workflow ## Child Workflows **When to Use**: + - Prevent history from growing too large - Isolate failure domains (child can fail without failing parent) - Different retry policies for different parts **Characteristics**: + - Own history (doesn't bloat parent) - Independent lifecycle options (ParentClosePolicy) - Can be cancelled independently - Results returned to parent **Parent Close Policies**: + - `TERMINATE` - Child terminated when parent closes (default) - `ABANDON` - Child continues running independently - `REQUEST_CANCEL` - Cancellation requested but not forced -**Note:** Do not need to use child workflows simply for breaking complex logic down into smaller pieces. Standard programming abstractions within a workflow can already be used for that. +**Note:** You do not need to use child workflows simply to break complex logic down into smaller pieces. Standard programming abstractions within a workflow can already be used for that. ## Continue-as-New **Purpose**: Prevent unbounded history growth by "restarting" with fresh history. 
**When to Use**: + - Long-running workflows (entity workflows, subscriptions) - Workflows with many iterations - When history approaches 10,000+ events - Periodic cleanup of accumulated state **How It Works**: + ``` Workflow (history: 10,000 events) │ @@ -135,12 +151,14 @@ New Workflow Execution (history: 0 events) **Purpose**: Distributed transactions with compensation for failures. **When to Use**: + - Multi-step operations that span services - Operations requiring rollback on failure - Financial transactions, order processing - Booking systems with multiple reservations **How It Works**: + ``` Step 1: Reserve inventory └─ Compensation: Release inventory @@ -157,6 +175,7 @@ On failure at step 3: ``` **Implementation Pattern**: + 1. Track compensation actions as you complete each step 2. On failure, execute compensations in reverse order 3. Handle compensation failures gracefully (log, alert, manual intervention) @@ -166,12 +185,14 @@ On failure at step 3: **Purpose**: Run multiple independent operations concurrently. **When to Use**: + - Processing multiple items that don't depend on each other - Calling multiple APIs simultaneously - Fan-out/fan-in patterns - Reducing total workflow duration **Patterns**: + - `Promise` / `asyncio` - Use traditional concurrency helpers (e.g. wait for all, wait for first, etc) - Partial failure handling - Continue with successful results @@ -180,12 +201,14 @@ On failure at step 3: **Purpose**: Model long-lived entities as workflows that handle events. **When to Use**: + - Subscription management - User sessions - Shopping carts - Any stateful entity receiving events over time **How It Works**: + ``` Entity Workflow (user-123) │ @@ -206,12 +229,14 @@ Entity Workflow (user-123) **Purpose**: Durable delays that survive worker restarts. 
**Use Cases**: + - Scheduled reminders - Timeout handling - Delayed actions - Polling with intervals **Characteristics**: + - Timers are durable (persisted in history) - Can be cancelled @@ -252,15 +277,16 @@ To ensure that polling_activity is restarted in a timely manner, we make sure th **Implementation**: -Define an Activty which fails (raises an exception) exactly when polling is not completed. +Define an Activity which fails (raises an exception) exactly when polling is not completed. + +The polling loop is accomplished via activity retries, by setting the following Retry options: -The polling loop is accomplised via activity retries, by setting the following Retry options: - backoff_coefficient: to 1 - initial_interval: to the polling interval (e.g. 60 seconds) This will enable the Activity to be retried exactly on the set interval. -**Advantage:** Individual Activity retries are not recorded in Workflow History, so this approach can poll for a very long time without affecting the history size. +**Advantage:** Individual Activity retries are not recorded in Workflow History, so this approach can poll for a very long time without affecting the history size. ## Idempotency Patterns @@ -284,6 +310,7 @@ Activity: charge_payment(order_id, amount) ``` **Good idempotency key sources**: + - Workflow ID (unique per workflow execution) - Business identifier (order ID, transaction ID) - Workflow ID + activity name + attempt number @@ -336,13 +363,15 @@ This ensures that on replay, already-completed steps are skipped. **Purpose**: Handle data that exceeds Temporal's payload limits without polluting workflow history. **Limits** (see `references/core/gotchas.md` for details): + - Max 2MB per individual payload - Max 4MB per gRPC message -- Max 50MB for workflow history (aim for <10MB) +- Max 50MB for workflow history (aim for < 10MB) **Key Principle**: Large data should never flow through workflow history. 
Activities read and write large data directly, passing only small references through the workflow. **Wrong Approach**: + ``` Workflow │ @@ -356,6 +385,7 @@ Workflow This defeats the purpose—large data enters workflow history multiple times. **Correct Approach**: + ``` Workflow │ @@ -368,6 +398,7 @@ Workflow The workflow only handles references (small strings). The activity does all large data operations internally. **Implementation Pattern**: + 1. Accept a reference (URL, S3 key, database ID) as activity input 2. Download/fetch the large data inside the activity 3. Process the data inside the activity @@ -375,6 +406,7 @@ The workflow only handles references (small strings). The activity does all larg 5. Return only a reference to the result **Other Strategies**: + - **Compression**: Use a PayloadCodec to compress data automatically - **Chunking**: Split large collections across multiple activities, each handling a subset @@ -383,11 +415,13 @@ The workflow only handles references (small strings). The activity does all larg **Purpose**: Enable cancellation delivery and progress tracking for long-running activities. **Why Heartbeat**: + 1. **Support activity cancellation** - Cancellations are delivered to activities via heartbeat. Activities that don't heartbeat won't know they've been cancelled. 2. **Resume progress after failure** - Heartbeat details persist across retries, allowing activities to resume where they left off. 3. **Detect stuck activities** - If an activity stops heartbeating, Temporal can time it out and retry. **How Cancellation Works**: + ``` Workflow requests activity cancellation │ @@ -410,20 +444,24 @@ Activity calls heartbeat() **Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. 
**When to Use**: + - Short operations completing in milliseconds/seconds - High-frequency calls where task queue overhead is significant - Low-latency requirements where you can't afford task queue round-trip **Characteristics**: + - Executes on the same worker that runs the workflow - No task queue round-trip (lower latency) - Still recorded in history - Should complete quickly (default timeout is short) **Trade-offs**: + - Less visibility in Temporal UI (no separate task) - Must complete on the same worker - Not suitable for long-running operations +- **Risk with consecutive local activities:** Local activity completions are only persisted when the current Workflow Task completes. Calling multiple local activities in a row (with nothing in between to yield the Workflow Task) increases the risk of losing work if the worker crashes mid-sequence. If you need a chain of operations with durable checkpoints between each step, use regular activities instead. ## Choosing Between Patterns diff --git a/references/core/priority-fairness.md b/references/core/priority-fairness.md new file mode 100644 index 0000000..cb6930e --- /dev/null +++ b/references/core/priority-fairness.md @@ -0,0 +1,340 @@ +# Task Queue Priority and Fairness + +## Overview + +Priority and Fairness control how Tasks are distributed within a Task Queue. Priority determines execution order. Fairness prevents one group of Tasks from starving others. They can be used independently or together. + +Both features are in Public Preview. Priority is free. Fairness is a paid feature in Temporal Cloud. + +## Priority + +Priority lets you control execution order within a single Task Queue by assigning a priority key (integer 1-5, lower = higher priority). Each priority level acts as a sub-queue. All priority-1 Tasks dispatch before priority-2, and so on. Tasks at the same priority level dispatch in FIFO order. + +Default priority is 3. 
Activities inherit their parent workflow's priority unless explicitly overridden. + +### When to use Priority + +Use Priority to differentiate execution order between types of work sharing a single Task Queue and Worker pool. For example, process payment-related Tasks before less time-sensitive inventory management Tasks, or ensure real-time Tasks run ahead of batch Tasks. You can also use it to run urgent Tasks immediately by assigning them priority 1. + +### CLI + +``` +temporal workflow start \ + --type ChargeCustomer \ + --task-queue my-task-queue \ + --workflow-id my-workflow-id \ + --input '{"customerId":"12345"}' \ + --priority-key 1 +``` + +### Go + +```go +workflowOptions := client.StartWorkflowOptions{ + ID: "my-workflow-id", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{PriorityKey: 1}, +} +we, err := c.ExecuteWorkflow(context.Background(), workflowOptions, MyWorkflow) +``` + +### Java + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder().setPriorityKey(1).build()) + .build(); +``` + +### Python + +```python +await client.start_workflow( + MyWorkflow.run, + "hello", + id="my-workflow-id", + task_queue="my-task-queue", + priority=Priority(priority_key=1), +) +``` + +### TypeScript + +```ts +const handle = await startWorkflow(workflows.myWorkflow, { + args: [false, 1], + priority: { priorityKey: 1 }, +}); +``` + +### .NET + +```csharp +var handle = await Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("hello"), + new StartWorkflowOptions(id: "my-workflow-id", taskQueue: "my-task-queue") + { + Priority = new Priority(1), + } +); +``` + +## Fairness + +Fairness prevents one group of Tasks from monopolizing Worker capacity. Each fairness key creates a "virtual queue" within the Task Queue. The server uses round-robin dispatch across virtual queues so no single key can block others, even with a much larger backlog. 
+ +### When to use Fairness + +Fairness solves the multi-tenant starvation problem. Without it, Tasks dispatch FIFO: if tenant-big enqueues 100k Tasks, tenant-small's 10 Tasks sit behind the entire backlog. With Fairness, each tenant gets its own virtual queue and Tasks are interleaved. + +Common scenarios: + +- **Multi-tenant applications** where large tenants should not block small ones. +- **Tiered capacity bands** where you want weighted distribution (e.g., 80% premium, 20% free) without limiting overall throughput when one band is empty. +- **Batch jobs** where some jobs run far more frequently than others. +- **Multi-vendor processing** where a few vendors generate the majority of work. + +If all your Tasks can be dispatched immediately (no backlog), you don't need Fairness. + +Fairness applies at Task dispatch time and considers each Task as having equal cost until dispatch. It does not account for Tasks currently being processed by Workers. So if you look at Tasks being processed by Workers, you might not see "fairness" across tenants — for example, if tenant-big already has Tasks being processed when tenant-small's Tasks are dispatched, it may still appear that tenant-big is using the most resources. + +### Fairness keys and weights + +A fairness key is a string, typically a tenant ID or workload category. Each unique key creates a virtual queue. + +A fairness weight (float, default 1.0) controls how often a key's Tasks are dispatched relative to others. A key with weight 2.0 dispatches twice as often as keys with weight 1.0. + +Example with three tiers: + +| Fairness Key | Weight | Share of Dispatches | +|----------------|--------|---------------------| +| premium-tier | 5.0 | 50% | +| basic-tier | 3.0 | 30% | +| free-tier | 2.0 | 20% | + +Tasks without a fairness key are grouped under an implicit empty-string key with weight 1.0. Adoption is incremental: unkeyed Tasks participate in round-robin alongside keyed Tasks. 
+ +### Using Fairness with Priority + +When combined, Priority determines which sub-queue Tasks go into (priority 1 before 2, etc.), and Fairness applies within each priority level. + +### SDK examples + +#### CLI + +``` +temporal workflow start \ + --type ChargeCustomer \ + --task-queue my-task-queue \ + --workflow-id my-workflow-id \ + --input '{"customerId":"12345"}' \ + --priority-key 1 \ + --fairness-key tenant-123 \ + --fairness-weight 2.0 +``` + +#### Go + +```go +workflowOptions := client.StartWorkflowOptions{ + ID: "my-workflow-id", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +we, err := c.ExecuteWorkflow(context.Background(), workflowOptions, MyWorkflow) +``` + +Activities: + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +ctx := workflow.WithActivityOptions(ctx, ao) +err := workflow.ExecuteActivity(ctx, MyActivity).Get(ctx, nil) +``` + +#### Java + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder() + .setPriorityKey(1) + .setFairnessKey("tenant-123") + .setFairnessWeight(2.0) + .build()) + .build(); +``` + +#### Python + +```python +await client.start_workflow( + MyWorkflow.run, + "hello", + id="my-workflow-id", + task_queue="my-task-queue", + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), +) +``` + +Activities: + +```python +await workflow.execute_activity( + say_hello, + "hi", + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), + start_to_close_timeout=timedelta(seconds=5), +) +``` + +#### TypeScript + +```ts +const handle = await startWorkflow(workflows.myWorkflow, { + args: [false, 1], + priority: { priorityKey: 1, fairnessKey: 'tenant-123', fairnessWeight: 2.0 }, 
+}); +``` + +#### .NET + +```csharp +var handle = await Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("hello"), + new StartWorkflowOptions(id: "my-workflow-id", taskQueue: "my-task-queue") + { + Priority = new Priority( + priorityKey: 1, + fairnessKey: "tenant-123", + fairnessWeight: 2.0 + ) + } +); +``` + +#### Child Workflows + +Child workflows can set their own priority and fairness, overriding the parent. + +Go: + +```go +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: "child-workflow-id", + TaskQueue: "child-task-queue", + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +ctx := workflow.WithChildOptions(ctx, cwo) +err := workflow.ExecuteChildWorkflow(ctx, MyChildWorkflow).Get(ctx, nil) +``` + +Java: + +```java +ChildWorkflowOptions childOptions = ChildWorkflowOptions.newBuilder() + .setTaskQueue("child-task-queue") + .setWorkflowId("child-workflow-id") + .setPriority(Priority.newBuilder() + .setPriorityKey(1) + .setFairnessKey("tenant-123") + .setFairnessWeight(2.0) + .build()) + .build(); +MyChildWorkflow child = Workflow.newChildWorkflowStub(MyChildWorkflow.class, childOptions); +child.run(); +``` + +Python: + +```python +await workflow.execute_child_workflow( + MyChildWorkflow.run, + "hello child", + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), +) +``` + +TypeScript: + +```ts +const handle = await startChildWorkflow(workflows.myChildWorkflow, { + args: [false, 1], + priority: { priorityKey: 1, fairnessKey: 'tenant-123', fairnessWeight: 2.0 }, +}); +``` + +.NET: + +```csharp +await Workflow.ExecuteChildWorkflowAsync( + (MyChildWorkflow wf) => wf.RunAsync("hello child"), + new() { + Priority = new( + priorityKey: 1, + fairnessKey: "tenant-123", + fairnessWeight: 2.0 + ) + } +); +``` + +### Rate limiting + +Two rate-limiting controls work alongside Fairness: + +- **`queue-rps-limit`** — overall dispatch rate for the entire Task Queue. 
+- **`fairness-key-rps-limit-default`** — per-key rate limit, scaled by weight. If the default is 10 rps and a key has weight 2.5, that key's effective limit is 25 rps. + +``` +temporal task-queue config set \ + --task-queue my-task-queue \ + --task-queue-type activity \ + --namespace my-namespace \ + --queue-rps-limit 500 \ + --queue-rps-limit-reason "overall limit" \ + --fairness-key-rps-limit-default 33.3 \ + --fairness-key-rps-limit-reason "per-key limit" +``` + +If both limits are set, the more restrictive one applies. + +### Fairness weight overrides + +You can override the weights of up to 1000 keys through the config API. When an override is set for a key, the SDK-supplied weight is ignored. Overrides are per Task Queue and type (workflow vs. activity), so set them for both if needed. + +### Enabling Fairness + +When you start using fairness keys, it switches your active Task Queues to fairness mode. Existing queued Tasks are processed before any new fairness-mode ones. + +**Temporal Cloud**: automatically enabled when you start using fairness keys. + +**Self-hosted**: set these dynamic config flags to `true`: + +- `matching.useNewMatcher` +- `matching.enableFairness` +- `matching.enableMigration` (to drain existing backlogs after enabling) + +### Limitations + +- Accuracy can degrade with a very large number of distinct fairness keys. +- Task Queue partitioning can interfere with fairness distribution. Contact Temporal Support to set a Task Queue to a single partition if needed. +- Weights apply at schedule time, not dispatch time. Changing a weight does not reorder already-backlogged Tasks. +- Fairness is not guaranteed across different Worker versions when using Worker Versioning. +- After server restarts, less-active keys may briefly dispatch new Tasks ahead of their existing backlog until ordering normalizes. 
diff --git a/references/core/troubleshooting.md b/references/core/troubleshooting.md index e4ef2cb..1df80f9 100644 --- a/references/core/troubleshooting.md +++ b/references/core/troubleshooting.md @@ -59,19 +59,15 @@ Workflow stuck in RUNNING? 1. **No worker running** - See references/core/dev-management.md - 2. **Worker on wrong task queue** - Check: Worker logs for task queue name - Fix: Start worker with matching task queue - 3. **Worker has stale code** - Check: Worker startup time vs code changes - Fix: Restart worker with updated code - 4. **Workflow waiting for signal** - Check: Workflow history for pending signals - Fix: Send expected signal or check signal sender - 5. **Activity stuck/timing out** - Check: Activity retry attempts in history - Fix: Investigate activity failure, increase timeout @@ -107,6 +103,7 @@ NondeterminismError? ### Common Causes 1. **Changed call order** + ``` # Before # After (BREAKS) await activity_a await activity_b @@ -114,28 +111,33 @@ NondeterminismError? ``` 2. **Changed call name** + ``` # Before # After (BREAKS) await process_order(...) await handle_order(...) ``` 3. **Added/removed call** + - Adding new activity mid-workflow - Removing activity that was previously called 4. **Using non-deterministic code** + - `datetime.now()` in workflow (use `workflow.now()`) - `random.random()` in workflow (use `workflow.random()`) ### Recovery **Accidental Change:** + 1. Identify the change 2. Revert code to match history 3. Restart worker 4. Workflow automatically recovers **Intentional Change:** + 1. Use patching API for gradual migration 2. Or terminate old workflows, start new ones @@ -163,11 +165,9 @@ Workflow status = FAILED? 1. **Unhandled exception in workflow** - Check error message and stack trace - Fix bug in workflow code - 2. **Activity exhausted retries** - All retry attempts failed - Check activity logs for root cause - 3. 
**Non-retryable error thrown** - Error marked as non-retryable - Intentional failure, check business logic @@ -192,7 +192,7 @@ Timeout error? ├─▶ Which timeout? │ │ │ ├─▶ Workflow timeout -│ │ └─▶ Increase timeout or optimize workflow. Better yet, consider removing the workflow timeout, as it is generally discourged unless *necessary* for your use case. +│ │ └─▶ Increase timeout or optimize workflow. Better yet, consider removing the workflow timeout, as it is generally discouraged unless *necessary* for your use case. │ │ │ ├─▶ ScheduleToCloseTimeout │ │ └─▶ Activity taking too long overall (including retries) @@ -236,11 +236,9 @@ Activity retrying repeatedly? 1. **Bug in activity code** - Fix the bug - Consider marking certain errors as non-retryable - 2. **External service down** - Retries are working as intended - Monitor service recovery - 3. **Invalid input** - Validate inputs before activity - Return non-retryable error for bad input diff --git a/references/core/versioning.md b/references/core/versioning.md index 226bb83..3081dcb 100644 --- a/references/core/versioning.md +++ b/references/core/versioning.md @@ -40,14 +40,17 @@ else: ### Three-Phase Lifecycle **Phase 1: Patch In** + - Add both old and new code paths - New workflows take new path, old workflows take old path **Phase 2: Deprecate** + - After all old workflows complete, remove old code - Keep deprecation marker for history compatibility **Phase 3: Remove** + - After all deprecated workflows complete - Remove patch entirely, only new code remains @@ -116,6 +119,7 @@ Worker v2.0 (Build ID: def456) **Build ID**: Specific code version (e.g., git commit hash) **Versioning Behaviors**: + - `PINNED` - Workflows stay on original worker version - `AUTO_UPGRADE` - Workflows can move to newer versions diff --git a/references/dotnet/advanced-features.md b/references/dotnet/advanced-features.md new file mode 100644 index 0000000..dd844d0 --- /dev/null +++ b/references/dotnet/advanced-features.md @@ -0,0 
+1,203 @@ +# .NET SDK Advanced Features + +## Schedules + +Create recurring workflow executions. + +```csharp +using Temporalio.Client.Schedules; + +var scheduleId = "daily-report"; +await client.CreateScheduleAsync( + scheduleId, + new Schedule( + Action: ScheduleActionStartWorkflow.Create( + (DailyReportWorkflow wf) => wf.RunAsync(), + new(id: "daily-report", taskQueue: "reports")), + Spec: new ScheduleSpec + { + Intervals = new List<ScheduleIntervalSpec> + { + new(Every: TimeSpan.FromDays(1)), + }, + })); + +// Manage schedules +var handle = client.GetScheduleHandle(scheduleId); +await handle.PauseAsync("Maintenance window"); +await handle.UnpauseAsync(); +await handle.TriggerAsync(); // Run immediately +await handle.DeleteAsync(); +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a `HeartbeatTimeout` on this activity, the external completer is responsible for sending heartbeats via the async handle. +If you do NOT set a `HeartbeatTimeout`, no heartbeats are required. + +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. 
+ +```csharp +using Temporalio.Activities; +using Temporalio.Client; + +[Activity] +public async Task RequestApprovalAsync(string requestId) +{ + var taskToken = ActivityExecutionContext.Current.Info.TaskToken; + + // Store task token for later completion (e.g., in database) + await StoreTaskTokenAsync(requestId, taskToken); + + // Mark this activity as waiting for external completion + throw new CompleteAsyncException(); +} + +// Later, complete the activity from another process +public async Task CompleteApprovalAsync(string requestId, bool approved) +{ + var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + // Retrieve the task token from external storage (e.g., database) + var taskToken = await GetTaskTokenAsync(requestId); + + var handle = client.GetAsyncActivityHandle(taskToken); + + // Optional: if a HeartbeatTimeout was set, you can periodically: + // await handle.HeartbeatAsync(progressDetails); + + if (approved) + await handle.CompleteAsync("approved"); + else + // You can also fail or report cancellation via the handle + await handle.FailAsync(new ApplicationFailureException("Rejected")); +} +``` + +## Worker Tuning + +Configure worker performance settings. + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + // Workflow task concurrency + MaxConcurrentWorkflowTasks = 100, + // Activity task concurrency + MaxConcurrentActivities = 100, + // Graceful shutdown timeout + GracefulShutdownTimeout = TimeSpan.FromSeconds(30), + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +## Workflow Init Attribute + +You should always put state initialization logic in the constructor of your workflow class, so that it happens before signals/updates arrive. + +Normally, your constructor must have no arguments. 
However, if you add the `[WorkflowInit]` attribute, then your constructor instead receives the same workflow arguments that `[WorkflowRun]` receives: + +```csharp +[Workflow] +public class MyWorkflow +{ + private readonly string _initialValue; + private readonly List _items = new(); + + [WorkflowInit] + public MyWorkflow(string initialValue) + { + _initialValue = initialValue; + } + + [WorkflowRun] + public async Task RunAsync(string initialValue) + { + // _initialValue and _items are already initialized + return _initialValue; + } +} +``` + +Constructor (with `[WorkflowInit]`) and `[WorkflowRun]` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. + +## Workflow Failure Exception Types + +Control which exceptions cause workflow failures vs workflow task retries. + +**Default behavior:** Only `ApplicationFailureException` fails a workflow. All other exceptions retry the workflow task forever (treated as bugs to fix with a code deployment). + +**Tip for testing:** Set `WorkflowFailureExceptionTypes` to include `Exception` so any unhandled exception fails the workflow immediately rather than retrying the workflow task forever. This surfaces bugs faster. + +### Worker-Level Configuration + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + // These exception types will fail the workflow execution (not just the task) + WorkflowFailureExceptionTypes = new[] { typeof(ArgumentException), typeof(InvalidOperationException) }, + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +## Dependency Injection + +The .NET SDK supports dependency injection via the `Temporalio.Extensions.Hosting` package, which integrates with .NET's generic host. 
+ +### Worker as Generic Host + +```csharp +using Temporalio.Extensions.Hosting; + +public class Program +{ + public static async Task Main(string[] args) + { + var host = Host.CreateDefaultBuilder(args) + .ConfigureServices(ctx => + ctx. + AddScoped(). + AddHostedTemporalWorker( + clientTargetHost: "localhost:7233", + clientNamespace: "default", + taskQueue: "my-task-queue"). + AddScopedActivities(). + AddWorkflow()) + .Build(); + await host.RunAsync(); + } +} +``` + +### Activity Dependency Injection + +As shown in the host setup above, activities can be registered with `AddScopedActivities()`, `AddSingletonActivities()`, or `AddTransientActivities()`. Activities registered this way are created via DI, allowing constructor injection: + +```csharp +public class MyActivities +{ + private readonly ILogger _logger; + private readonly IOrderRepository _repository; + + public MyActivities(ILogger logger, IOrderRepository repository) + { + _logger = logger; + _repository = repository; + } + + [Activity] + public async Task GetOrderAsync(string orderId) + { + _logger.LogInformation("Fetching order {OrderId}", orderId); + return await _repository.GetAsync(orderId); + } +} +``` + +**Note:** Dependency injection is NOT available in workflows — workflows must be self-contained for determinism. diff --git a/references/dotnet/data-handling.md b/references/dotnet/data-handling.md new file mode 100644 index 0000000..8d0bb23 --- /dev/null +++ b/references/dotnet/data-handling.md @@ -0,0 +1,217 @@ +# .NET SDK Data Handling + +## Overview + +The .NET SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. + +## Default Data Converter + +The default converter handles: + +- `null` +- `byte[]` (as binary) +- `Google.Protobuf.IMessage` instances +- Anything that `System.Text.Json` supports +- `IRawValue` as unconverted raw payloads + +## Custom Data Converter + +Customize serialization by extending `DefaultPayloadConverter`. 
For example, to use camelCase property naming: + +```csharp +using System.Text.Json; +using Temporalio.Client; +using Temporalio.Converters; + +public class CamelCasePayloadConverter : DefaultPayloadConverter +{ + public CamelCasePayloadConverter() + : base(new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase }) + { + } +} + +var client = await TemporalClient.ConnectAsync(new() +{ + TargetHost = "localhost:7233", + Namespace = "my-namespace", + DataConverter = DataConverter.Default with + { + PayloadConverter = new CamelCasePayloadConverter(), + }, +}); +``` + +## Protobuf Support + +The default data converter includes built-in support for Protocol Buffer messages via `Google.Protobuf.IMessage`. Protobuf messages are automatically serialized using proto3 JSON. + +```csharp +// Any Google.Protobuf.IMessage is automatically handled +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync(MyProtoRequest request) + { + // Protobuf messages are serialized/deserialized automatically + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessAsync(request), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +## Payload Encryption + +Encrypt sensitive workflow data using a custom `IPayloadCodec`: + +```csharp +using Temporalio.Converters; +using Google.Protobuf; + +public class EncryptionCodec : IPayloadCodec +{ + public Task> EncodeAsync( + IReadOnlyCollection payloads) => + Task.FromResult>(payloads.Select(p => + new Payload + { + Metadata = { ["encoding"] = "binary/encrypted" }, + Data = ByteString.CopyFrom(Encrypt(p.ToByteArray())), + }).ToList()); + + public Task> DecodeAsync( + IReadOnlyCollection payloads) => + Task.FromResult>(payloads.Select(p => + { + if (p.Metadata.GetValueOrDefault("encoding") != "binary/encrypted") + return p; + return Payload.Parser.ParseFrom(Decrypt(p.Data.ToByteArray())); + }).ToList()); + + private byte[] Encrypt(byte[] data) => /* your encryption 
logic */; + private byte[] Decrypt(byte[] data) => /* your decryption logic */; +} + +// Apply encryption codec +var client = await TemporalClient.ConnectAsync(new("localhost:7233") +{ + DataConverter = DataConverter.Default with + { + PayloadCodec = new EncryptionCodec(), + }, +}); +``` + +## Search Attributes + +Custom searchable fields for workflow visibility. These can be set at workflow start: + +```csharp +using Temporalio.Common; + +var handle = await client.StartWorkflowAsync( + (OrderWorkflow wf) => wf.RunAsync(order), + new(id: $"order-{order.Id}", taskQueue: "orders") + { + TypedSearchAttributes = new SearchAttributeCollection.Builder() + .Set(SearchAttributeKey.CreateKeyword("OrderId"), order.Id) + .Set(SearchAttributeKey.CreateKeyword("OrderStatus"), "pending") + .Set(SearchAttributeKey.CreateFloat("OrderTotal"), order.Total) + .Build(), + }); +``` + +Or upserted during workflow execution: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + // ... process order ... + + // Update search attribute + Workflow.UpsertTypedSearchAttributes( + SearchAttributeKey.CreateKeyword("OrderStatus").ValueSet("completed")); + return "done"; + } +} +``` + +### Querying Workflows by Search Attributes + +```csharp +await foreach (var wf in client.ListWorkflowsAsync( + "OrderStatus = \"processing\" OR OrderStatus = \"pending\"")) +{ + Console.WriteLine($"Workflow {wf.Id} is still processing"); +} +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). 
+ +```csharp +await client.ExecuteWorkflowAsync( + (OrderWorkflow wf) => wf.RunAsync(order), + new(id: $"order-{order.Id}", taskQueue: "orders") + { + Memo = new Dictionary + { + ["customer_name"] = order.CustomerName, + ["notes"] = "Priority customer", + }, + }); +``` + +```csharp +// Read memo from workflow +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + var notes = Workflow.Memo["notes"]; + // ... + } +} +``` + +## Deterministic APIs for Values + +Use these APIs within workflows for deterministic random values and UUIDs: + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // Deterministic GUID (same on replay) + var uniqueId = Workflow.NewGuid(); + + // Deterministic random (same on replay) + var value = Workflow.Random.Next(1, 100); + + // Deterministic current time + var now = Workflow.UtcNow; + + return uniqueId.ToString(); + } +} +``` + +## Best Practices + +1. Use records or classes with `System.Text.Json` support for input/output +2. Keep payloads small — see `references/core/gotchas.md` for limits +3. Encrypt sensitive data with `IPayloadCodec` +4. Use `Workflow.NewGuid()` and `Workflow.Random` for deterministic values +5. Use camelCase converter if interoperating with other SDKs diff --git a/references/dotnet/determinism-protection.md b/references/dotnet/determinism-protection.md new file mode 100644 index 0000000..8c7f331 --- /dev/null +++ b/references/dotnet/determinism-protection.md @@ -0,0 +1,51 @@ +# .NET Determinism Protection + +## Overview + +The .NET SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **runtime task detection**. Unlike the Python and TypeScript SDKs, the .NET SDK will not intercept or replace non-deterministic calls at compile time or import time. 
The SDK does provide a runtime `EventListener` that detects some invalid task scheduling, but catching all non-deterministic code requires following the rules below and testing, in particular replay tests (see `references/dotnet/testing.md`). + +## Runtime Task Detection + +By default, the .NET SDK enables an `EventListener` that monitors task events. When workflow code accidentally starts a task on the wrong scheduler (e.g., via `Task.Run`), an `InvalidWorkflowOperationException` is thrown. This causes the workflow task to fail, which will continuously retry until the code is fixed. + +```csharp +// This will be detected at runtime and fail the workflow task +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // BAD: Task.Run uses TaskScheduler.Default + await Task.Run(() => DoSomething()); + } +} +``` + +## .NET Task Determinism Rules + +Many .NET `Task` APIs implicitly use `TaskScheduler.Default`, which breaks determinism. Here are the key rules: + +**Do NOT use:** + +- `Task.Run` — uses default scheduler. Use `Workflow.RunTaskAsync`. +- `Task.ConfigureAwait(false)` — leaves current context. Use `ConfigureAwait(true)` or omit. +- `Task.Delay` / `Task.Wait` / timeout-based `CancellationTokenSource` — uses system timers. Use `Workflow.DelayAsync` / `Workflow.WaitConditionAsync`. +- `Task.WhenAny` — use `Workflow.WhenAnyAsync`. +- `Task.WhenAll` — use `Workflow.WhenAllAsync` (technically safe currently, but wrapper is recommended). +- `CancellationTokenSource.CancelAsync` — use `CancellationTokenSource.Cancel`. +- `System.Threading.Semaphore` / `SemaphoreSlim` / `Mutex` — use `Temporalio.Workflows.Semaphore` / `Mutex`. + +**Be wary of:** + +- Third-party libraries that implicitly use `TaskScheduler.Default` +- `Dataflow` blocks and similar concurrency libraries with hidden default scheduler usage + +## Best Practices + +1. **Always use `Workflow.*` alternatives** for Task operations in workflows +2. 
**Don't disable the `EventListener`** — it's on by default and catches mistakes at runtime +3. **Separate workflow and activity code** into different files/projects for clarity +4. **Use `SortedDictionary`** or sort collections before iterating — `Dictionary` iteration order is not guaranteed +5. **Test with replay** to catch non-determinism early +6. **Review third-party library usage** in workflow code for hidden default scheduler usage diff --git a/references/dotnet/determinism.md b/references/dotnet/determinism.md new file mode 100644 index 0000000..c1dbf56 --- /dev/null +++ b/references/dotnet/determinism.md @@ -0,0 +1,56 @@ +# .NET SDK Determinism + +## Overview + +The .NET SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be deterministic for replay, and determinism is enforced by developer convention and runtime task detection via an `EventListener` (see `references/dotnet/determinism-protection.md`). + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. + +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. + +```csharp +// DO NOT do these in workflows: +await Task.Run(() => { }); // Uses default scheduler +await Task.Delay(TimeSpan.FromSeconds(1)); // System timer +var now = DateTime.UtcNow; // System clock +var r = new Random().Next(); // Non-deterministic +var id = Guid.NewGuid(); // Non-deterministic +File.ReadAllText("file.txt"); // I/O +await httpClient.GetAsync("..."); // Network I/O +``` + +Most non-determinism and side effects should be wrapped in Activities. 
+ +## Safe Builtin Alternatives + +| Forbidden | Safe Alternative | +|-----------|------------------| +| `DateTime.Now` / `DateTime.UtcNow` | `Workflow.UtcNow` | +| `Random` | `Workflow.Random` | +| `Guid.NewGuid()` | `Workflow.NewGuid()` | +| `Task.Delay` | `Workflow.DelayAsync` | +| `Thread.Sleep` | `Workflow.DelayAsync` | +| `Task.Run` | `Workflow.RunTaskAsync` | +| `Task.WhenAll` | `Workflow.WhenAllAsync` | +| `Task.WhenAny` | `Workflow.WhenAnyAsync` | +| `System.Threading.Mutex` | `Temporalio.Workflows.Mutex` | +| `System.Threading.Semaphore` | `Temporalio.Workflows.Semaphore` | +| `CancellationTokenSource.CancelAsync` | `CancellationTokenSource.Cancel` | + +## Testing Replay Compatibility + +Use `WorkflowReplayer` to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/dotnet/testing.md`. + +## Best Practices + +1. Always use `Workflow.*` APIs instead of standard .NET equivalents (see table above) +2. Never use `ConfigureAwait(false)` in workflows +3. Use `SortedDictionary` or sort before iterating collections +4. Move all I/O operations (network, filesystem, database) into activities +5. Use `Workflow.Logger` instead of `Console.WriteLine` for replay-safe logging +6. Keep workflow code focused on orchestration; delegate non-deterministic work to activities +7. Test with replay after making changes to workflow definitions diff --git a/references/dotnet/dotnet.md b/references/dotnet/dotnet.md new file mode 100644 index 0000000..a7f1c54 --- /dev/null +++ b/references/dotnet/dotnet.md @@ -0,0 +1,202 @@ +# Temporal .NET SDK Reference + +## Overview + +The Temporal .NET SDK provides a high-performance, type-safe approach to building durable workflows using C# and .NET. Workflows use attributes (`[Workflow]`, `[WorkflowRun]`) and lambda expressions for type-safe invocations. Supports .NET Framework 4.6.2+ and .NET Core 3.1+ (including .NET 5+). + +**CRITICAL**: The .NET SDK has **no sandbox**. 
Developers must be careful to avoid non-deterministic code in workflows. See the Determinism Rules section below and `references/dotnet/determinism.md`. + +## Understanding Replay + +Temporal workflows are durable through history replay. For details on how this works, see `references/core/determinism.md`. + +## Quick Start + +**Add Dependency:** Install the Temporal SDK NuGet package: + +```bash +dotnet add package Temporalio +``` + +**Activities.cs** - Activity definitions (separate file for clarity): + +```csharp +using Temporalio.Activities; + +public class MyActivities +{ + [Activity] + public string Greet(string name) + { + return $"Hello, {name}!"; + } +} +``` + +**GreetingWorkflow.workflow.cs** - Workflow definition: + +```csharp +using Temporalio.Workflows; + +[Workflow] +public class GreetingWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync(string name) + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.Greet(name), + new() { StartToCloseTimeout = TimeSpan.FromSeconds(30) }); + } +} +``` + +**Worker (Program.cs)** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): + +```csharp +using Temporalio.Client; +using Temporalio.Worker; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + +using var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + .AddWorkflow<GreetingWorkflow>() + .AddAllActivities(new MyActivities())); + +await worker.ExecuteAsync(); +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `dotnet run` in the worker project. 
+ +**Starter (Program.cs)** - Start a workflow execution: + +```csharp +using Temporalio.Client; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + +var result = await client.ExecuteWorkflowAsync( + (GreetingWorkflow wf) => wf.RunAsync("my name"), + new(id: $"greeting-{Guid.NewGuid()}", taskQueue: "my-task-queue")); + +Console.WriteLine($"Result: {result}"); +``` + +**Run the workflow:** Run `dotnet run` in the starter project. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition + +- Use `[Workflow]` attribute on class +- Put any state initialization logic in the constructor of your workflow class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `[WorkflowInit]` attribute and parameters to your constructor. +- Use `[WorkflowRun]` on the async entry point method +- Must return `Task` or `Task<T>` +- Use `[WorkflowSignal]`, `[WorkflowQuery]`, `[WorkflowUpdate]` for handlers + +### Activity Definition + +- Use `[Activity]` attribute on methods +- Can be sync or async +- Instance methods support dependency injection +- Static methods are also supported + +### Worker Setup + +- Connect client, create `TemporalWorker` with workflows and activities +- Use `AddWorkflow<T>()` and `AddAllActivities(instance)` or `AddActivity(method)` + +### Determinism + +**Workflow code must be deterministic!** The .NET SDK has no sandbox. See the Determinism Rules section below and `references/core/determinism.md` and `references/dotnet/determinism.md`. + +## File Organization Best Practice + +**Keep Workflow definitions in separate files from Activity definitions.** While not as critical as Python (no sandbox reloading), separation improves clarity and testability. Use the `.workflow.cs` extension for workflow files so the `.editorconfig` overrides (see below) apply only to workflow code. 
+ +``` +MyTemporalApp/ +├── Workflows/ +│ └── GreetingWorkflow.workflow.cs # Only Workflow classes +├── Activities/ +│ └── TranslateActivities.cs # Only Activity classes +├── Models/ +│ └── OrderInput.cs # Shared data models +├── Worker/ +│ └── Program.cs # Worker setup +└── Starter/ + └── Program.cs # Client code to start workflows +``` + +## Workflow .editorconfig + +Workflow code violates some standard .NET analyzer rules. The recommended approach is to use the `.workflow.cs` file extension for workflow files and scope the overrides to that extension: + +```ini +# Configuration specific for Temporal workflows +[*.workflow.cs] + +# We use getters for queries, they cannot be properties +dotnet_diagnostic.CA1024.severity = none + +# Don't force workflows to have static methods +dotnet_diagnostic.CA1822.severity = none + +# Do not need ConfigureAwait for workflows +dotnet_diagnostic.CA2007.severity = none + +# Do not need task scheduler for workflows +dotnet_diagnostic.CA2008.severity = none + +# Workflow randomness is intentionally deterministic +dotnet_diagnostic.CA5394.severity = none + +# Allow async methods to not have await in them +dotnet_diagnostic.CS1998.severity = none + +# Don't force workflows to call async methods +dotnet_diagnostic.VSTHRD103.severity = none + +# Don't avoid, but rather encourage things using TaskScheduler.Current in workflows +dotnet_diagnostic.VSTHRD105.severity = none +``` + +## Determinism Rules + +The .NET SDK has **no sandbox** like Python or TypeScript. Developers must avoid non-deterministic operations manually. Many standard .NET `Task` APIs use `TaskScheduler.Default` implicitly, which breaks determinism. + +See `references/dotnet/determinism.md` for the full list of forbidden operations, safe alternatives, and best practices. See `references/dotnet/determinism-protection.md` for details on the runtime detection mechanism. + +## Common Pitfalls + +1. 
**Using `Task.Run` in workflows** — Uses default scheduler, breaks determinism. Use `Workflow.RunTaskAsync`. +2. **Using `Task.Delay` in workflows** — Uses system timer. Use `Workflow.DelayAsync`. +3. **`ConfigureAwait(false)` in workflows** — Leaves the deterministic scheduler. Never use in workflows. +4. **Non-`ApplicationFailureException` in workflows** — Other exceptions retry the workflow task forever instead of failing the workflow. +5. **Dictionary iteration in workflows** — `Dictionary` has no guaranteed order. Use `SortedDictionary`. +6. **Forgetting to heartbeat** — Long-running activities need `ActivityExecutionContext.Current.Heartbeat()` calls. +7. **Using `CancellationTokenSource.CancelAsync`** — Use `CancellationTokenSource.Cancel` instead. +8. **Logging with `Console.WriteLine` in workflows** — Use `Workflow.Logger` for replay-safe logging. + +## Writing Tests + +See `references/dotnet/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files + +- **`references/dotnet/patterns.md`** — Signals, queries, child workflows, saga pattern, etc. +- **`references/dotnet/determinism.md`** — Essentials of determinism in .NET +- **`references/dotnet/gotchas.md`** — .NET-specific mistakes and anti-patterns +- **`references/dotnet/error-handling.md`** — ApplicationFailureException, retry policies, non-retryable errors +- **`references/dotnet/observability.md`** — Logging, metrics, tracing +- **`references/dotnet/testing.md`** — WorkflowEnvironment, time-skipping, activity mocking +- **`references/dotnet/advanced-features.md`** — Schedules, worker tuning, dependency injection +- **`references/dotnet/data-handling.md`** — Data converters, payload encryption, etc. 
+- **`references/dotnet/versioning.md`** — Patching API, workflow type versioning, Worker Versioning +- **`references/dotnet/determinism-protection.md`** — Runtime task detection, .NET Task determinism rules diff --git a/references/dotnet/error-handling.md b/references/dotnet/error-handling.md new file mode 100644 index 0000000..f441620 --- /dev/null +++ b/references/dotnet/error-handling.md @@ -0,0 +1,157 @@ +# .NET SDK Error Handling + +## Overview + +The .NET SDK uses `ApplicationFailureException` for application-specific errors and provides comprehensive retry policy configuration. Generally, the following information about errors and retryability applies across activities, child workflows and Nexus operations. + +## Application Failures + +```csharp +using Temporalio.Activities; +using Temporalio.Exceptions; + +[Activity] +public async Task ValidateOrderAsync(Order order) +{ + if (!order.IsValid()) + { + throw new ApplicationFailureException( + "Invalid order", + errorType: "ValidationError"); + } +} +``` + +## Non-Retryable Errors + +```csharp +using Temporalio.Activities; +using Temporalio.Exceptions; + +[Activity] +public async Task ChargeCardAsync(ChargeCardInput input) +{ + if (!IsValidCard(input.CardNumber)) + { + throw new ApplicationFailureException( + "Permanent failure - invalid credit card", + errorType: "PaymentError", + nonRetryable: true); // Will not retry activity + } + return await ProcessPaymentAsync(input.CardNumber, input.Amount); +} +``` + +## Handling Activity Errors in Workflows + +```csharp +using Temporalio.Workflows; +using Temporalio.Exceptions; + +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + try + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.RiskyActivityAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + catch (ActivityFailureException ex) when (!TemporalException.IsCanceledException(ex)) + { + Workflow.Logger.LogError(ex, "Activity 
failed"); + throw new ApplicationFailureException( + "Workflow failed due to activity error"); + } + } +} +``` + +## Retry Configuration + +```csharp +using Temporalio.Common; +using Temporalio.Workflows; + +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(10), + RetryPolicy = new() + { + MaximumInterval = TimeSpan.FromMinutes(1), + MaximumAttempts = 5, + NonRetryableErrorTypes = new[] { "ValidationError", "PaymentError" }, + }, + }); + } +} +``` + +Only set options such as MaximumInterval, MaximumAttempts etc. if you have a domain-specific reason to. +If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(5), // Single attempt + ScheduleToCloseTimeout = TimeSpan.FromMinutes(30), // Including retries + HeartbeatTimeout = TimeSpan.FromMinutes(2), // Between heartbeats + }); + } +} +``` + +## Workflow Failure + +**Critical .NET behavior:** Only `ApplicationFailureException` will fail a workflow. All other exceptions (including standard .NET exceptions like `NullReferenceException`, `KeyNotFoundException`, etc.) will **retry the workflow task** indefinitely. This is by design — those are treated as bugs to be fixed with a code deployment, not reasons for the workflow to fail. 
+ +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync() + { + if (someCondition) + { + throw new ApplicationFailureException( + "Cannot process order", + errorType: "BusinessError"); + } + return "success"; + } +} +``` + +**Note:** Do not use `nonRetryable:` with `ApplicationFailureException` inside a workflow (as opposed to an activity). + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable in activities +3. Configure appropriate retry policies +4. Log errors before rethrowing +5. Use `ActivityFailureException` to catch activity failures in workflows +6. Design code to be idempotent for safe retries (see more at `references/core/patterns.md`) +7. Only throw `ApplicationFailureException` from workflows to fail them — other exceptions will retry the workflow task diff --git a/references/dotnet/gotchas.md b/references/dotnet/gotchas.md new file mode 100644 index 0000000..9b5806c --- /dev/null +++ b/references/dotnet/gotchas.md @@ -0,0 +1,262 @@ +# .NET Gotchas + +.NET-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts. + +## .NET Task Determinism + +The biggest .NET gotcha. Many `Task` APIs implicitly use `TaskScheduler.Default`, which breaks determinism. The SDK detects some of these at runtime via an `EventListener`, but not all. 
+ +### Task.Run + +```csharp +// BAD: Uses TaskScheduler.Default +await Task.Run(() => DoSomething()); + +// GOOD: Uses current (deterministic) scheduler +await Workflow.RunTaskAsync(() => DoSomething()); +``` + +### Task.Delay / Thread.Sleep + +```csharp +// BAD: Uses system timer +await Task.Delay(TimeSpan.FromMinutes(5)); + +// GOOD: Creates durable timer in event history +await Workflow.DelayAsync(TimeSpan.FromMinutes(5)); +``` + +### ConfigureAwait(false) + +```csharp +// BAD: Leaves the deterministic context +var result = await SomeCallAsync().ConfigureAwait(false); + +// GOOD: Stays on deterministic scheduler (or just omit ConfigureAwait) +var result = await SomeCallAsync().ConfigureAwait(true); +var result = await SomeCallAsync(); // Also fine +``` + +### Task.WhenAll / Task.WhenAny + +```csharp +// BAD: Potential non-determinism +await Task.WhenAll(task1, task2); +await Task.WhenAny(task1, task2); + +// GOOD: Deterministic wrappers +await Workflow.WhenAllAsync(task1, task2); +await Workflow.WhenAnyAsync(task1, task2); +``` + +### Threading Primitives + +```csharp +// BAD: System threading primitives +var mutex = new System.Threading.Mutex(); +var semaphore = new SemaphoreSlim(1); + +// GOOD: Temporal workflow-safe alternatives +var mutex = new Temporalio.Workflows.Mutex(); +var semaphore = new Temporalio.Workflows.Semaphore(1); +``` + +See `references/dotnet/determinism-protection.md` for the complete list. + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/dotnet/error-handling.md` to understand how to classify errors. 
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```csharp +// BAD: No heartbeat, can't detect stuck activities +[Activity] +public async Task ProcessLargeFileAsync(string path) +{ + foreach (var chunk in ReadChunks(path)) + await ProcessAsync(chunk); // Takes hours, no heartbeat +} +// GOOD: Regular heartbeats with progress +[Activity] +public async Task ProcessLargeFileAsync(string path) +{ + var chunks = ReadChunks(path); + for (var i = 0; i < chunks.Count; i++) + { + ActivityExecutionContext.Current.Heartbeat($"Processing chunk {i}"); + await ProcessAsync(chunks[i]); + } +} +``` + +### Heartbeat Timeout Too Short + +```csharp +// BAD: Heartbeat timeout shorter than processing time +await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessChunkAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(30), + HeartbeatTimeout = TimeSpan.FromSeconds(10), // Too short! + }); + +// GOOD: Heartbeat timeout allows for processing variance +await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessChunkAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(30), + HeartbeatTimeout = TimeSpan.FromMinutes(2), + }); +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```csharp +// BAD: Cleanup doesn't run on cancellation +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.AcquireResourceAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.DoWorkAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ReleaseResourceAsync(), // Never runs if cancelled! 
+ new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} + +// GOOD: Use try/finally for cleanup +[Workflow] +public class GoodWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.AcquireResourceAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + try + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.DoWorkAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + finally + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ReleaseResourceAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(5), + CancellationToken = CancellationToken.None, + }); + } + } +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: + +1. **Heartbeating** — Cancellation is delivered via heartbeat +2. **Checking the cancellation token** — Token is triggered when heartbeat detects cancellation + +```csharp +// BAD: Activity ignores cancellation +[Activity] +public async Task LongActivityAsync() +{ + await DoExpensiveWorkAsync(); // Runs to completion even if cancelled +} + +// GOOD: Heartbeat, check cancellation, and handle cleanup +[Activity] +public async Task LongActivityAsync() +{ + try + { + foreach (var item in items) + { + ActivityExecutionContext.Current.Heartbeat(); + ActivityExecutionContext.Current.CancellationToken.ThrowIfCancellationRequested(); + await ProcessAsync(item); + } + } + catch (OperationCanceledException) + { + await CleanupAsync(); + throw; + } +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/dotnet/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code. 
Please see `references/dotnet/testing.md` for more info. + +## Timers and Sleep + +### Using Task.Delay + +```csharp +// BAD: Task.Delay uses system timer, not deterministic during replay +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Task.Delay(TimeSpan.FromMinutes(1)); // SDK will detect and fail the task + } +} + +// GOOD: Use Workflow.DelayAsync for deterministic timers +[Workflow] +public class GoodWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.DelayAsync(TimeSpan.FromMinutes(1)); // Deterministic + } +} +``` + +**Why this matters:** `Task.Delay` uses the system clock, which differs between original execution and replay. `Workflow.DelayAsync` creates a durable timer in the event history, ensuring consistent behavior during replay. + +## Dictionary Iteration Order + +```csharp +// BAD: Dictionary iteration order is not guaranteed +var dict = new Dictionary<string, int> { ["b"] = 2, ["a"] = 1 }; +foreach (var kvp in dict) // Order may differ between executions! + await ProcessAsync(kvp.Key, kvp.Value); + +// GOOD: Use SortedDictionary or sort before iterating +var dict = new SortedDictionary<string, int> { ["b"] = 2, ["a"] = 1 }; +foreach (var kvp in dict) // Always iterates in key order + await ProcessAsync(kvp.Key, kvp.Value); +``` diff --git a/references/dotnet/observability.md b/references/dotnet/observability.md new file mode 100644 index 0000000..6919207 --- /dev/null +++ b/references/dotnet/observability.md @@ -0,0 +1,108 @@ +# .NET SDK Observability + +## Overview + +The .NET SDK provides observability through logging, metrics, and tracing using standard .NET patterns.
+ +## Logging + +### Workflow Logging (Replay-Safe) + +Use `Workflow.Logger` for replay-safe logging that avoids duplicate messages: + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string name) + { + Workflow.Logger.LogInformation("Workflow started for {Name}", name); + + var result = await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + Workflow.Logger.LogInformation("Activity completed with {Result}", result); + return result; + } +} +``` + +The workflow logger automatically: + +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) + +### Activity Logging + +Use `ActivityExecutionContext.Current.Logger` for context-aware activity logging: + +```csharp +[Activity] +public async Task ProcessOrderAsync(string orderId) +{ + var logger = ActivityExecutionContext.Current.Logger; + logger.LogInformation("Processing order {OrderId}", orderId); + + // Perform work... + + logger.LogInformation("Order processed successfully"); + return "completed"; +} +``` + +### Customizing Logger Configuration + +```csharp +using Microsoft.Extensions.Logging; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233") +{ + LoggerFactory = LoggerFactory.Create(builder => + builder + .AddSimpleConsole(options => options.TimestampFormat = "[HH:mm:ss] ") + .SetMinimumLevel(LogLevel.Information)), +}); +``` + +## Metrics + +### Enabling SDK Metrics + +Metrics are configured on `TemporalRuntime`. Create the runtime globally before any client/worker and set a Prometheus endpoint or custom metric meter. 
+ +```csharp +using Temporalio.Client; +using Temporalio.Runtime; + +// Create runtime with Prometheus endpoint +var runtime = new TemporalRuntime(new() +{ + Telemetry = new() { Metrics = new() { Prometheus = new("0.0.0.0:9000") } }, +}); + +// Use this runtime for all clients +var client = await TemporalClient.ConnectAsync( + new("localhost:7233") { Runtime = runtime }); +``` + +Alternatively, use `Temporalio.Extensions.DiagnosticSource` to bridge metrics to a .NET `System.Diagnostics.Metrics.Meter` for integration with OpenTelemetry or other .NET metrics pipelines. + +### Key SDK Metrics + +- `temporal_request` — Client requests to server +- `temporal_workflow_task_execution_latency` — Workflow task processing time +- `temporal_activity_execution_latency` — Activity execution time +- `temporal_workflow_task_replay_latency` — Replay duration + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/dotnet/data-handling.md` + +## Best Practices + +1. Use `Workflow.Logger` in workflows, `ActivityExecutionContext.Current.Logger` in activities +2. Don't use `Console.WriteLine` in workflows — it will produce duplicate output on replay +3. Configure metrics for production monitoring +4. 
Use Search Attributes for business-level visibility diff --git a/references/dotnet/patterns.md b/references/dotnet/patterns.md new file mode 100644 index 0000000..586fab0 --- /dev/null +++ b/references/dotnet/patterns.md @@ -0,0 +1,495 @@ +# .NET SDK Patterns + +## Signals + +```csharp +[Workflow] +public class OrderWorkflow +{ + private bool _approved; + private readonly List _items = new(); + + [WorkflowSignal] + public async Task ApproveAsync() + { + _approved = true; + } + + [WorkflowSignal] + public async Task AddItemAsync(string item) + { + _items.Add(item); + } + + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.WaitConditionAsync(() => _approved); + return $"Processed {_items.Count} items"; + } +} +``` + +## Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```csharp +[Workflow] +public class DynamicSignalWorkflow +{ + private readonly Dictionary> _signals = new(); + + [WorkflowSignal(Dynamic = true)] + public async Task HandleSignalAsync(string signalName, IRawValue[] args) + { + if (!_signals.ContainsKey(signalName)) + _signals[signalName] = new List(); + var value = Workflow.PayloadConverter.ToValue(args.Single()); + _signals[signalName].Add(value); + } + + [WorkflowRun] + public async Task>> RunAsync() + { + await Workflow.WaitConditionAsync(() => _signals.ContainsKey("done")); + return _signals; + } +} +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects. 
+ +```csharp +[Workflow] +public class StatusWorkflow +{ + private string _status = "pending"; + private int _progress; + + [WorkflowQuery] + public string GetStatus() => _status; + + [WorkflowQuery] + public int Progress => _progress; + + [WorkflowRun] + public async Task RunAsync() + { + _status = "running"; + for (var i = 0; i < 100; i++) + { + _progress = i; + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessItem(i), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(1) }); + } + _status = "completed"; + return "done"; + } +} +``` + +## Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. + +```csharp +[Workflow] +public class DynamicQueryWorkflow +{ + private readonly SortedDictionary _state = new() + { + ["status"] = "running", + ["progress"] = "0", + }; + + [WorkflowQuery(Dynamic = true)] + public string HandleQuery(string queryName, IRawValue[] args) + { + return _state.GetValueOrDefault(queryName, "unknown"); + } + + [WorkflowRun] + public async Task RunAsync() { /* ... */ } +} +``` + +## Updates + +```csharp +[Workflow] +public class OrderWorkflow +{ + private readonly List _items = new(); + + [WorkflowUpdate] + public async Task AddItemAsync(string item) + { + _items.Add(item); + return _items.Count; + } + + [WorkflowUpdateValidator(nameof(AddItemAsync))] + public void ValidateAddItem(string item) + { + if (string.IsNullOrEmpty(item)) + throw new ArgumentException("Item cannot be empty"); + if (_items.Count >= 100) + throw new InvalidOperationException("Order is full"); + } + + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.WaitConditionAsync(() => _items.Count > 0); + return $"Order with {_items.Count} items"; + } +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). 
They are read-only, similar to query handlers. Throw an exception to reject the update; return void to accept. + +## Child Workflows + +```csharp +[Workflow] +public class ParentWorkflow +{ + [WorkflowRun] + public async Task> RunAsync(List orders) + { + var results = new List(); + foreach (var order in orders) + { + var result = await Workflow.ExecuteChildWorkflowAsync( + (ProcessOrderWorkflow wf) => wf.RunAsync(order), + new() + { + Id = $"order-{order.Id}", + // Control what happens to child when parent completes + // Terminate (default), Abandon, RequestCancel + ParentClosePolicy = ParentClosePolicy.Abandon, + }); + results.Add(result); + } + return results; + } +} +``` + +## Handles to External Workflows + +```csharp +[Workflow] +public class CoordinatorWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string targetWorkflowId) + { + var handle = Workflow.GetExternalWorkflowHandle(targetWorkflowId); + + // Signal the external workflow + await handle.SignalAsync(wf => wf.DataReadyAsync(new DataPayload())); + + // Or cancel it + await handle.CancelAsync(); + } +} +``` + +## Parallel Execution + +```csharp +[Workflow] +public class ParallelWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string[] items) + { + var tasks = items.Select(item => + Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessItem(item), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) })); + + return await Workflow.WhenAllAsync(tasks); + } +} +``` + +## Deterministic Task Alternatives + +.NET `Task` APIs often use `TaskScheduler.Default` implicitly. 
Use Temporal's deterministic alternatives: + +```csharp +// Instead of Task.WhenAll: +await Workflow.WhenAllAsync(task1, task2, task3); + +// Instead of Task.WhenAny: +await Workflow.WhenAnyAsync(task1, task2); + +// Instead of Task.Run: +await Workflow.RunTaskAsync(() => SomeWork()); + +// Instead of Task.Delay: +await Workflow.DelayAsync(TimeSpan.FromMinutes(5)); + +// Instead of System.Threading.Mutex: +var mutex = new Temporalio.Workflows.Mutex(); +await mutex.WaitOneAsync(); +try { /* critical section */ } +finally { mutex.ReleaseMutex(); } + +// Instead of System.Threading.Semaphore: +var semaphore = new Temporalio.Workflows.Semaphore(3); +await semaphore.WaitAsync(); +try { /* limited concurrency section */ } +finally { semaphore.Release(); } +``` + +## Continue-as-New + +```csharp +[Workflow] +public class LongRunningWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync(WorkflowState state) + { + while (true) + { + state = await ProcessNextBatch(state); + + if (state.IsComplete) + return "done"; + + if (Workflow.ContinueAsNewSuggested) + throw Workflow.CreateContinueAsNewException( + (LongRunningWorkflow wf) => wf.RunAsync(state)); + } + } +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent — they may be retried (as with ALL activities). + +```csharp +[Workflow] +public class OrderSagaWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync(Order order) + { + var compensations = new List<Func<Task>>(); + + try + { + // IMPORTANT: Save compensation BEFORE calling the activity. + // If activity fails after completing but before returning, + // compensation must still be registered.
+ compensations.Add(() => Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ReleaseInventoryIfReservedAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) })); + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ReserveInventoryAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + compensations.Add(() => Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.RefundPaymentIfChargedAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) })); + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ChargePaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ShipOrderAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + return "Order completed"; + } + catch (Exception ex) + { + Workflow.Logger.LogError(ex, "Order failed, running compensations"); + compensations.Reverse(); + foreach (var compensate in compensations) + { + try { await compensate(); } + catch (Exception compErr) + { + Workflow.Logger.LogError(compErr, "Compensation failed"); + } + } + throw; + } + } +} +``` + +## Cancellation Handling (CancellationToken) + +.NET uses standard `CancellationToken` for workflow cancellation. + +```csharp +[Workflow] +public class CancellableWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + try + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.LongRunningAsync(), + new() { StartToCloseTimeout = TimeSpan.FromHours(1) }); + return "completed"; + } + catch (Exception e) when (TemporalException.IsCanceledException(e)) + { + // The "when" clause above is because we only want to apply the logic to cancellation, but + // this kind of cleanup could be done on any/all exceptions too. + Workflow.Logger.LogError(e, "Cancellation occurred, performing cleanup"); + + // Call cleanup activity. 
If this throws, it will swallow the original exception which we + // are ok with here. This could be changed to just log a failure and let the original + // cancellation continue. + // The default token on Workflow.CancellationToken is now marked + // cancelled, so we pass a different one. We use CancellationToken.None here because the + // cleanup activity itself doesn't need to be cancellable; if it did (e.g. you want to + // cancel cleanup from a timeout or another signal), create a new detached + // CancellationTokenSource and pass its Token instead. + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyCancellationCleanupActivity(), + new() + { + ScheduleToCloseTimeout = TimeSpan.FromMinutes(5), + CancellationToken = CancellationToken.None, + }); + + // Rethrow the cancellation + throw; + } + } +} +``` + +## Wait Condition with Timeout + +```csharp +[Workflow] +public class ApprovalWorkflow +{ + private bool _approved; + + [WorkflowSignal] + public async Task ApproveAsync() => _approved = true; + + [WorkflowRun] + public async Task RunAsync() + { + // Wait for approval with 24-hour timeout + var gotApproval = await Workflow.WaitConditionAsync( + () => _approved, + TimeSpan.FromHours(24)); + + return gotApproval ? "approved" : "auto-rejected due to timeout"; + } +} +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. However, making handlers non-async sometimes requires workarounds that add complexity. + +When async handlers are necessary, use `WaitConditionAsync(AllHandlersFinished)` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```csharp +[Workflow] +public class HandlerAwareWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // ... main workflow logic ... 
+ + // Before exiting, wait for all handlers to finish + await Workflow.WaitConditionAsync(() => Workflow.AllHandlersFinished); + return "done"; + } +} +``` + +## Activity Heartbeat Details + +### WHY: + +- **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** — Heartbeat details persist across retries + +### WHEN: + +- **Cancellable activities** — Any activity that should respond to cancellation +- **Long-running activities** — Track progress for resumability +- **Checkpointing** — Save progress periodically + +```csharp +[Activity] +public async Task<string> ProcessLargeFileAsync(string filePath) +{ + var info = ActivityExecutionContext.Current.Info; + // Get heartbeat details from previous attempt (if any) + var startLine = info.HeartbeatDetails.Count > 0 + ? await info.HeartbeatDetailAtAsync<int>(0) + : 0; + + var lines = await File.ReadAllLinesAsync(filePath); + for (var i = startLine; i < lines.Length; i++) + { + await ProcessLineAsync(lines[i]); + + // Heartbeat with progress + // If cancelled, CancellationToken will be triggered + ActivityExecutionContext.Current.Heartbeat(i + 1); + ActivityExecutionContext.Current.CancellationToken.ThrowIfCancellationRequested(); + } + + return "completed"; +} +``` + +## Timers + +```csharp +[Workflow] +public class TimerWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync() + { + await Workflow.DelayAsync(TimeSpan.FromHours(1)); + return "Timer fired"; + } +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are neither durable nor distributed in the way regular activities are.
+ +```csharp +[Workflow] +public class LocalActivityWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + var result = await Workflow.ExecuteLocalActivityAsync( + (MyActivities a) => a.QuickLookup("key"), + new() { StartToCloseTimeout = TimeSpan.FromSeconds(5) }); + return result; + } +} +``` diff --git a/references/dotnet/testing.md b/references/dotnet/testing.md new file mode 100644 index 0000000..d60805a --- /dev/null +++ b/references/dotnet/testing.md @@ -0,0 +1,177 @@ +# .NET SDK Testing + +## Overview + +You test Temporal .NET Workflows using the `Temporalio.Testing` namespace plus a normal .NET test framework. The .NET SDK is compatible with any testing framework; most samples use xUnit. The SDK provides `WorkflowEnvironment` for testing workflows in a local environment and `ActivityEnvironment` for isolated activity testing. + +## Test Environment Setup + +The core pattern is: + +1. Start a `WorkflowEnvironment` (`WorkflowEnvironment.StartLocalAsync()`). +2. Create a `TemporalWorker` in that environment with your Workflow and Activities registered. +3. Use the environment's client to execute the Workflow, using a fresh GUID for the task queue name and workflow ID. +4. Assert on the result or status. + +```csharp +using Temporalio.Testing; +using Temporalio.Worker; + +[Fact] +public async Task TestWorkflow() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + using var worker = new TemporalWorker( + env.Client, + new TemporalWorkerOptions($"task-queue-{Guid.NewGuid()}") + .AddWorkflow() + .AddAllActivities(new MyActivities())); + + await worker.ExecuteAsync(async () => + { + var result = await env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("input"), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + Assert.Equal("expected", result); + }); +} +``` + +Conveniently, the local `env` can be shared among tests, e.g. via a fixture class. 
+ +If your workflows / tests involve long durations (such as using Temporal timers / sleeps), then you can use the time-skipping environment, via `WorkflowEnvironment.StartTimeSkippingAsync()`. Only use time-skipping if you must. It is not thread safe and cannot be shared among tests. + +## Activity Mocking + +The .NET SDK provides a straightforward way to mock Activities. Create a mock function with the `[Activity]` attribute and specify the name of the original Activity you want to mock: + +```csharp +[Fact] +public async Task TestWithMockActivity() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + [Activity("MyActivity")] + static Task MockMyActivity(string input) => + Task.FromResult($"mocked: {input}"); + + using var worker = new TemporalWorker( + env.Client, + new TemporalWorkerOptions($"task-queue-{Guid.NewGuid()}") + .AddWorkflow() + .AddActivity(MockMyActivity)); + + await worker.ExecuteAsync(async () => + { + var result = await env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("test"), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + Assert.Equal("mocked: test", result); + }); +} +``` + +**Note:** If the original activity method name ends with `Async` and returns a `Task`, the default activity name has `Async` trimmed off. For example, `MyActivityAsync` has default name `MyActivity`. + +## Testing Signals and Queries + +```csharp +[Fact] +public async Task TestSignalsAndQueries() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + using var worker = new TemporalWorker(/* ... 
*/); + + await worker.ExecuteAsync(async () => + { + var handle = await env.Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync(), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + + // Send signal + await handle.SignalAsync(wf => wf.MySignalAsync("data")); + + // Query state + var status = await handle.QueryAsync(wf => wf.GetStatus()); + Assert.Equal("expected", status); + + // Wait for completion + var result = await handle.GetResultAsync(); + }); +} +``` + +## Testing Failure Cases + +```csharp +[Fact] +public async Task TestActivityFailureHandling() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + [Activity("RiskyActivity")] + static Task MockFailingActivity() => + throw new ApplicationFailureException("Simulated failure", nonRetryable: true); + + using var worker = new TemporalWorker(/* ... with mock activity */); + + await worker.ExecuteAsync(async () => + { + var ex = await Assert.ThrowsAsync(() => + env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync(), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!))); + }); +} +``` + +## Replay Testing + +```csharp +using Temporalio.Worker; + +[Fact] +public async Task TestReplay() +{ + var historyJson = await File.ReadAllTextAsync("example-history.json"); + var replayer = new WorkflowReplayer( + new WorkflowReplayerOptions() + .AddWorkflow()); + + await replayer.ReplayWorkflowAsync( + WorkflowHistory.FromJson("my-workflow-id", historyJson)); +} +``` + +## Activity Testing + +```csharp +using Temporalio.Testing; + +[Fact] +public async Task TestActivity() +{ + var env = new ActivityEnvironment(); + var activities = new MyActivities(); + var result = await env.RunAsync(() => activities.MyActivity("arg1")); + Assert.Equal("expected", result); +} +``` + +The `ActivityEnvironment` provides: + +- `Info` — Activity info, defaulted to basic values +- `CancellationTokenSource` — Token source for issuing cancellation +- `Heartbeater` 
— Callback invoked each heartbeat +- `Logger` — Activity logger + +## Best Practices + +1. Use the `WorkflowEnvironment.StartLocalAsync` environment for most testing +2. Use time-skipping environment for workflows with durable timers / durable sleeps +3. Mock external dependencies in activities +4. Test replay compatibility, especially when changing workflow code +5. Test signal/query handlers explicitly +6. Use unique workflow IDs and task queues per test to avoid conflicts — `Guid.NewGuid()` is easiest diff --git a/references/dotnet/versioning.md b/references/dotnet/versioning.md new file mode 100644 index 0000000..6371926 --- /dev/null +++ b/references/dotnet/versioning.md @@ -0,0 +1,307 @@ +# .NET SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## Patching API + +### The Patched() Method + +The `Workflow.Patched()` method checks whether a Workflow should run new or old code: + +```csharp +[Workflow] +public class ShippingWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + if (Workflow.Patched("send-email-instead-of-fax")) + { + // New code path + await Workflow.ExecuteActivityAsync( + (ShippingActivities a) => a.SendEmailAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + else + { + // Old code path (for replay of existing workflows) + await Workflow.ExecuteActivityAsync( + (ShippingActivities a) => a.SendFaxAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + } +} +``` + +**How it works:** + +- For new executions: `Patched()` returns `true` and records a marker in the Workflow history +- For replay with the marker: `Patched()` returns `true` (history includes this patch) +- For replay without the marker: `Patched()` returns `false` (history predates this patch) + +### Three-Step Patching Process + +**Warning:** Failing to follow this process correctly will result in non-determinism errors for in-flight workflows. 
+ +**Step 1: Patch in New Code** + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + if (Workflow.Patched("add-fraud-check")) + { + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + } + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +**Step 2: Deprecate the Patch** + +Once all pre-patch Workflow Executions have completed: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + Workflow.DeprecatePatch("add-fraud-check"); + + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +**Step 3: Remove the Patch** + +After all workflows with the deprecated patch marker have completed, remove the `DeprecatePatch()` call entirely: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +### Query Filters for Finding Workflows by Version + +Use List Filters to find workflows with specific patch versions: + +```bash +# Find running workflows with a specific patch +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND ExecutionStatus = "Running" AND 
TemporalChangeVersion = "add-fraud-check"' + +# Find running workflows without any patch (pre-patch versions) +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion IS NULL' +``` + +## Workflow Type Versioning + +For incompatible changes, create a new Workflow Type instead of using patches: + +```csharp +[Workflow("PizzaWorkflow")] +public class PizzaWorkflow +{ + [WorkflowRun] + public async Task RunAsync(PizzaOrder order) + { + return await ProcessOrderV1Async(order); + } +} + +[Workflow("PizzaWorkflowV2")] +public class PizzaWorkflowV2 +{ + [WorkflowRun] + public async Task RunAsync(PizzaOrder order) + { + return await ProcessOrderV2Async(order); + } +} +``` + +Register both with the Worker: + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("pizza-task-queue") + .AddWorkflow() + .AddWorkflow() + .AddAllActivities(new PizzaActivities())); +``` + +Update client code to start new workflows with the new type: + +```csharp +// Old workflows continue on PizzaWorkflow +// New workflows use PizzaWorkflowV2 +var handle = await client.StartWorkflowAsync( + (PizzaWorkflowV2 wf) => wf.RunAsync(order), + new(id: $"pizza-{order.Id}", taskQueue: "pizza-task-queue")); +``` + +Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "PizzaWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). 
+ +### Configuring Workers for Versioning + +```csharp +using Temporalio.Worker; + +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + DeploymentOptions = new WorkerDeploymentOptions( + DeploymentName: "my-service", + BuildId: Environment.GetEnvironmentVariable("BUILD_ID") ?? "dev"), + UseWorkerVersioning = true, + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +**Configuration parameters:** + +- `UseWorkerVersioning`: Enables Worker Versioning +- `DeploymentOptions`: Identifies the Worker Deployment Version (deployment name + build ID) +- Build ID: Typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version: + +```csharp +[Workflow(VersioningBehavior = VersioningBehavior.Pinned)] +public class StableWorkflow { /* ... */ } +``` + +**When to use PINNED:** + +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer versions: + +```csharp +[Workflow(VersioningBehavior = VersioningBehavior.AutoUpgrade)] +public class UpgradableWorkflow { /* ... */ } +``` + +**When to use AUTO_UPGRADE:** + +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using patching APIs for version transitions + +**Important:** AUTO_UPGRADE workflows still need patching to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + DeploymentOptions = new WorkerDeploymentOptions( + DeploymentName: "order-service", + BuildId: Environment.GetEnvironmentVariable("BUILD_ID") ?? "dev") + { + DefaultVersioningBehavior = VersioningBehavior.Pinned, + }, + UseWorkerVersioning = true, + } + .AddWorkflow() + .AddAllActivities(new OrderActivities())); +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: + +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: + +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Check for open executions** before removing old code paths +2. **Use descriptive patch IDs** that explain the change (e.g., "add-fraud-check" not "patch-1") +3. **Deploy patches incrementally**: patch, deprecate, remove +4. **Use PINNED for short workflows** to simplify version management +5. **Use AUTO_UPGRADE with patching** for long-running workflows that need updates +6. **Generate Build IDs from code** (git hash) to ensure changes produce new versions +7. 
**Avoid rolling deployments** for high-availability services with long-running workflows diff --git a/references/go/advanced-features.md b/references/go/advanced-features.md new file mode 100644 index 0000000..b64ce94 --- /dev/null +++ b/references/go/advanced-features.md @@ -0,0 +1,189 @@ +# Go SDK Advanced Features + +## Schedules + +Create recurring workflow executions using the Schedule API. + +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "daily-report", + Spec: client.ScheduleSpec{ + CronExpressions: []string{"0 9 * * *"}, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "daily-report-workflow", + Workflow: DailyReportWorkflow, + TaskQueue: "reports", + }, +}) +``` + +Using intervals instead of cron: + +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "hourly-sync", + Spec: client.ScheduleSpec{ + Intervals: []client.ScheduleIntervalSpec{ + {Every: time.Hour}, + }, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "hourly-sync-workflow", + Workflow: SyncWorkflow, + TaskQueue: "sync", + }, +}) +``` + +Manage schedules: + +```go +handle := c.ScheduleClient().GetHandle(ctx, "daily-report") + +// Pause / unpause +handle.Pause(ctx, client.SchedulePauseOptions{Note: "Maintenance window"}) +handle.Unpause(ctx, client.ScheduleUnpauseOptions{Note: "Maintenance complete"}) + +// Trigger immediately +handle.Trigger(ctx, client.ScheduleTriggerOptions{}) + +// Describe +desc, err := handle.Describe(ctx) + +// Delete +handle.Delete(ctx) +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a heartbeat_timeout on this activity, the external completer is responsible for sending heartbeats via the async handle. +If you do NOT set a heartbeat_timeout, no heartbeats are required. 
+ +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. + +**Step 1: Return `activity.ErrResultPending` from the activity.** + +```go +func RequestApproval(ctx context.Context, requestID string) (string, error) { + activityInfo := activity.GetInfo(ctx) + taskToken := activityInfo.TaskToken + + // Store taskToken externally (e.g., database) for later completion + err := storeTaskToken(requestID, taskToken) + if err != nil { + return "", err + } + + // Signal that this activity will be completed externally + return "", activity.ErrResultPending +} +``` + +**Step 2: Complete from another process using the task token.** + +```go +temporalClient, err := client.Dial(client.Options{}) + +// Complete the activity +err = temporalClient.CompleteActivity(ctx, taskToken, "approved", nil) + +// Or fail it +err = temporalClient.CompleteActivity(ctx, taskToken, nil, errors.New("rejected")) +``` + +Or complete by ID (no task token needed): + +```go +err = temporalClient.CompleteActivityByID(ctx, namespace, workflowID, runID, activityID, "approved", nil) +``` + +## Worker Tuning + +Configure `worker.Options` for production workloads: + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + // Max concurrent activity executions (default: 1000) + MaxConcurrentActivityExecutionSize: 500, + + // Max concurrent workflow task executions (default: 1000) + MaxConcurrentWorkflowTaskExecutionSize: 500, + + // Max concurrent activity task pollers (default: 2) + MaxConcurrentActivityTaskPollers: 4, + + // Max concurrent workflow task pollers (default: 2) + MaxConcurrentWorkflowTaskPollers: 4, + + // Graceful shutdown timeout (default: 0) + WorkerStopTimeout: 30 * time.Second, +}) +``` + +Scale pollers based on task queue throughput. 
If you observe high schedule-to-start latency, increase the number of pollers or add more workers. + +## Sessions + +Go-specific feature for routing multiple activities to the same worker. All activities using the session context execute on the same worker host. + +**Enable on the worker:** + +```go +w := worker.New(c, "fileprocessing", worker.Options{ + EnableSessionWorker: true, + MaxConcurrentSessionExecutionSize: 100, // default: 1000 +}) +``` + +**Use in a workflow:** + +```go +func FileProcessingWorkflow(ctx workflow.Context, file FileParam) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + sessionCtx, err := workflow.CreateSession(ctx, &workflow.SessionOptions{ + CreationTimeout: time.Minute, + ExecutionTimeout: 10 * time.Minute, + }) + if err != nil { + return err + } + defer workflow.CompleteSession(sessionCtx) + + // All three activities run on the same worker + var downloadResult string + err = workflow.ExecuteActivity(sessionCtx, DownloadFile, file.URL).Get(sessionCtx, &downloadResult) + if err != nil { + return err + } + + var processResult string + err = workflow.ExecuteActivity(sessionCtx, ProcessFile, downloadResult).Get(sessionCtx, &processResult) + if err != nil { + return err + } + + err = workflow.ExecuteActivity(sessionCtx, UploadFile, processResult).Get(sessionCtx, nil) + return err +} +``` + +Key points: + +- `workflow.ErrSessionFailed` is returned if the worker hosting the session dies +- `CompleteSession` releases resources -- always call it (use `defer`) +- Use case: file processing (download, process, upload on same host), GPU workloads, or any pipeline needing local state +- `MaxConcurrentSessionExecutionSize` on `worker.Options` limits how many sessions a single worker can handle + +**Limitations:** + +- Sessions do not survive worker process restarts — if the worker dies, the session fails and activities must be retried from the workflow level +- There is 
no server-side support for sessions — the Go SDK implements them entirely client-side using internal task queue routing +- Session concurrency limiting is per-process, not per-host — run only one worker process per host if you rely on this + +**Relationship to worker-specific task queues:** Sessions are essentially a convenience API over the "worker-specific task queue" pattern, where each worker creates a unique task queue and routes activities to it. For simple cases where you don't need separate activities (e.g., download + process + upload can be one unit), consider using a single long-running activity with heartbeating instead. diff --git a/references/go/data-handling.md b/references/go/data-handling.md new file mode 100644 index 0000000..18ccf57 --- /dev/null +++ b/references/go/data-handling.md @@ -0,0 +1,264 @@ +# Go SDK Data Handling + +## Overview + +The Go SDK uses the `converter.DataConverter` interface to serialize/deserialize workflow inputs, outputs, and activity parameters. The default converter converts values to JSON. + +## Default Data Converter + +The default `CompositeDataConverter` applies converters in order until one returns a non-nil Payload: + +1. `converter.NewNilPayloadConverter()` -- nil values +2. `converter.NewByteSlicePayloadConverter()` -- `[]byte` +3. `converter.NewProtoJSONPayloadConverter()` -- Protobuf messages as JSON +4. `converter.NewProtoPayloadConverter()` -- Protobuf messages as binary +5. `converter.NewJSONPayloadConverter()` -- anything JSON-serializable + +Structs must have exported fields to be serialized. + +## Custom Data Converter + +In most cases you don't implement the full `DataConverter` interface directly. Instead, implement a **`PayloadConverter`** for your specific type and insert it into a `CompositeDataConverter`. 
The `PayloadConverter` interface has four methods: + +```go +type PayloadConverter interface { + ToPayload(value interface{}) (*commonpb.Payload, error) // return nil if this type isn't handled + FromPayload(payload *commonpb.Payload, valuePtr interface{}) error + ToString(payload *commonpb.Payload) string + Encoding() string // e.g. "binary/msgpack" +} +``` + +**Example — custom msgpack PayloadConverter:** + +```go +import ( + "encoding/json" + "fmt" + + commonpb "go.temporal.io/api/common/v1" + "go.temporal.io/sdk/converter" + "github.com/vmihailenco/msgpack/v5" +) + +const encodingMsgpack = "binary/msgpack" + +type MsgpackPayloadConverter struct{} + +func (c *MsgpackPayloadConverter) Encoding() string { + return encodingMsgpack +} + +func (c *MsgpackPayloadConverter) ToPayload(value interface{}) (*commonpb.Payload, error) { + if value == nil { + return nil, nil + } + data, err := msgpack.Marshal(value) + if err != nil { + return nil, fmt.Errorf("msgpack marshal: %w", err) + } + return &commonpb.Payload{ + Metadata: map[string][]byte{ + converter.MetadataEncoding: []byte(encodingMsgpack), + }, + Data: data, + }, nil +} + +func (c *MsgpackPayloadConverter) FromPayload(payload *commonpb.Payload, valuePtr interface{}) error { + if string(payload.GetMetadata()[converter.MetadataEncoding]) != encodingMsgpack { + return fmt.Errorf("unsupported encoding") + } + return msgpack.Unmarshal(payload.Data, valuePtr) +} + +func (c *MsgpackPayloadConverter) ToString(payload *commonpb.Payload) string { + // Decode to a map for human-readable display + var v interface{} + if err := msgpack.Unmarshal(payload.Data, &v); err != nil { + return fmt.Sprintf("msgpack decode error: %v", err) + } + b, _ := json.Marshal(v) + return string(b) +} +``` + +**Register in a CompositeDataConverter and pass to the client:** + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + &MsgpackPayloadConverter{}, // handles your type; falls 
through to JSON for everything else + converter.NewJSONPayloadConverter(), +) + +c, err := client.Dial(client.Options{ + DataConverter: dataConverter, +}) +``` + +**Per-activity/child-workflow override** — use a different converter for specific calls: + +```go +actCtx := workflow.WithDataConverter(ctx, mySpecialConverter) +workflow.ExecuteActivity(actCtx, SensitiveActivity, input) +``` + +**Note:** If your converter makes remote calls (e.g., to a KMS for encryption), wrap it with `workflow.DataConverterWithoutDeadlockDetection` to avoid deadlock detection timeouts in workflow code. + +## Composition of Payload Converters + +Use `converter.NewCompositeDataConverter` to chain type-specific converters. The first converter that can handle the type wins. + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + converter.NewProtoJSONPayloadConverter(), + converter.NewProtoPayloadConverter(), + YourCustomPayloadConverter(), + converter.NewJSONPayloadConverter(), +) +``` + +## Protobuf Support + +Binary protobuf: + +```go +converter.NewProtoPayloadConverter() +``` + +JSON protobuf: + +```go +converter.NewProtoJSONPayloadConverter() +``` + +Both are included in the default data converter. SDK v1.26.0 (March 2024) migrated from gogo/protobuf to google/protobuf. If you need backward compatibility with older payloads encoded with gogo, use the `LegacyTemporalProtoCompat` option. + +## Payload Encryption + +Implement the `converter.PayloadCodec` interface (`Encode` and `Decode`) and wrap the default data converter: + +```go +// Codec implements converter.PayloadCodec for encryption. 
+type Codec struct{} + +func (Codec) Encode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + origBytes, err := p.Marshal() + if err != nil { + return payloads, err + } + encrypted := encrypt(origBytes) // your encryption logic + result[i] = &commonpb.Payload{ + Metadata: map[string][]byte{converter.MetadataEncoding: []byte("binary/encrypted")}, + Data: encrypted, + } + } + return result, nil +} + +func (Codec) Decode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + if string(p.Metadata[converter.MetadataEncoding]) != "binary/encrypted" { + result[i] = p + continue + } + decrypted := decrypt(p.Data) // your decryption logic + result[i] = &commonpb.Payload{} + err := result[i].Unmarshal(decrypted) + if err != nil { + return payloads, err + } + } + return result, nil +} +``` + +Wrap with `CodecDataConverter` and pass to client: + +```go +var DataConverter = converter.NewCodecDataConverter( + converter.GetDefaultDataConverter(), + &Codec{}, +) + +c, err := client.Dial(client.Options{ + DataConverter: DataConverter, +}) +``` + +## Search Attributes + +Set at workflow start: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + SearchAttributes: map[string]interface{}{ + "OrderStatus": "pending", + "CustomerId": "cust-456", + }, +}, OrderWorkflow, input) +``` + +Upsert from within a workflow: + +```go +err := workflow.UpsertSearchAttributes(ctx, map[string]interface{}{ + "OrderStatus": "completed", +}) +``` + +Typed search attributes (v1.26.0+, preferred): + +```go +var OrderStatusKey = temporal.NewSearchAttributeKeyKeyword("OrderStatus") + +err := workflow.UpsertTypedSearchAttributes(ctx, OrderStatusKey.ValueSet("completed")) +``` + +Query workflows by search attributes: + +```go +resp, err := 
c.ListWorkflow(ctx, &workflowservice.ListWorkflowExecutionsRequest{ + Query: `OrderStatus = "pending" AND CustomerId = "cust-456"`, +}) +``` + +## Workflow Memo + +Set in start options: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + Memo: map[string]interface{}{ + "customerName": "Alice", + "notes": "Priority customer", + }, +}, OrderWorkflow, input) +``` + +Read memo from workflow info. Upsert memo (Go SDK only): + +```go +err := workflow.UpsertMemo(ctx, map[string]interface{}{ + "notes": "Updated notes", +}) +``` + +## Best Practices + +1. Use structs with exported fields for inputs and outputs +2. Prefer JSON for readability during development, protobuf for performance in production +3. Keep payloads small -- see `references/core/gotchas.md` for limits +4. Use `PayloadCodec` for encryption; never store sensitive data unencrypted +5. Configure the same data converter on both client and worker diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md new file mode 100644 index 0000000..2cdd829 --- /dev/null +++ b/references/go/determinism-protection.md @@ -0,0 +1,103 @@ +# Go Workflow Determinism Protection + +## Overview + +The Go SDK has no runtime sandbox (only Python and TypeScript have sandboxing). Determinism is enforced by **developer convention** and **optional static analysis**. The Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). + +## workflowcheck Static Analysis + +### Install + +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +``` + +### Run + +```bash +workflowcheck ./... +``` + +No output means all registered workflows are deterministic. 
Non-deterministic code produces hierarchical output showing the call chain to the offending code. + +Use `-show-pos` for exact file positions: + +```bash +workflowcheck -show-pos ./... +``` + +### What It Detects + +**Non-deterministic functions/variables:** + +- `time.Now` -- obtaining current time +- `time.Sleep` -- sleeping +- `crypto/rand.Reader` -- crypto random reader +- `math/rand.globalRand` -- global pseudorandom +- `os.Stdin`, `os.Stdout`, `os.Stderr` -- standard I/O streams + +**Non-deterministic Go constructs:** + +- Starting a goroutine (`go func()`) +- Sending to a channel +- Receiving from a channel +- Iterating over a channel via `range` +- Iterating over a map via `range` + +### Limitations + +`workflowcheck` cannot catch everything. It does **not** detect: + +- Global variable mutation +- Non-determinism via reflection +- Runtime-conditional non-determinism + +### Suppressing False Positives + +Add `//workflowcheck:ignore` on or directly above the offending line: + +```go +now := time.Now() //workflowcheck:ignore +``` + +For broader suppression, use a YAML config file: + +```yaml +# workflowcheck.config.yaml +decls: + path/to/package.MyDeterministicFunc: false +``` + +```bash +workflowcheck -config workflowcheck.config.yaml ./... +``` + +## Determinism Rules + +**You must:** + +- Use `workflow.Go(ctx, func(ctx workflow.Context) { ... 
})` instead of `go` +- Use `workflow.NewChannel(ctx)` instead of `chan` +- Use `workflow.NewSelector(ctx)` instead of `select` +- Use `workflow.Sleep(ctx, duration)` instead of `time.Sleep()` +- Use `workflow.Now(ctx)` instead of `time.Now()` +- Use `workflow.GetLogger(ctx)` instead of `fmt.Println` / `log.Println` +- Sort map keys before iterating, or use `workflow.SideEffect` / an activity + +**You must not:** + +- Start native goroutines +- Use native channels or `select` +- Call `time.Now()` or `time.Sleep()` +- Use `math/rand` global functions or `crypto/rand.Reader` +- Access `os.Stdin`, `os.Stdout`, or `os.Stderr` +- Mutate global variables +- Make network calls, file I/O, or database queries (use activities) + +## Best Practices + +1. **Run `workflowcheck` in CI / pre-commit** -- catch non-deterministic code before it reaches production +2. **Keep workflow code thin** -- workflows should orchestrate; delegate all I/O and non-deterministic work to activities +3. **Use struct methods for activities** -- keeps imports clean and avoids pulling non-deterministic dependencies into workflow files +4. **Separate workflow and activity files** -- reduces the surface area that `workflowcheck` needs to analyze and keeps concerns isolated +5. **Test with replay** after any workflow code change to verify backward compatibility diff --git a/references/go/determinism.md b/references/go/determinism.md new file mode 100644 index 0000000..c8b52b9 --- /dev/null +++ b/references/go/determinism.md @@ -0,0 +1,52 @@ +# Go SDK Determinism + +## Overview + +The Go SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be deterministic for replay, and determinism is enforced entirely by developer convention and optional static analysis via the `workflowcheck` tool (see `references/go/determinism-protection.md`). + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. 
When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. + +## Forbidden Operations in Workflows + +Do not use any of the following in workflow code (they are appropriate to use in activities): + +- **Native goroutines** (`go func()`) -- use `workflow.Go()` instead +- **Native channels** (`chan`, send, receive, `range` over channel) -- use `workflow.Channel` instead +- **Native `select`** -- use `workflow.Selector` instead +- **`time.Now()`** -- use `workflow.Now(ctx)` instead +- **`time.Sleep()`** -- use `workflow.Sleep(ctx, duration)` instead +- **`math/rand` global** (e.g., `rand.Intn()`) -- use `workflow.SideEffect` instead +- **`crypto/rand.Reader`** -- use an activity instead +- **`os.Stdin` / `os.Stdout` / `os.Stderr`** -- use `workflow.GetLogger(ctx)` for logging +- **Map range iteration** (`for k, v := range myMap`) -- sort keys first, then iterate +- **Mutating global variables** -- use local state or `workflow.SideEffect` +- **Anonymous functions as local activities** -- the name is derived from the function and will be non-deterministic across replays; always use named functions for local activities + +## Safe Builtin Alternatives + +| Instead of | Use | +|---|---| +| `go func() { ... }()` | `workflow.Go(ctx, func(ctx workflow.Context) { ... })` | +| `chan T` | `workflow.NewChannel(ctx)` / `workflow.NewBufferedChannel(ctx, size)` | +| `select { ... 
}` | `workflow.NewSelector(ctx)` | +| `time.Now()` | `workflow.Now(ctx)` | +| `time.Sleep(d)` | `workflow.Sleep(ctx, d)` | +| `rand.Intn(100)` | `workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { return rand.Intn(100) })` | +| `uuid.New()` | `workflow.SideEffect` or pass as activity result | +| `log.Println(...)` | `workflow.GetLogger(ctx).Info(...)` | + +## Testing Replay Compatibility + +Use `worker.WorkflowReplayer` to verify code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/go/testing.md`. + +## Best Practices + +1. Run `workflowcheck ./...` in CI to catch non-deterministic code early +2. Always use `workflow.*` APIs instead of native Go concurrency and time primitives +3. Move all I/O operations (network, filesystem, database) into activities +4. Sort map keys before iterating if you must iterate over a map in workflow code +5. Use `workflow.GetLogger(ctx)` instead of `fmt.Println` or `log.Println` for replay-safe logging +6. Keep workflow code focused on orchestration; delegate non-deterministic work to activities +7. Test with replay after making changes to workflow definitions diff --git a/references/go/error-handling.md b/references/go/error-handling.md new file mode 100644 index 0000000..92a856b --- /dev/null +++ b/references/go/error-handling.md @@ -0,0 +1,184 @@ +# Go SDK Error Handling + +## Overview + +The Go SDK uses error return values (not exceptions). All Temporal errors implement the `error` interface. Activity errors returned to workflows are wrapped in `*temporal.ActivityError`; use `errors.As` to unwrap them. 
+ +## Application Errors + +```go +import "go.temporal.io/sdk/temporal" + +func ValidateOrder(ctx context.Context, order Order) error { + if !order.IsValid() { + return temporal.NewApplicationError( + "Invalid order", + "ValidationError", + ) + } + return nil +} +``` + +`temporal.NewApplicationError(message, errType, details...)` creates a retryable `*temporal.ApplicationError`. Use `NewApplicationErrorWithCause` to include a wrapped cause. + +## Non-Retryable Errors + +```go +func ChargeCard(ctx context.Context, input ChargeCardInput) (string, error) { + if !isValidCard(input.CardNumber) { + return "", temporal.NewNonRetryableApplicationError( + "Permanent failure - invalid credit card", + "PaymentError", + nil, // cause + ) + } + return processPayment(input.CardNumber, input.Amount) +} +``` + +`temporal.NewNonRetryableApplicationError(message, errType, cause, details...)` is always non-retryable regardless of RetryPolicy. You can also mark error types as non-retryable in the RetryPolicy instead: + +```go +RetryPolicy: &temporal.RetryPolicy{ + NonRetryableErrorTypes: []string{"PaymentError", "ValidationError"}, +}, +``` + +## Handling Activity Errors in Workflows + +```go +import ( + "errors" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func MyWorkflow(ctx workflow.Context) (string, error) { + var result string + err := workflow.ExecuteActivity(ctx, RiskyActivity).Get(ctx, &result) + if err != nil { + var applicationErr *temporal.ApplicationError + if errors.As(err, &applicationErr) { + switch applicationErr.Type() { + case "ValidationError": + // handle validation error + case "PaymentError": + // handle payment error + default: + // handle unknown error type + } + } + + var timeoutErr *temporal.TimeoutError + if errors.As(err, &timeoutErr) { + switch timeoutErr.TimeoutType() { + case enumspb.TIMEOUT_TYPE_START_TO_CLOSE: + // handle start-to-close timeout + case enumspb.TIMEOUT_TYPE_HEARTBEAT: + // handle heartbeat timeout + } + } + + 
var canceledErr *temporal.CanceledError + if errors.As(err, &canceledErr) { + // handle cancellation + } + + var panicErr *temporal.PanicError + if errors.As(err, &panicErr) { + // panicErr.Error() and panicErr.StackTrace() + } + + return "", err + } + return result, nil +} +``` + +## Retry Configuration + +```go +import ( + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func MyWorkflow(ctx workflow.Context) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Minute, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: time.Second, + BackoffCoefficient: 2.0, + MaximumInterval: time.Minute, + MaximumAttempts: 5, + NonRetryableErrorTypes: []string{"ValidationError", "PaymentError"}, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + return workflow.ExecuteActivity(ctx, MyActivity).Get(ctx, nil) +} +``` + +Only set options such as `MaximumInterval`, `MaximumAttempts`, etc. if you have a domain-specific reason to. If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, // Single attempt max duration + ScheduleToCloseTimeout: 30 * time.Minute, // Total time including retries + ScheduleToStartTimeout: 10 * time.Minute, // Time waiting in task queue + HeartbeatTimeout: 2 * time.Minute, // Between heartbeats +} +ctx = workflow.WithActivityOptions(ctx, ao) +``` + +- **StartToCloseTimeout**: Max time for a single Activity Task Execution. Prefer this over ScheduleToCloseTimeout. +- **ScheduleToCloseTimeout**: Total time including retries. +- **ScheduleToStartTimeout**: Time an Activity Task can wait in the Task Queue before a Worker picks it up. Rarely needed. +- **HeartbeatTimeout**: Max time between heartbeats. Required for long-running activities to detect failures. + +Either `StartToCloseTimeout` or `ScheduleToCloseTimeout` must be set. 
+ +## Workflow Failure + +Returning any error from a workflow function fails the execution. Return `nil` for success. + +**Important Go-specific behavior:** In the Go SDK, returning any error from a workflow fails the workflow execution by default — there is no automatic retry. This differs from other SDKs (Python, TypeScript) where non-`ApplicationError` exceptions cause the workflow task to retry indefinitely. In Go, if you want workflow-level retries, you must explicitly set a `RetryPolicy` on the `StartWorkflowOptions`. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + if someCondition { + return "", temporal.NewApplicationError( + "Cannot process order", + "BusinessError", + ) + } + return "success", nil +} +``` + +To prevent workflow retry, return a non-retryable error: + +```go +return "", temporal.NewNonRetryableApplicationError( + "Unrecoverable failure", + "FatalError", + nil, +) +``` + +**Note:** If an activity returns a non-retryable error, the workflow receives an `*temporal.ActivityError` wrapping it. To fail the workflow without retry, wrap it in a new `NewNonRetryableApplicationError`. + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Set appropriate timeouts; prefer `StartToCloseTimeout` over `ScheduleToCloseTimeout` +4. Let Temporal handle retries via RetryPolicy rather than implementing retry logic yourself +5. Use `errors.As` to unwrap and inspect specific error types +6. Design activities to be idempotent for safe retries (see `references/core/patterns.md`) diff --git a/references/go/go.md b/references/go/go.md new file mode 100644 index 0000000..6c42bed --- /dev/null +++ b/references/go/go.md @@ -0,0 +1,254 @@ +# Temporal Go SDK Reference + +## Overview + +The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic Go approach to building durable workflows. Workflows are regular exported Go functions. 
+ +## Quick Start + +**Add Dependency:** In your Go module, add the Temporal SDK: + +```bash +go get go.temporal.io/sdk +``` + +**workflows/greeting.go** - Workflow definition: + +```go +package workflows + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func GreetingWorkflow(ctx workflow.Context, name string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, "Greet", name).Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` + +**activities/greet.go** - Activity definition: + +```go +package activities + +import ( + "context" + "fmt" +) + +type Activities struct{} + +func (a *Activities) Greet(ctx context.Context, name string) (string, error) { + return fmt.Sprintf("Hello, %s!", name), nil +} +``` + +**worker/main.go** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): + +```go +package main + +import ( + "log" + + "yourmodule/activities" + "yourmodule/workflows" + + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/worker" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + w := worker.New(c, "my-task-queue", worker.Options{}) + + w.RegisterWorkflow(workflows.GreetingWorkflow) + w.RegisterActivity(&activities.Activities{}) + + err = w.Run(worker.InterruptCh()) + if err != nil { + log.Fatalln("Unable to start worker", err) + } +} +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `go run worker/main.go` in the background. 
+ +**starter/main.go** - Start a workflow execution: + +```go +package main + +import ( + "context" + "fmt" + "log" + + "yourmodule/workflows" + + "github.com/google/uuid" + "go.temporal.io/sdk/client" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + options := client.StartWorkflowOptions{ + ID: uuid.NewString(), + TaskQueue: "my-task-queue", + } + + we, err := c.ExecuteWorkflow(context.Background(), options, workflows.GreetingWorkflow, "my name") + if err != nil { + log.Fatalln("Unable to execute workflow", err) + } + + var result string + err = we.Get(context.Background(), &result) + if err != nil { + log.Fatalln("Unable to get workflow result", err) + } + + fmt.Println("Result:", result) +} +``` + +**Run the workflow:** Run `go run starter/main.go`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition + +- Exported function with `workflow.Context` as the first parameter +- Returns `(ResultType, error)` or just `error` +- Signature: `func MyWorkflow(ctx workflow.Context, input MyInput) (MyOutput, error)` +- Use `workflow.SetQueryHandler()`, `workflow.SetUpdateHandler()` for handlers +- Register with `w.RegisterWorkflow(MyWorkflow)` + +### Activity Definition + +- Regular function or struct methods with `context.Context` as the first parameter +- Struct methods are preferred for dependency injection +- Signature: `func (a *Activities) MyActivity(ctx context.Context, input string) (string, error)` +- Register struct with `w.RegisterActivity(&Activities{})` (registers all exported methods) + +### Worker Setup + +- Create client with `client.Dial(client.Options{})` +- Create worker with `worker.New(c, "task-queue", worker.Options{})` +- Register workflows and activities +- Run with `w.Run(worker.InterruptCh())` + +### Determinism + +**Workflow code must be deterministic!** The Go SDK has no sandbox -- determinism is enforced by 
convention and tooling. + +Use Temporal replacements instead of native Go constructs: + +- `workflow.Go()` instead of `go` (goroutines) +- `workflow.Channel` instead of `chan` +- `workflow.Selector` instead of `select` +- `workflow.Sleep()` instead of `time.Sleep()` +- `workflow.Now()` instead of `time.Now()` +- `workflow.GetLogger()` instead of `log` / `fmt.Println` for replay-safe logging + +Use the **`workflowcheck`** static analysis tool to catch non-deterministic code: + +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +workflowcheck ./... +``` + +Read `references/core/determinism.md` and `references/go/determinism.md` to understand more. + +## File Organization Best Practice + +**Use separate packages for workflows, activities, and worker.** Activities as struct methods enable dependency injection at the worker level. + +``` +myapp/ +├── workflows/ +│ └── greeting.go # Only Workflow functions +├── activities/ +│ └── greet.go # Activity struct and methods +├── worker/ +│ └── main.go # Worker setup, imports both +└── starter/ + └── main.go # Client code to start workflows +``` + +**Activities as struct methods for dependency injection:** + +```go +// activities/greet.go +type Activities struct { + HTTPClient *http.Client + DB *sql.DB +} + +func (a *Activities) FetchData(ctx context.Context, url string) (string, error) { + // Use a.HTTPClient, a.DB, etc. +} +``` + +```go +// worker/main.go - inject dependencies at worker startup +activities := &activities.Activities{ + HTTPClient: http.DefaultClient, + DB: db, +} +w.RegisterActivity(activities) +``` + +## Common Pitfalls + +1. **Using native goroutines/channels/select** - Use `workflow.Go()`, `workflow.Channel`, `workflow.Selector` +2. **Using `time.Sleep` or `time.Now`** - Use `workflow.Sleep()` and `workflow.Now()` +3. **Iterating over maps with `range`** - Map iteration order is non-deterministic; sort keys first +4. 
**Forgetting to register workflows/activities** - Worker will fail tasks for unregistered types +5. **Registering activity functions instead of struct** - Use `w.RegisterActivity(&Activities{})` not `w.RegisterActivity(a.MyMethod)` +6. **Forgetting to heartbeat** - Long-running activities need `activity.RecordHeartbeat(ctx, details)` +7. **Using `fmt.Println` in workflows** - Use `workflow.GetLogger(ctx)` for replay-safe logging +8. **Not setting Activity timeouts** - `StartToCloseTimeout` or `ScheduleToCloseTimeout` is required in `ActivityOptions` + +## Writing Tests + +See `references/go/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files + +- **`references/go/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/go/determinism.md`** - Determinism rules, workflowcheck tool, safe alternatives +- **`references/go/gotchas.md`** - Go-specific mistakes and anti-patterns +- **`references/go/error-handling.md`** - ApplicationError, retry policies, non-retryable errors +- **`references/go/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/go/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/go/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/go/data-handling.md`** - Data converters, payload codecs, encryption +- **`references/go/versioning.md`** - Patching API (`workflow.GetVersion`), Worker Versioning +- **`references/go/determinism-protection.md`** - Information on **`workflowcheck`** tool to help statically check for determinism issues. diff --git a/references/go/gotchas.md b/references/go/gotchas.md new file mode 100644 index 0000000..6ba46ff --- /dev/null +++ b/references/go/gotchas.md @@ -0,0 +1,291 @@ +# Go Gotchas + +Go-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts. 
+ +## Goroutines and Concurrency + +### Using Native Go Concurrency Primitives + +**The Problem**: Native `go`, `chan`, and `select` are non-deterministic and will cause replay failures. + +```go +// BAD - Native goroutine +func MyWorkflow(ctx workflow.Context) error { + go func() { // Non-deterministic! + // do work + }() + return nil +} + +// GOOD - Use workflow.Go +func MyWorkflow(ctx workflow.Context) error { + workflow.Go(ctx, func(gCtx workflow.Context) { + // do work + }) + return nil +} +``` + +```go +// BAD - Native channel +func MyWorkflow(ctx workflow.Context) error { + ch := make(chan string) // Non-deterministic! + return nil +} + +// GOOD - Use workflow.Channel +func MyWorkflow(ctx workflow.Context) error { + ch := workflow.NewChannel(ctx) + return nil +} +``` + +```go +// BAD - Native select +select { +case val := <-ch1: + // handle +case val := <-ch2: + // handle +} + +// GOOD - Use workflow.Selector +selector := workflow.NewSelector(ctx) +selector.AddReceive(ch1, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.AddReceive(ch2, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.Select(ctx) +``` + +## Non-Deterministic Operations + +### Map Iteration + +```go +// BAD - Map range order is randomized +for k, v := range myMap { + // Non-deterministic order! 
+} + +// GOOD - Sort keys first +keys := make([]string, 0, len(myMap)) +for k := range myMap { + keys = append(keys, k) +} +sort.Strings(keys) +for _, k := range keys { + v := myMap[k] + // Deterministic order +} +``` + +### Time and Randomness + +```go +// BAD +t := time.Now() // System clock, non-deterministic +time.Sleep(time.Second) // Not replay-safe +r := rand.Intn(100) // Non-deterministic + +// GOOD +t := workflow.Now(ctx) // Deterministic +workflow.Sleep(ctx, time.Second) // Durable timer +encoded := workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { + return rand.Intn(100) +}) +var r int +encoded.Get(&r) +``` + +Use the `workflowcheck` static analysis tool to catch non-deterministic calls. For false positives, annotate with `//workflowcheck:ignore` on the line above. + +### Anonymous Functions as Local Activities + +**The Problem**: The Go SDK derives the local activity name from the function. Anonymous functions get a non-deterministic name that can change across builds, causing replay failures. + +```go +// BAD - anonymous function: name is non-deterministic +workflow.ExecuteLocalActivity(ctx, func(ctx context.Context) (string, error) { + return "result", nil +}) + +// GOOD - named function: stable, deterministic name +func QuickLookup(ctx context.Context) (string, error) { + return "result", nil +} + +workflow.ExecuteLocalActivity(ctx, QuickLookup) +``` + +Always use named functions for local activities (and regular activities). + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/go/error-handling.md` for detailed guidance on error classification and retry policies. 
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```go +// BAD - No heartbeat, can't detect stuck activities or receive cancellation +func ProcessLargeFile(ctx context.Context, path string) error { + for _, chunk := range readChunks(path) { + process(chunk) // Takes hours, no heartbeat + } + return nil +} + +// GOOD - Regular heartbeats with progress +func ProcessLargeFile(ctx context.Context, path string) error { + for i, chunk := range readChunks(path) { + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing chunk %d", i)) + process(chunk) + } + return nil +} +``` + +### Heartbeat Timeout Too Short + +```go +// BAD - Heartbeat timeout shorter than processing time +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 10 * time.Second, // Too short! +} + +// GOOD - Heartbeat timeout allows for processing variance +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 2 * time.Minute, +} +``` + +Set heartbeat timeout as high as acceptable for your use case -- each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```go +// BAD - Cleanup doesn't run on cancellation +func BadWorkflow(ctx workflow.Context) error { + _ = workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, ReleaseResource).Get(ctx, nil) // Never runs if cancelled! 
+ return nil +} + +// GOOD - Use defer with NewDisconnectedContext for cleanup +func GoodWorkflow(ctx workflow.Context) error { + defer func() { + if !errors.Is(ctx.Err(), workflow.ErrCanceled) { + return + } + newCtx, _ := workflow.NewDisconnectedContext(ctx) + _ = workflow.ExecuteActivity(newCtx, ReleaseResource).Get(newCtx, nil) + }() + + err := workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + if err != nil { + return err + } + return workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: + +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Checking ctx.Done()** - Detect when cancellation arrives + +```go +// BAD - Activity ignores cancellation +func LongActivity(ctx context.Context) error { + doExpensiveWork() // Runs to completion even if cancelled + return nil +} + +// GOOD - Heartbeat and check ctx.Done() +func LongActivity(ctx context.Context) error { + for i, item := range items { + select { + case <-ctx.Done(): + cleanup() + return ctx.Err() + default: + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing item %d", i)) + process(item) + } + } + return nil +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/go/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code, and should be considered in addition to standard testing. Please see `references/go/testing.md` for more info. + +## Timers and Sleep + +### Using time.Sleep Instead of workflow.Sleep + +```go +// BAD: time.Sleep is not deterministic during replay +func BadWorkflow(ctx workflow.Context) error { + time.Sleep(60 * time.Second) // Non-deterministic! 
+ return nil +} + +// GOOD: Use workflow.Sleep for deterministic timers +func GoodWorkflow(ctx workflow.Context) error { + workflow.Sleep(ctx, 60*time.Second) // Deterministic + return nil +} +``` + +### Using time.After Instead of workflow.NewTimer + +```go +// BAD: time.After is not replay-safe +func BadWorkflow(ctx workflow.Context) error { + <-time.After(5 * time.Minute) // Non-deterministic! + return nil +} + +// GOOD: Use workflow.NewTimer for durable timers +func GoodWorkflow(ctx workflow.Context) error { + timer := workflow.NewTimer(ctx, 5*time.Minute) + _ = timer.Get(ctx, nil) // Deterministic, durable + return nil +} +``` + +### Using time.Now() Instead of workflow.Now() + +```go +// BAD: time.Now() differs between execution and replay +deadline := time.Now().Add(24 * time.Hour) + +// GOOD: workflow.Now() is replay-safe +deadline := workflow.Now(ctx).Add(24 * time.Hour) +``` + +**Why this matters:** `time.Now()`, `time.Sleep()`, and `time.After()` use the system clock, which differs between original execution and replay. The `workflow.*` equivalents create durable, deterministic entries in the event history. diff --git a/references/go/observability.md b/references/go/observability.md new file mode 100644 index 0000000..a7867b3 --- /dev/null +++ b/references/go/observability.md @@ -0,0 +1,181 @@ +# Go SDK Observability + +## Overview + +The Go SDK provides replay-safe logging via `workflow.GetLogger`, metrics via the Tally library with Prometheus export, and tracing via OpenTelemetry, OpenTracing, or Datadog. + +## Logging / Replay-Aware Logging + +### Workflow Logging + +Use `workflow.GetLogger(ctx)` for replay-safe logging. This logger automatically suppresses duplicate messages during replay. 
+ +```go +func MyWorkflow(ctx workflow.Context, input string) (string, error) { + logger := workflow.GetLogger(ctx) + logger.Info("Workflow started", "input", input) + + var result string + err := workflow.ExecuteActivity(ctx, MyActivity, input).Get(ctx, &result) + if err != nil { + logger.Error("Activity failed", "error", err) + return "", err + } + + logger.Info("Workflow completed", "result", result) + return result, nil +} +``` + +The workflow logger automatically: + +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) + +### Activity Logging + +Use `activity.GetLogger(ctx)` for context-aware activity logging: + +```go +func MyActivity(ctx context.Context, input string) (string, error) { + logger := activity.GetLogger(ctx) + logger.Info("Processing input", "input", input) + // ... + return "done", nil +} +``` + +Activity logger includes: + +- Activity ID, type, and task queue +- Workflow ID and run ID +- Attempt number (for retries) + +### Adding Persistent Fields + +Use `log.With` to create a logger with key-value pairs included in every entry: + +```go +logger := log.With(workflow.GetLogger(ctx), "orderId", orderId, "customerId", customerId) +logger.Info("Processing order") // includes orderId and customerId +``` + +## Customizing the Logger + +The SDK ships a single built-in **`slog` adapter** (`log.NewStructuredLogger`) and considers `slog` (Go 1.21+) the universal bridge to other logging libraries. + +### The `log.Logger` Interface + +```go +// go.temporal.io/sdk/log +type Logger interface { + Debug(msg string, keyvals ...interface{}) + Info(msg string, keyvals ...interface{}) + Warn(msg string, keyvals ...interface{}) + Error(msg string, keyvals ...interface{}) +} +``` + +Optional companion interfaces: `WithLogger` (adds `.With()`) and `WithSkipCallers` (fixes caller frames).
+ +### Using slog (Recommended) + +```go +import ( + "log/slog" + "os" + + "go.temporal.io/sdk/log" +) + +slogHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}) +logger := log.NewStructuredLogger(slog.New(slogHandler)) + +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +### Using slog as a Bridge to Third-Party Loggers + +Any third-party logger that can back an `slog.Handler` works with `log.NewStructuredLogger` — this includes zap, zerolog, logrus, and most modern Go logging libraries. The pattern is: create an `slog.Handler` from your logger, then wrap it with `log.NewStructuredLogger`. + +**Example with Zap:** + +```go +import ( + "log/slog" + + "go.uber.org/zap" + "go.uber.org/zap/exp/zapslog" + "go.temporal.io/sdk/log" +) + +zapLogger, _ := zap.NewProduction() +handler := zapslog.NewHandler(zapLogger.Core()) +logger := log.NewStructuredLogger(slog.New(handler)) + +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +### Direct Adapter (Alternative) + +If you cannot use the slog bridge, you can implement the `log.Logger` interface directly. The Temporal samples repo has a ~60-line [zap adapter](https://github.com/temporalio/samples-go/blob/main/zapadapter/zap_adapter.go) that implements `Logger`, `WithLogger`, and `WithSkipCallers` and can be copied into your project. 
+ +## Metrics + +Use the Tally library (`go.temporal.io/sdk/contrib/tally`) with Prometheus: + +```go +import ( + sdktally "go.temporal.io/sdk/contrib/tally" + "github.com/uber-go/tally/v4" + "github.com/uber-go/tally/v4/prometheus" +) + +func newPrometheusScope(c prometheus.Configuration) tally.Scope { + reporter, err := c.NewReporter( + prometheus.ConfigurationOptions{}, + ) + if err != nil { + log.Fatalln("error creating prometheus reporter", err) + } + scopeOpts := tally.ScopeOptions{ + CacheReporter: reporter, + Separator: "_", + SanitizeOptions: &sdktally.PrometheusSanitizeOptions, + } + scope, _ := tally.NewRootScope(scopeOpts, time.Second) + scope = sdktally.NewPrometheusNamingScope(scope) + return scope +} + +c, err := client.Dial(client.Options{ + MetricsHandler: sdktally.NewMetricsHandler(newPrometheusScope(prometheus.Configuration{ + ListenAddress: "0.0.0.0:9090", + TimerType: "histogram", + })), +}) +``` + +Key SDK metrics: + +- `temporal_workflow_task_execution_latency` -- Workflow task processing time +- `temporal_activity_execution_latency` -- Activity execution time +- `temporal_workflow_task_replay_latency` -- Replay duration +- `temporal_request` -- Client requests to server +- `temporal_activity_schedule_to_start_latency` -- Time from scheduling to start + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/go/data-handling.md` + +## Best Practices + +1. Always use `workflow.GetLogger(ctx)` in workflows -- never `fmt.Println` or `log.Println` (they produce duplicates on replay) +2. Use `activity.GetLogger(ctx)` in activities for structured context +3. Set up Prometheus metrics in production +4. Use search attributes for operational visibility and debugging +5. 
Use `workflow.IsReplaying(ctx)` only for custom side-effect-free logging -- the built-in logger handles replay suppression automatically diff --git a/references/go/patterns.md b/references/go/patterns.md new file mode 100644 index 0000000..298cca4 --- /dev/null +++ b/references/go/patterns.md @@ -0,0 +1,539 @@ +# Go SDK Patterns + +## Signals + +In Go, signals are received via channels, not handler functions. + +```go +func OrderWorkflow(ctx workflow.Context) (string, error) { + approved := false + var items []string + + approveCh := workflow.GetSignalChannel(ctx, "approve") + addItemCh := workflow.GetSignalChannel(ctx, "add-item") + + // Listen for signals in a goroutine so workflow can proceed + workflow.Go(ctx, func(ctx workflow.Context) { + for { + selector := workflow.NewSelector(ctx) + selector.AddReceive(approveCh, func(c workflow.ReceiveChannel, more bool) { + c.Receive(ctx, &approved) + }) + selector.AddReceive(addItemCh, func(c workflow.ReceiveChannel, more bool) { + var item string + c.Receive(ctx, &item) + items = append(items, item) + }) + selector.Select(ctx) + } + }) + + // Wait for approval + workflow.Await(ctx, func() bool { return approved }) + return fmt.Sprintf("Processed %d items", len(items)), nil +} +``` + +### Blocking receive from a single channel + +When waiting on a single signal, no Selector is needed: + +```go +var approveInput ApproveInput +workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approveInput) +``` + +## Queries + +**Important:** Queries must NOT modify workflow state. Query handlers run outside workflow context -- do not call `workflow.Go()`, `workflow.NewChannel()`, or any blocking workflow functions. 
+ +```go +func StatusWorkflow(ctx workflow.Context) error { + currentState := "started" + progress := 0 + + err := workflow.SetQueryHandler(ctx, "get-status", func() (string, error) { + return currentState, nil + }) + if err != nil { + return err + } + + err = workflow.SetQueryHandler(ctx, "get-progress", func() (int, error) { + return progress, nil + }) + if err != nil { + return err + } + + // Workflow logic updates currentState and progress as it runs + currentState = "running" + for i := 0; i < 100; i++ { + progress = i + err := workflow.ExecuteActivity( + workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + }), + ProcessItem, i, + ).Get(ctx, nil) + if err != nil { + currentState = "failed" + return err + } + } + currentState = "done" + return nil +} +``` + +## Updates + +```go +func OrderWorkflow(ctx workflow.Context) (int, error) { + var items []string + + err := workflow.SetUpdateHandlerWithOptions( + ctx, + "add-item", + func(ctx workflow.Context, item string) (int, error) { + // Handler can mutate workflow state and return a value + items = append(items, item) + return len(items), nil + }, + workflow.UpdateHandlerOptions{ + Validator: func(ctx workflow.Context, item string) error { + if item == "" { + return fmt.Errorf("item cannot be empty") + } + if len(items) >= 100 { + return fmt.Errorf("order is full") + } + return nil + }, + }, + ) + if err != nil { + return 0, err + } + + // Block until cancelled + _ = ctx.Done().Receive(ctx, nil) + return len(items), nil +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Return an error to reject the update; return `nil` to accept. 
+ +## Child Workflows + +```go +func ParentWorkflow(ctx workflow.Context, orders []Order) ([]string, error) { + cwo := workflow.ChildWorkflowOptions{ + WorkflowExecutionTimeout: 30 * time.Minute, + } + ctx = workflow.WithChildOptions(ctx, cwo) + + var results []string + for _, order := range orders { + var result string + err := workflow.ExecuteChildWorkflow(ctx, ProcessOrderWorkflow, order).Get(ctx, &result) + if err != nil { + return nil, err + } + results = append(results, result) + } + return results, nil +} +``` + +### Child Workflow Options + +```go +import enumspb "go.temporal.io/api/enums/v1" + +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: fmt.Sprintf("child-%s", workflow.GetInfo(ctx).WorkflowExecution.ID), + + // ParentClosePolicy - what happens to child when parent closes + // PARENT_CLOSE_POLICY_TERMINATE (default), PARENT_CLOSE_POLICY_ABANDON, PARENT_CLOSE_POLICY_REQUEST_CANCEL + ParentClosePolicy: enumspb.PARENT_CLOSE_POLICY_ABANDON, + + WorkflowExecutionTimeout: 10 * time.Minute, + WorkflowTaskTimeout: time.Minute, +} +ctx = workflow.WithChildOptions(ctx, cwo) + +future := workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, input) + +// Wait for child to start (important for ABANDON policy) +if err := future.GetChildWorkflowExecution().Get(ctx, nil); err != nil { + return err +} +``` + +## Handles to External Workflows + +```go +func CoordinatorWorkflow(ctx workflow.Context, targetWorkflowID string) error { + // Signal an external workflow + err := workflow.SignalExternalWorkflow(ctx, targetWorkflowID, "", "data-ready", payload).Get(ctx, nil) + if err != nil { + return err + } + + // Cancel an external workflow + err = workflow.RequestCancelExternalWorkflow(ctx, targetWorkflowID, "").Get(ctx, nil) + return err +} +``` + +## Parallel Execution + +`workflow.ExecuteActivity` returns a `workflow.Future` without blocking, so start every activity first and then call `Get` on each future to collect the results. Use `workflow.Go` to run background coroutines and `workflow.Selector` to react to whichever result completes first.
+ +```go +func ParallelWorkflow(ctx workflow.Context, items []string) ([]string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + // Launch activities in parallel + futures := make([]workflow.Future, len(items)) + for i, item := range items { + futures[i] = workflow.ExecuteActivity(actCtx, ProcessItem, item) + } + + // Collect all results + results := make([]string, len(items)) + for i, future := range futures { + if err := future.Get(ctx, &results[i]); err != nil { + return nil, err + } + } + return results, nil +} +``` + +### Using workflow.Go for background goroutines + +```go +ch := workflow.NewChannel(ctx) + +workflow.Go(ctx, func(ctx workflow.Context) { + // Background work + var result string + _ = workflow.ExecuteActivity(actCtx, SomeActivity).Get(ctx, &result) + ch.Send(ctx, result) +}) + +var result string +ch.Receive(ctx, &result) +``` + +## Selector Pattern + +`workflow.Selector` replaces Go's native `select` -- required for deterministic workflow execution. Use it to wait on multiple channels, futures, and timers simultaneously. 
+ +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var outcome string + signalCh := workflow.GetSignalChannel(ctx, "approve") + actFuture := workflow.ExecuteActivity(actCtx, AutoReviewActivity) + + // Cancel timer if signal or activity wins + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 24*time.Hour) + + selector := workflow.NewSelector(ctx) + + // Branch 1: Signal received + selector.AddReceive(signalCh, func(c workflow.ReceiveChannel, more bool) { + var approved bool + c.Receive(ctx, &approved) + cancelTimer() + if approved { + outcome = "approved-by-signal" + } else { + outcome = "rejected-by-signal" + } + }) + + // Branch 2: Activity completed + selector.AddFuture(actFuture, func(f workflow.Future) { + var result string + _ = f.Get(ctx, &result) + cancelTimer() + outcome = result + }) + + // Branch 3: Timeout + selector.AddFuture(timer, func(f workflow.Future) { + if err := f.Get(ctx, nil); err == nil { + outcome = "timed-out" + } + // If timer was cancelled, err is CanceledError -- ignore + }) + + selector.Select(ctx) // Blocks until one branch fires + return outcome, nil +} +``` + +Key points: + +- `AddReceive(channel, callback)` -- fires when a channel has a message (must consume with `c.Receive`) +- `AddFuture(future, callback)` -- fires when a future resolves (once per Selector) +- `AddDefault(callback)` -- fires immediately if nothing else is ready +- `Select(ctx)` -- blocks until one branch fires; call multiple times to process multiple events + +## Continue-as-New + +```go +func LongRunningWorkflow(ctx workflow.Context, state WorkflowState) (string, error) { + for { + state = processBatch(ctx, state) + + if state.IsComplete { + return "done", nil + } + + // Check if history is getting large + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + return "", 
workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) + } + } +} +``` + +Drain signals before continue-as-new to avoid signal loss: + +```go +for { + var signalVal string + ok := signalChan.ReceiveAsync(&signalVal) + if !ok { + break + } + // process signal +} +return "", workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) +``` + +## Cancellation Handling + +Use `ctx.Done()` to detect cancellation and `workflow.NewDisconnectedContext` for cleanup that must run even after cancellation. + +```go +func MyWorkflow(ctx workflow.Context) error { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Hour, + }) + + err := workflow.ExecuteActivity(actCtx, LongRunningActivity).Get(ctx, nil) + if err != nil && temporal.IsCanceledError(ctx.Err()) { + // Workflow was cancelled -- run cleanup with a disconnected context + workflow.GetLogger(ctx).Info("Workflow cancelled, running cleanup") + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + disconnectedCtx = workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + _ = workflow.ExecuteActivity(disconnectedCtx, CleanupActivity).Get(disconnectedCtx, nil) + return err // Return CanceledError + } + return err +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent -- they may be retried (as with ALL activities). + +Use `workflow.NewDisconnectedContext` when running compensations so they execute even if the workflow is cancelled. + +```go +func OrderWorkflow(ctx workflow.Context, order Order) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var compensations []func(ctx workflow.Context) error + + // Helper to run all compensations in reverse, using a disconnected context + // so compensations run even if the workflow is cancelled. 
+ runCompensations := func() { + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + compCtx := workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + for i := len(compensations) - 1; i >= 0; i-- { + if err := compensations[i](compCtx); err != nil { + workflow.GetLogger(ctx).Error("Compensation failed", "error", err) + } + } + } + + // Register compensation BEFORE running the activity. + // If the activity completes the effect but fails on return, + // we still need the compensation. + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, ReleaseInventoryIfReserved, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ReserveInventory, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, RefundPaymentIfCharged, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ChargePayment, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + if err := workflow.ExecuteActivity(actCtx, ShipOrder, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + return "Order completed", nil +} +``` + +## Wait Condition with Timeout + +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + approved := false + + // Set up signal handler + workflow.Go(ctx, func(ctx workflow.Context) { + workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approved) + }) + + // Wait with 24-hour timeout -- returns (conditionMet, error) + conditionMet, err := workflow.AwaitWithTimeout(ctx, 24*time.Hour, func() bool { + return approved + }) + if err != nil { + return "", err + } + + if conditionMet { + return "approved", nil + } + return "auto-rejected due to timeout", nil +} +``` + +Without timeout: + +```go +err := workflow.Await(ctx, 
func() bool { return ready }) +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers may run activities asynchronously. Use `workflow.Await` with `workflow.AllHandlersFinished` before completing or continuing-as-new to prevent the workflow from closing while handlers are still running. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + // ... register handlers, main workflow logic ... + + // Before exiting, wait for all handlers to finish + err := workflow.Await(ctx, func() bool { + return workflow.AllHandlersFinished(ctx) + }) + if err != nil { + return "", err + } + return "done", nil +} +``` + +## Activity Heartbeat Details + +### WHY: + +- **Support activity cancellation** -- Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** -- Heartbeat details persist across retries + +### WHEN: + +- **Cancellable activities** -- Any activity that should respond to cancellation +- **Long-running activities** -- Track progress for resumability +- **Checkpointing** -- Save progress periodically + +```go +func ProcessLargeFile(ctx context.Context, filePath string) (string, error) { + // Recover from previous attempt + startIdx := 0 + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &startIdx); err == nil { + startIdx++ // Resume from next item + } + } + + lines := readFileLines(filePath) + + for i := startIdx; i < len(lines); i++ { + processLine(lines[i]) + + // Heartbeat with progress -- if cancelled, ctx will be cancelled + activity.RecordHeartbeat(ctx, i) + + if ctx.Err() != nil { + // Activity was cancelled + cleanup() + return "", ctx.Err() + } + } + + return "completed", nil +} +``` + +## Timers + +```go +func TimerWorkflow(ctx workflow.Context) (string, error) { + // Simple sleep + err := workflow.Sleep(ctx, time.Hour) + if err != nil { + return "", err + } + + // Timer as a Future -- for use with 
Selector + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 30*time.Minute) + + // Cancel the timer when no longer needed + cancelTimer() + + return "Timer fired", nil +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are neither durable nor distributed. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + lao := workflow.LocalActivityOptions{ + StartToCloseTimeout: 5 * time.Second, + } + ctx = workflow.WithLocalActivityOptions(ctx, lao) + + var result string + err := workflow.ExecuteLocalActivity(ctx, QuickLookup, "key").Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` diff --git a/references/go/testing.md b/references/go/testing.md new file mode 100644 index 0000000..ab74bbd --- /dev/null +++ b/references/go/testing.md @@ -0,0 +1,238 @@ +# Go SDK Testing + +## Overview + +The Go SDK provides the `testsuite` package for testing Workflows and Activities. It uses the [testify](https://github.com/stretchr/testify) library for assertions (`assert`/`require`) and mocking (`mock`). The test environment supports automatic time-skipping for Workflows with timers. + +## Test Environment Setup + +Two approaches: struct-based with `suite.Suite` or function-based with `testsuite.NewTestWorkflowEnvironment()`.
+ +**Approach 1: Struct-based (testify suite)** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + + "go.temporal.io/sdk/testsuite" +) + +type UnitTestSuite struct { + suite.Suite + testsuite.WorkflowTestSuite + + env *testsuite.TestWorkflowEnvironment +} + +func (s *UnitTestSuite) SetupTest() { + s.env = s.NewTestWorkflowEnvironment() +} + +func (s *UnitTestSuite) AfterTest(suiteName, testName string) { + s.env.AssertExpectations(s.T()) +} + +func (s *UnitTestSuite) Test_MyWorkflow_Success() { + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} + +func TestUnitTestSuite(t *testing.T) { + suite.Run(t, new(UnitTestSuite)) +} +``` + +**Approach 2: Function-based** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/testsuite" +) + +func Test_MyWorkflow(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestWorkflowEnvironment() + env.RegisterActivity(MyActivity) + + env.ExecuteWorkflow(MyWorkflow, "input") + assert.True(t, env.IsWorkflowCompleted()) + assert.NoError(t, env.GetWorkflowError()) + + var result string + assert.NoError(t, env.GetWorkflowResult(&result)) + assert.Equal(t, "expected", result) +} +``` + +You must register all Activity Definitions used by the Workflow with `env.RegisterActivity(ActivityFunc)`. The Workflow itself does not need to be registered. + +## Activity Mocking + +Mock activities with `env.OnActivity()` to test Workflow logic in isolation. 
+ +**Return mock values:** + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return("mock_result", nil) +``` + +**Return a function replacement** (for parameter validation or custom logic): + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + func(ctx context.Context, input string) (string, error) { + // Custom logic, assertions, etc. + return "computed_result", nil + }, +) +``` + +**Match specific arguments:** + +```go +env.OnActivity(MyActivity, mock.Anything, "specific_input").Return("result", nil) +``` + +When using mocks, you do not need to call `env.RegisterActivity()` for that Activity. The mock signature must match the original Activity function signature. + +## Testing Signals and Queries + +Use `RegisterDelayedCallback` to send Signals during Workflow execution. Use `QueryWorkflow` to test query handlers. + +```go +func (s *UnitTestSuite) Test_SignalsAndQueries() { + // Register a delayed callback to send a signal after 5 seconds + s.env.RegisterDelayedCallback(func() { + s.env.SignalWorkflow("approve", SignalData{Approved: true}) + }, time.Second*5) + + s.env.ExecuteWorkflow(ApprovalWorkflow, input) + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} +``` + +**Query a running Workflow** (must be called inside `RegisterDelayedCallback` or after `ExecuteWorkflow`): + +```go +s.env.RegisterDelayedCallback(func() { + res, err := s.env.QueryWorkflow("getProgress") + s.NoError(err) + + var progress int + err = res.Get(&progress) + s.NoError(err) + s.Equal(50, progress) +}, time.Second*10+time.Millisecond) +``` + +`QueryWorkflow` returns a `converter.EncodedValue`. Use `.Get(&result)` to decode the value. + +For "Signal-With-Start" testing, set the delay to `0`. 
+ +## Testing Failure Cases + +```go +func (s *UnitTestSuite) Test_WorkflowFailure() { + // Mock activity to return an error + s.env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + "", errors.New("activity failed")) + + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + + err := s.env.GetWorkflowError() + s.Error(err) + + var applicationErr *temporal.ApplicationError + s.True(errors.As(err, &applicationErr)) + s.Equal("activity failed", applicationErr.Error()) +} +``` + +`env.GetWorkflowError()` returns the Workflow error. Use `errors.As(err, &applicationErr)` to check the error type. Mock activities returning errors to test Workflow error-handling paths. + +## Replay Testing + +Use `worker.NewWorkflowReplayer()` to verify that code changes do not break determinism. Load history from a JSON file exported via the Temporal CLI or Web UI. + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/worker" +) + +func Test_ReplayFromFile(t *testing.T) { + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err := replayer.ReplayWorkflowHistoryFromJSONFile(nil, "my_workflow_history.json") + assert.NoError(t, err) +} +``` + +Export history via CLI: `temporal workflow show --workflow-id <workflow-id> --output json > history.json` + +**Replay from a programmatically fetched history:** + +```go +func Test_ReplayFromServer(t *testing.T) { + // Fetch history from the server + hist, err := GetWorkflowHistory(ctx, client, workflowID, runID) + assert.NoError(t, err) + + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err = replayer.ReplayWorkflowHistory(nil, hist) + assert.NoError(t, err) +} +``` + +## Activity Testing + +Test Activities in isolation using `TestActivityEnvironment`. No Worker or Workflow needed. 
+ +```go +func Test_MyActivity(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestActivityEnvironment() + env.RegisterActivity(MyActivity) + + val, err := env.ExecuteActivity(MyActivity, "input") + assert.NoError(t, err) + + var result string + assert.NoError(t, val.Get(&result)) + assert.Equal(t, "expected_output", result) +} +``` + +`ExecuteActivity` returns `(converter.EncodedValue, error)`. Use `val.Get(&result)` to extract the typed result. The Activity executes synchronously in the calling goroutine. + +## Best Practices + +1. Register all Activities used by the Workflow with `env.RegisterActivity()`, unless you mock them with `env.OnActivity()` +2. Use mocks to isolate Workflow logic from Activity implementations +3. Test failure paths by mocking Activities that return errors +4. Use replay testing before deploying Workflow code changes to catch non-determinism errors +5. Use unique task queues per test when running integration tests +6. Call `env.AssertExpectations(s.T())` in `AfterTest` to verify all mocks were called diff --git a/references/go/versioning.md b/references/go/versioning.md new file mode 100644 index 0000000..c8f7280 --- /dev/null +++ b/references/go/versioning.md @@ -0,0 +1,238 @@ +# Go SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## GetVersion API + +`workflow.GetVersion` safely performs backwards-incompatible changes to Workflow Definitions. It returns the version to branch on, recording the result as a marker in the Event History. 
+ +```go +v := workflow.GetVersion(ctx, "changeID", workflow.DefaultVersion, maxSupported) +``` + +- `changeID`: unique string identifying the change +- `minSupported`: oldest version still supported (`workflow.DefaultVersion` is `-1`) +- `maxSupported`: current/newest version +- Returns `maxSupported` for new executions; returns the recorded version on replay + +### Three-Step Lifecycle + +**Step 1: Add GetVersion with both code paths** + +Original code calls `ActivityA`. You want to replace it with `ActivityC`: + +```go +v := workflow.GetVersion(ctx, "Step1", workflow.DefaultVersion, 1) +if v == workflow.DefaultVersion { + // Old code path (for replay of existing workflows) + err = workflow.ExecuteActivity(ctx, ActivityA, data).Get(ctx, &result1) +} else { + // New code path + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} +``` + +For new executions, `GetVersion` returns `1` and records a marker. For replay of pre-change workflows (no marker), it returns `DefaultVersion` (`-1`). + +**Step 2: Remove old branch (increase minSupported)** + +After all `DefaultVersion` Workflow Executions have completed: + +```go +_ = workflow.GetVersion(ctx, "Step1", 1, 1) +// Only the new code path remains +err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +``` + +Keep the `GetVersion` call even with a single branch. This ensures: + +1. If an older execution replays on this code, it fails fast instead of proceeding incorrectly +2. 
If you need further changes, you just bump `maxSupported` + +**Step 3: Further changes (bump maxSupported)** + +Later, replace `ActivityC` with `ActivityD`: + +```go +v := workflow.GetVersion(ctx, "Step1", 1, 2) +if v == 1 { + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} else { + err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +} +``` + +After all version-1 executions complete, collapse again: + +```go +_ = workflow.GetVersion(ctx, "Step1", 2, 2) +err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +``` + +### Using GetVersion in Loops + +The return value for a given `changeID` is immutable once recorded. In loops, append the iteration number to the `changeID`: + +```go +for i := 0; i < 10; i++ { + v := workflow.GetVersion(ctx, fmt.Sprintf("myChange-%d", i), workflow.DefaultVersion, 1) + if v == workflow.DefaultVersion { + // old path + } else { + // new path + } +} +``` + +## Workflow Type Versioning + +Create a new Workflow Type for incompatible changes: + +```go +// Original +func MyWorkflow(ctx workflow.Context, input Input) (string, error) { + // v1 implementation +} + +// New version +func MyWorkflowV2(ctx workflow.Context, input Input) (string, error) { + // v2 implementation +} +``` + +Register both with the Worker: + +```go +w := worker.New(c, "my-task-queue", worker.Options{}) +w.RegisterWorkflow(MyWorkflow) +w.RegisterWorkflow(MyWorkflowV2) +``` + +Route new executions to the new type. Old workflows continue on the old type. Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). 
All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). + +### Configuring Workers for Versioning + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "my-service", + BuildId: "v1.0.0", // or git commit hash + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +**Configuration fields:** + +- `UseVersioning`: enables Worker Versioning +- `Version`: identifies the Worker Deployment Version (deployment name + build ID) +- `DefaultVersioningBehavior`: `VersioningBehaviorPinned` or `VersioningBehaviorAutoUpgrade` +- Build ID: typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version. + +**When to use PINNED:** + +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer versions. + +**When to use AUTO_UPGRADE:** + +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using GetVersion for version transitions + +**Important:** AUTO_UPGRADE workflows still need GetVersion to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```go +// For short-running workflows, prefer PINNED +w := worker.New(c, "orders-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "order-service", + BuildId: os.Getenv("BUILD_ID"), + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: + +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: + +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +This works well with Kubernetes where you manage multiple ReplicaSets running different Worker versions. + +Deploy a new version, then set it as current: + +```bash +temporal worker deployment set-current-version \ + --deployment-name my-service \ + --build-id v2.0.0 +``` + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Keep GetVersion calls** even when only a single branch remains -- it guards against stale replays and simplifies future changes +2. **Use `TemporalChangeVersion` search attribute** to find Workflows running on old versions (values have the form `<changeID>-<version>`, e.g. `Step1-1`): + ```bash + temporal workflow list --query \ + 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion = "Step1-1"' + ``` +3. **Test with replay** before removing old branches to verify determinism is preserved +4. 
**Prefer Worker Versioning** for large-scale deployments to avoid accumulating patching branches diff --git a/references/java/advanced-features.md b/references/java/advanced-features.md new file mode 100644 index 0000000..9db730c --- /dev/null +++ b/references/java/advanced-features.md @@ -0,0 +1,192 @@ +# Java SDK Advanced Features + +## Schedules + +Create recurring workflow executions. + +```java +import io.temporal.client.schedules.*; + +ScheduleClient scheduleClient = ScheduleClient.newInstance(service); + +// Create a schedule +String scheduleId = "daily-report"; +ScheduleHandle handle = scheduleClient.createSchedule( + scheduleId, + Schedule.newBuilder() + .setAction( + ScheduleActionStartWorkflow.newBuilder() + .setWorkflowType(DailyReportWorkflow.class) + .setOptions( + WorkflowOptions.newBuilder() + .setWorkflowId("daily-report") + .setTaskQueue("reports") + .build() + ) + .build() + ) + .setSpec( + ScheduleSpec.newBuilder() + .setIntervals( + List.of(new ScheduleIntervalSpec(Duration.ofDays(1))) + ) + .build() + ) + .build(), + ScheduleOptions.newBuilder().build() +); + +// Manage schedules +ScheduleHandle scheduleHandle = scheduleClient.getHandle(scheduleId); +scheduleHandle.pause("Maintenance window"); +scheduleHandle.unpause(); +scheduleHandle.trigger(); // Run immediately +scheduleHandle.delete(); +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a heartbeat timeout on this activity, the external completer is responsible for sending heartbeats via the async handle. + +**Note:** If the external system can reliably Signal back with the result and doesn't need to Heartbeat or receive Cancellation, consider using **signals** instead. 
+ +```java +public class ApprovalActivitiesImpl implements ApprovalActivities { + @Override + public String requestApproval(String requestId) { + ActivityExecutionContext ctx = Activity.getExecutionContext(); + + // Get task token for async completion + byte[] taskToken = ctx.getTaskToken(); + + // Store task token for later completion (e.g., in database) + storeTaskToken(requestId, taskToken); + + // Mark this activity as waiting for external completion + ctx.doNotCompleteOnReturn(); + + return null; // Return value is ignored + } +} + +// Later, complete the activity from another process +public void completeApproval(String requestId, boolean approved) { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + + ActivityCompletionClient completionClient = client.newActivityCompletionClient(); + + // Retrieve the task token from external storage (e.g., database) + byte[] taskToken = getTaskToken(requestId); + + if (approved) { + completionClient.complete(taskToken, "approved"); + } else { + completionClient.completeExceptionally( + taskToken, + new RuntimeException("Rejected") + ); + } +} +``` + +## Worker Tuning + +Configure worker performance settings. 
+ +```java +WorkerOptions workerOptions = WorkerOptions.newBuilder() + // Max concurrent workflow task executions (default: 200) + .setMaxConcurrentWorkflowTaskExecutionSize(200) + // Max concurrent activity executions (default: 200) + .setMaxConcurrentActivityExecutionSize(200) + // Max concurrent local activity executions (default: 200) + .setMaxConcurrentLocalActivityExecutionSize(200) + // Max workflow task pollers (default: 5) + .setMaxConcurrentWorkflowTaskPollers(5) + // Max activity task pollers (default: 5) + .setMaxConcurrentActivityTaskPollers(5) + .build(); + +WorkerFactory factory = WorkerFactory.newInstance(client); +Worker worker = factory.newWorker("my-queue", workerOptions); +worker.registerWorkflowImplementationTypes(MyWorkflowImpl.class); +worker.registerActivitiesImplementations(new MyActivitiesImpl()); +factory.start(); +``` + +## Workflow Init Annotation + +You should always put state initialization logic in the constructor of your workflow class, so that it happens before signals/updates arrive. + +Normally, your constructor must have no arguments. However, if you add the `@WorkflowInit` annotation, then your constructor instead receives the same workflow arguments that `run` receives: + +```java +public class MyWorkflowImpl implements MyWorkflow { + private final int foo; + + @WorkflowInit + public MyWorkflowImpl(ClusterManagerInput input) { + foo = 1234; + } + + @Override + public ClusterManagerResult run(ClusterManagerInput input) { + // this.foo is already initialized + } +} +``` + +The constructor (with `@WorkflowInit`) and the `run` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. + +## Workflow Failure Exception Types + +Control which exceptions cause workflow failures vs workflow task failures. + +By default, only `ApplicationFailure` (and its subclasses) fail the workflow execution. 
All other exceptions fail the **workflow task**, causing the task to retry indefinitely until the code is fixed or the workflow is terminated. + +### Per-Workflow Configuration + +Use `WorkflowImplementationOptions` to specify which exception types should fail the workflow: + +```java +Worker worker = factory.newWorker("my-queue"); +worker.registerWorkflowImplementationTypes( + WorkflowImplementationOptions.newBuilder() + .setFailWorkflowExceptionTypes( + IllegalArgumentException.class, + CustomBusinessException.class + ) + .build(), + MyWorkflowImpl.class +); +``` + +With this configuration, `IllegalArgumentException` and `CustomBusinessException` thrown from the workflow will fail the workflow execution instead of just the workflow task. + +### Worker-Level Configuration + +Apply to all workflows registered on the worker: + +```java +WorkerFactoryOptions factoryOptions = WorkerFactoryOptions.newBuilder() + .setWorkflowHostLocalTaskQueueScheduleToStartTimeout(Duration.ofSeconds(10)) + .build(); +WorkerFactory factory = WorkerFactory.newInstance(client, factoryOptions); + +Worker worker = factory.newWorker("my-queue"); +// Register each workflow type with its own failure exception types +worker.registerWorkflowImplementationTypes( + WorkflowImplementationOptions.newBuilder() + .setFailWorkflowExceptionTypes( + IllegalArgumentException.class, + CustomBusinessException.class + ) + .build(), + MyWorkflowImpl.class, + AnotherWorkflowImpl.class +); +``` + +- **Tip for testing:** Set `setFailWorkflowExceptionTypes(Throwable.class)` so any unhandled exception fails the workflow immediately rather than retrying the workflow task forever. This surfaces bugs faster. 
diff --git a/references/java/data-handling.md b/references/java/data-handling.md new file mode 100644 index 0000000..2ef1891 --- /dev/null +++ b/references/java/data-handling.md @@ -0,0 +1,288 @@ +# Java SDK Data Handling + +## Overview + +The Java SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. The `DataConverter` interface controls how values are converted to and from Temporal `Payload` protobufs. + +## Default Data Converter + +`DefaultDataConverter` applies converters in order, using the first that accepts the value: + +1. `NullPayloadConverter` — `null` values +2. `ByteArrayPayloadConverter` — `byte[]` as raw binary +3. `ProtobufJsonPayloadConverter` — Protobuf `Message` instances as JSON +4. `ProtobufPayloadConverter` — Protobuf `Message` instances as binary +5. `JacksonJsonPayloadConverter` — Everything else via Jackson `ObjectMapper` + +## Jackson Integration + +Use `JacksonJsonPayloadConverter` with a custom `ObjectMapper` for advanced serialization (e.g., Java 8 time module, custom serializers): + +```java +ObjectMapper mapper = new ObjectMapper() + .registerModule(new JavaTimeModule()) + .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides( + new JacksonJsonPayloadConverter(mapper) + ); + +WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); +WorkflowClient client = WorkflowClient.newInstance( + service, + WorkflowClientOptions.newBuilder() + .setDataConverter(converter) + .build() +); +``` + +## Custom Data Converter + +Implement `PayloadConverter` for custom serialization: + +```java +public class MyCustomPayloadConverter implements PayloadConverter { + @Override + public String getEncodingType() { + return "json/my-custom"; + } + + @Override + public Optional toData(Object value) throws DataConverterException { + // Return Optional.empty() if this converter 
doesn't handle the type + if (!(value instanceof MyCustomType)) { + return Optional.empty(); + } + // Serialize to Payload + byte[] data = serialize(value); + return Optional.of( + Payload.newBuilder() + .putMetadata("encoding", ByteString.copyFromUtf8(getEncodingType())) + .setData(ByteString.copyFrom(data)) + .build() + ); + } + + @Override + public <T> T fromData(Payload content, Class<T> valueClass, Type valueType) + throws DataConverterException { + // Deserialize from Payload + return deserialize(content.getData().toByteArray(), valueClass); + } +} +``` + +Override specific converters in the default chain: + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides(new MyCustomPayloadConverter()); +``` + +## Composition of Payload Converters + +`DefaultDataConverter` holds a list of `PayloadConverter` instances tried in order. The first converter whose `toData()` returns a non-empty `Optional` wins. When using `withPayloadConverterOverrides()`, converters with matching encoding types replace existing ones. + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides( + new MyCustomPayloadConverter(), // encoding: "json/my-custom" + new JacksonJsonPayloadConverter(mapper) // replaces default Jackson converter + ); +``` + +## Protobuf Support + +Protobuf messages are handled by `ProtobufJsonPayloadConverter` (enabled by default). It serializes `com.google.protobuf.Message` instances as JSON for human readability in the Temporal UI. 
+ +```java +// Protobuf messages work out of the box as workflow/activity params +@WorkflowInterface +public interface MyWorkflow { + @WorkflowMethod + MyProtoResult run(MyProtoInput input); +} +``` + +For binary protobuf encoding instead of JSON, use `ProtobufPayloadConverter`: + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides(new ProtobufPayloadConverter()); +``` + +## Payload Encryption + +Use `PayloadCodec` with `CodecDataConverter` to encrypt/compress payloads: + +```java +public class EncryptionCodec implements PayloadCodec { + private final SecretKey key; + + public EncryptionCodec(SecretKey key) { + this.key = key; + } + + @Override + public List<Payload> encode(List<Payload> payloads) { + return payloads.stream().map(payload -> { + // Encrypt payload.toByteArray() using your chosen algorithm (e.g., AES/GCM) + byte[] encrypted = encryptBytes(payload.toByteArray(), key); + return Payload.newBuilder() + .putMetadata("encoding", ByteString.copyFromUtf8("binary/encrypted")) + .setData(ByteString.copyFrom(encrypted)) + .build(); + }).collect(Collectors.toList()); + } + + @Override + public List<Payload> decode(List<Payload> payloads) { + return payloads.stream().map(payload -> { + String encoding = payload.getMetadataOrDefault( + "encoding", ByteString.EMPTY).toStringUtf8(); + if (!"binary/encrypted".equals(encoding)) return payload; + // Decrypt and reconstruct the original Payload + byte[] decrypted = decryptBytes(payload.getData().toByteArray(), key); + return Payload.parseFrom(decrypted); + }).collect(Collectors.toList()); + } +} +``` + +Apply the codec to the client: + +```java +CodecDataConverter codecDataConverter = new CodecDataConverter( + DefaultDataConverter.newDefaultInstance(), + Collections.singletonList(new EncryptionCodec(secretKey)) +); + +WorkflowClient client = WorkflowClient.newInstance( + service, + WorkflowClientOptions.newBuilder() + .setDataConverter(codecDataConverter) + .build() +); +``` + +## Search 
Attributes + +Custom searchable fields for workflow visibility. + +```java +import io.temporal.common.SearchAttributeKey; +import io.temporal.common.SearchAttributes; + +// Define typed search attribute keys +static final SearchAttributeKey ORDER_ID = + SearchAttributeKey.forKeyword("OrderId"); +static final SearchAttributeKey ORDER_STATUS = + SearchAttributeKey.forKeyword("OrderStatus"); +static final SearchAttributeKey ORDER_TOTAL = + SearchAttributeKey.forDouble("OrderTotal"); +static final SearchAttributeKey CREATED_AT = + SearchAttributeKey.forOffsetDateTime("CreatedAt"); + +// Set at workflow start +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("order-" + orderId) + .setTaskQueue("orders") + .setTypedSearchAttributes( + SearchAttributes.newBuilder() + .set(ORDER_ID, orderId) + .set(ORDER_STATUS, "pending") + .set(ORDER_TOTAL, 99.99) + .set(CREATED_AT, OffsetDateTime.now()) + .build() + ) + .build(); +``` + +Upsert during workflow execution: + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(Order order); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + static final SearchAttributeKey ORDER_STATUS = + SearchAttributeKey.forKeyword("OrderStatus"); + + @Override + public String run(Order order) { + // ... process order ... + + Workflow.upsertTypedSearchAttributes( + ORDER_STATUS.valueSet("completed") + ); + return "done"; + } +} +``` + +### Querying Workflows by Search Attributes + +```java +ListWorkflowExecutionsRequest request = ListWorkflowExecutionsRequest.newBuilder() + .setNamespace("default") + .setQuery("OrderStatus = 'processing' OR OrderStatus = 'pending'") + .build(); +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). 
+ +```java +// Set memo at workflow start +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("order-" + orderId) + .setTaskQueue("orders") + .setMemo(Map.of( + "customer_name", order.getCustomerName(), + "notes", "Priority customer" + )) + .build(); +``` + +```java +// Read memo from workflow +@Override +public String run(Order order) { + String notes = Workflow.getMemo("notes", String.class); + // ... +} +``` + +## Deterministic APIs for Values + +Use these APIs within workflows for deterministic values: + +```java +@Override +public String run() { + // Deterministic UUID (same on replay) + String uniqueId = Workflow.randomUUID().toString(); + + // Deterministic random (same on replay) + Random rng = Workflow.newRandom(); + int value = rng.nextInt(100); + + // Deterministic current time (same on replay) + long now = Workflow.currentTimeMillis(); + + return uniqueId; +} +``` + +## Best Practices + +1. Use Jackson `ObjectMapper` customization for complex serialization needs +2. Keep payloads small — see `references/core/gotchas.md` for limits +3. Encrypt sensitive data with `PayloadCodec` and `CodecDataConverter` +4. Use POJOs or Protobuf messages for workflow/activity parameters +5. Use `Workflow.randomUUID()`, `Workflow.newRandom()`, and `Workflow.currentTimeMillis()` for deterministic values diff --git a/references/java/determinism-protection.md b/references/java/determinism-protection.md new file mode 100644 index 0000000..1894644 --- /dev/null +++ b/references/java/determinism-protection.md @@ -0,0 +1,85 @@ +# Java Determinism Protection + +## Overview + +The Java SDK has **no sandbox** (only Python and TypeScript have sandboxing). Java relies on developer conventions and runtime replay detection to enforce determinism. A static analysis tool (`temporal-workflowcheck`) is available in beta. + +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. 
+ +```java +// BAD: Non-deterministic operations in workflow code +Thread.sleep(1000); +UUID id = UUID.randomUUID(); +double val = Math.random(); +long now = System.currentTimeMillis(); +new Thread(() -> doWork()).start(); +CompletableFuture.supplyAsync(() -> compute()); + +// GOOD: Deterministic Workflow.* alternatives +Workflow.sleep(Duration.ofSeconds(1)); +String id = Workflow.randomUUID().toString(); +int val = Workflow.newRandom().nextInt(); +long now = Workflow.currentTimeMillis(); +Promise promise = Async.procedure(() -> doWork()); +CompletablePromise promise = Workflow.newPromise(); +``` + +## Static Analysis with `temporal-workflowcheck` + +**Warning:** This tool is in beta. + +`temporal-workflowcheck` scans compiled bytecode to detect non-deterministic operations in workflow code. It catches threading, I/O, randomization, system time access, and non-final static field access — including transitive violations through call chains. + +### Setup (Gradle) + +Add the dependency as a compile-only check: + +```groovy +dependencies { + implementation 'io.temporal:temporal-sdk:1.+' + compileOnly 'io.temporal:temporal-workflowcheck:1.+' +} +``` + +See the [Gradle sample](https://github.com/temporalio/sdk-java/tree/master/temporal-workflowcheck/samples/gradle) for full task configuration. + +### Setup (Maven) + +See the [Maven sample](https://github.com/temporalio/sdk-java/tree/master/temporal-workflowcheck/samples/maven) for POM configuration. 
+ +### Running Manually + +Download the `-all.jar` from Maven Central (`io.temporal:temporal-workflowcheck`) and run: + +```bash +java -jar temporal-workflowcheck-<version>-all.jar check <classpath...> +``` + +### Suppressing False Positives + +Use the `@WorkflowCheck.SuppressWarnings` annotation on methods: + +```java +@WorkflowCheck.SuppressWarnings(invalidMembers = "currentTimeMillis") +public long getCurrentMillis() { + return System.currentTimeMillis(); +} +``` + +Or use a `.properties` configuration file with `--config <config-file>` for third-party library false positives. + +## Convention-Based Enforcement + +Java workflow code runs in a cooperative threading model where only one workflow thread executes at a time under a global lock. The SDK does not intercept or block non-deterministic calls. Instead, non-determinism is detected at **replay time**: if replayed code produces results that differ from the recorded history, the SDK throws a `NonDeterministicException`. + +Use both `temporal-workflowcheck` (static, pre-deploy) and `WorkflowReplayer` (replay testing) to catch non-determinism before production. + +## Best Practices + +1. Run `temporal-workflowcheck` in CI to catch non-deterministic code statically +2. Always use `Workflow.*` APIs instead of standard Java equivalents for time, randomness, UUIDs, sleeping, and threading +3. Test all workflow code changes with `WorkflowReplayer` against recorded histories +4. Keep workflows focused on orchestration logic; move all I/O and side effects into activities +5. Avoid mutable static state shared across workflow instances diff --git a/references/java/determinism.md b/references/java/determinism.md new file mode 100644 index 0000000..29f25d5 --- /dev/null +++ b/references/java/determinism.md @@ -0,0 +1,57 @@ +# Java SDK Determinism + +## Overview + +The Java SDK has **no sandbox** (only Python and TypeScript have sandboxing). The Java SDK relies on developer conventions to enforce determinism. 
The SDK provides `Workflow.*` APIs as safe replacements for common non-deterministic operations. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time — see `references/java/determinism-protection.md`. + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. + +## SDK Protection + +Java workflow code runs in a cooperative threading model where only one workflow thread executes at a time under a global lock. The SDK does not intercept or block non-deterministic calls at runtime. If you call a forbidden operation, it will silently succeed during the initial execution but cause a `NonDeterministicException` when the workflow is replayed. + +`temporal-workflowcheck` (static analysis, beta) and `WorkflowReplayer` (replay testing) can help uncover some violations, but they are not exhaustive — careful code review and adherence to the rules below remain essential. + +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. 
+ +- `Thread.sleep()` — blocks the real thread, bypasses Temporal timers +- `new Thread()` or thread pools — breaks the cooperative threading model +- `synchronized` blocks and explicit locks — can deadlock with the workflow executor +- `UUID.randomUUID()` — non-deterministic across replays +- `Math.random()` or `new Random()` — non-deterministic across replays +- `System.currentTimeMillis()` or `Instant.now()` — non-deterministic across replays +- Direct I/O (network, filesystem, database) — side effects must run in activities +- Mutable global/static state — shared state breaks isolation between workflow instances +- `CompletableFuture` — bypasses the workflow scheduler; use `Promise` instead + +## Safe Builtin Alternatives + +| Forbidden | Safe Alternative | +|-----------|------------------| +| `Thread.sleep(millis)` | `Workflow.sleep(Duration.ofMillis(millis))` | +| `UUID.randomUUID()` | `Workflow.randomUUID()` | +| `Math.random()` | `Workflow.newRandom().nextInt()` | +| `System.currentTimeMillis()` | `Workflow.currentTimeMillis()` | +| `new Thread(runnable)` | `Async.function(func)` / `Async.procedure(proc)` | +| `CompletableFuture` | `Promise` / `CompletablePromise` | +| `BlockingQueue` | `WorkflowQueue` | +| `Future` | `Promise` | + +## Testing Replay Compatibility + +Use the `WorkflowReplayer` class to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/java/testing.md`. + +## Best Practices + +1. Use `Workflow.currentTimeMillis()` for all time operations +2. Use `Workflow.newRandom()` for random values +3. Use `Workflow.randomUUID()` for unique identifiers +4. Use `Async.function()` / `Async.procedure()` instead of raw threads +5. Use `Promise` and `CompletablePromise` instead of `CompletableFuture` +6. Test with `WorkflowReplayer` to catch non-determinism +7. Keep workflows focused on orchestration, delegate I/O to activities +8. 
Use `Workflow.getLogger()` for replay-safe logging diff --git a/references/java/error-handling.md b/references/java/error-handling.md new file mode 100644 index 0000000..753d69a --- /dev/null +++ b/references/java/error-handling.md @@ -0,0 +1,193 @@ +# Java SDK Error Handling + +## Overview + +The Java SDK uses `ApplicationFailure` for application-specific errors and `RetryOptions` for retry configuration. Generally, the following information about errors and retryability applies across activities, child workflows and Nexus operations. + +## Application Errors + +```java +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; +import io.temporal.failure.ApplicationFailure; + +@ActivityInterface +public interface OrderActivities { + @ActivityMethod + void validateOrder(Order order); +} + +public class OrderActivitiesImpl implements OrderActivities { + @Override + public void validateOrder(Order order) { + if (!order.isValid()) { + throw ApplicationFailure.newFailure( + "Invalid order", + "ValidationError" + ); + } + } +} +``` + +Any exception that is not an `ApplicationFailure` is automatically converted to one, with the fully qualified class name as the type. For example, throwing `new NullPointerException("msg")` is equivalent to `ApplicationFailure.newFailure("msg", "java.lang.NullPointerException")`. 
+ +## Non-Retryable Errors + +```java +import io.temporal.failure.ApplicationFailure; + +public class PaymentActivitiesImpl implements PaymentActivities { + @Override + public String chargeCard(String cardNumber, double amount) { + if (!isValidCard(cardNumber)) { + throw ApplicationFailure.newNonRetryableFailure( + "Permanent failure - invalid credit card", + "PaymentError" + ); + } + return processPayment(cardNumber, amount); + } +} +``` + +You can also mark error types as non-retryable via `RetryOptions.setDoNotRetry()`: + +```java +RetryOptions retryOptions = RetryOptions.newBuilder() + .setDoNotRetry( + CreditCardProcessingException.class.getName(), + "ValidationError" + ) + .build(); +``` + +Use `newNonRetryableFailure()` when the **activity implementer** knows the error is permanent. Use `setDoNotRetry()` when the **caller** wants to control retryability. + +## Activity Errors + +Activity failures are always wrapped in `ActivityFailure`. The original exception becomes the `cause`: + +- `ActivityFailure` → `ApplicationFailure` (application error) +- `ActivityFailure` → `TimeoutFailure` (timeout) +- `ActivityFailure` → `CanceledFailure` (cancellation) + +## Handling Activity Errors + +```java +import io.temporal.failure.ActivityFailure; +import io.temporal.failure.ApplicationFailure; +import io.temporal.failure.CanceledFailure; +import io.temporal.failure.TimeoutFailure; +import io.temporal.workflow.Workflow; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + try { + return activities.riskyOperation(); + } catch (ActivityFailure af) { + // Let cancellation propagate so the workflow is canceled, not failed + if (af.getCause() instanceof CanceledFailure) { + throw af; + } + if (af.getCause() instanceof ApplicationFailure) { + ApplicationFailure appFailure = (ApplicationFailure) af.getCause(); + String type = appFailure.getType(); + // Handle based on error type + } else if (af.getCause() instanceof TimeoutFailure) { + // 
Handle timeout + } + throw ApplicationFailure.newFailure( + "Workflow failed due to activity error", + "WorkflowError" + ); + } + } +} +``` + +## Retry Policy Configuration + +```java +import io.temporal.activity.ActivityOptions; +import io.temporal.common.RetryOptions; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +public class MyWorkflowImpl implements MyWorkflow { + + private final MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setMaximumInterval(Duration.ofMinutes(1)) + .setMaximumAttempts(5) + .setDoNotRetry("ValidationError", "PaymentError") + .build()) + .build() + ); + + @Override + public String run() { + return activities.myActivity(); + } +} +``` + +Only set options such as `maximumInterval`, `maximumAttempts` etc. if you have a domain-specific reason to. If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) // Single attempt + .setScheduleToCloseTimeout(Duration.ofMinutes(30)) // Including retries + .setHeartbeatTimeout(Duration.ofMinutes(2)) // Between heartbeats + .build(); +``` + +## Workflow Failure + +**IMPORTANT:** Only `ApplicationFailure` causes a workflow to fail. Any other exception thrown from workflow code causes the workflow task to retry indefinitely, not the workflow itself. 
+ +```java +import io.temporal.failure.ApplicationFailure; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + if (someCondition) { + throw ApplicationFailure.newFailure( + "Cannot process order", + "BusinessError" + ); + } + return "success"; + } +} +``` + +To allow other exception types to fail the workflow instead of causing infinite task retries, see `references/java/advanced-features.md` for configuring `setFailWorkflowExceptionTypes()`. + +Use `Workflow.wrap()` to rethrow checked exceptions as unchecked: + +```java +try { + return someCall(); +} catch (Exception e) { + throw Workflow.wrap(e); +} +``` + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Configure appropriate retry policies +4. Log errors before rethrowing +5. Catch `ActivityFailure` (not `ApplicationFailure`) for activity failures in workflows +6. Design code to be idempotent for safe retries (see more at `references/core/patterns.md`) +7. Use `ApplicationFailure.newFailure()` to fail workflows — other exceptions cause infinite task retries diff --git a/references/java/gotchas.md b/references/java/gotchas.md new file mode 100644 index 0000000..4943f0d --- /dev/null +++ b/references/java/gotchas.md @@ -0,0 +1,179 @@ +# Java Gotchas + +Java-specific mistakes and anti-patterns. See also [Common Gotchas](../core/gotchas.md) for language-agnostic concepts. + +## Non-Deterministic Operations + +**Critical: The Java SDK has NO sandbox.** Unlike Python (which uses a sandbox) or TypeScript (which uses V8 isolation), the Java SDK relies entirely on developer conventions. Non-deterministic calls silently succeed during initial execution but cause `NonDeterministicException` on replay. 
+ +Forbidden in workflow code — use the Temporal `Workflow.*` equivalents instead: + +- `Thread.sleep` → `Workflow.sleep` +- `UUID.randomUUID` → `Workflow.randomUUID` +- `Math.random` → `Workflow.newRandom` +- `System.currentTimeMillis` → `Workflow.currentTimeMillis` +- `new Thread` → `Async.function` +- `synchronized` blocks → unnecessary (workflow code runs under a global lock) + +See `references/java/determinism.md` for the full table of forbidden operations, safe alternatives, and detailed examples. + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/java/error-handling.md` to understand how to classify errors. + +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```java +// BAD - No heartbeat, can't detect stuck activities +@Override +public void processLargeFile(String path) { + for (String chunk : readChunks(path)) { + process(chunk); // Takes hours, no heartbeat + } +} + +// GOOD - Regular heartbeats with progress +@Override +public void processLargeFile(String path) { + int i = 0; + for (String chunk : readChunks(path)) { + Activity.getExecutionContext().heartbeat("Processing chunk " + i++); + process(chunk); + } +} +``` + +### Heartbeat Timeout Too Short + +```java +// BAD - Heartbeat timeout shorter than processing time +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(30)) + .setHeartbeatTimeout(Duration.ofSeconds(10)) // Too short! + .build(); + +// GOOD - Heartbeat timeout allows for processing variance +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(30)) + .setHeartbeatTimeout(Duration.ofMinutes(2)) + .build(); +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. 
+ +## Cancellation + +### Not Handling Workflow Cancellation + +```java +// BAD - Cleanup doesn't run on cancellation +public class BadWorkflow implements MyWorkflow { + @Override + public void run() { + activities.acquireResource(); + activities.doWork(); + activities.releaseResource(); // Never runs if cancelled! + } +} +``` + +```java +// GOOD - Use try/finally with CancellationScope.nonCancellable +import io.temporal.workflow.CancellationScope; +import io.temporal.workflow.Workflow; + +public class GoodWorkflow implements MyWorkflow { + @Override + public void run() { + activities.acquireResource(); + try { + activities.doWork(); + } finally { + CancellationScope scope = Workflow.newDetachedCancellationScope( + () -> activities.releaseResource() + ); + scope.run(); + } + } +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: + +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Catching CanceledFailure** - Thrown when heartbeat detects cancellation + +```java +// BAD - Activity ignores cancellation +@Override +public void longActivity() { + doExpensiveWork(); // Runs to completion even if cancelled +} +``` + +```java +// GOOD - Heartbeat and catch cancellation +import io.temporal.activity.Activity; +import io.temporal.failure.CanceledFailure; + +@Override +public void longActivity() { + try { + for (int i = 0; i < items.size(); i++) { + Activity.getExecutionContext().heartbeat(i); + process(items.get(i)); + } + } catch (CanceledFailure e) { + cleanup(); + throw e; + } +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/java/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code, and should be considered in addition to standard testing. 
This is especially critical in Java since there is no sandbox. Please see `references/java/testing.md` for more info. + +## Timers and Sleep + +### Using Thread.sleep + +```java +// BAD - Thread.sleep is not deterministic during replay +public class BadWorkflow implements MyWorkflow { + @Override + public void run() { + try { + Thread.sleep(60000); // Non-deterministic! + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } +} +``` + +```java +// GOOD - Use Workflow.sleep for deterministic timers +import io.temporal.workflow.Workflow; +import java.time.Duration; + +public class GoodWorkflow implements MyWorkflow { + @Override + public void run() { + Workflow.sleep(Duration.ofSeconds(60)); // Deterministic + } +} +``` + +**Why this matters:** `Thread.sleep` uses the system clock, which differs between original execution and replay. `Workflow.sleep` creates a durable timer in the event history, ensuring consistent behavior during replay. Unlike Python and TypeScript, there is no sandbox to catch this — the call silently succeeds and only fails on replay. diff --git a/references/java/java.md b/references/java/java.md new file mode 100644 index 0000000..2adfc6d --- /dev/null +++ b/references/java/java.md @@ -0,0 +1,266 @@ +# Temporal Java SDK Reference + +## Overview + +The Temporal Java SDK (`io.temporal:temporal-sdk`) uses an interface + implementation pattern for both Workflows and Activities. Java 8+ required; Java 21+ strongly recommended for virtual thread support. 
+ +## Quick Start + +**Add Dependencies:** + +Gradle: + +```groovy +implementation 'io.temporal:temporal-sdk:1.+' +``` + +Maven: + +```xml + + io.temporal + temporal-sdk + [1.0,) + +``` + +**GreetActivities.java** - Activity interface: + +```java +package greetingapp; + +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; + +@ActivityInterface +public interface GreetActivities { + + @ActivityMethod + String greet(String name); +} +``` + +**GreetActivitiesImpl.java** - Activity implementation: + +```java +package greetingapp; + +public class GreetActivitiesImpl implements GreetActivities { + + @Override + public String greet(String name) { + return "Hello, " + name + "!"; + } +} +``` + +**GreetingWorkflow.java** - Workflow interface: + +```java +package greetingapp; + +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +@WorkflowInterface +public interface GreetingWorkflow { + + @WorkflowMethod + String greet(String name); +} +``` + +**GreetingWorkflowImpl.java** - Workflow implementation: + +```java +package greetingapp; + +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +public class GreetingWorkflowImpl implements GreetingWorkflow { + + private final GreetActivities activities = Workflow.newActivityStub( + GreetActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build() + ); + + @Override + public String greet(String name) { + return activities.greet(name); + } +} +``` + +**GreetingWorker.java** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): + +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.serviceclient.WorkflowServiceStubs; +import io.temporal.worker.Worker; +import io.temporal.worker.WorkerFactory; + +public class GreetingWorker { + + public static void main(String[] args) { 
+ // Create gRPC stubs for local dev server (localhost:7233) + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + + // Create client + WorkflowClient client = WorkflowClient.newInstance(service); + + // Create factory and worker + WorkerFactory factory = WorkerFactory.newInstance(client); + Worker worker = factory.newWorker("greeting-queue"); + + // Register workflow and activity implementations + worker.registerWorkflowImplementationTypes(GreetingWorkflowImpl.class); + worker.registerActivitiesImplementations(new GreetActivitiesImpl()); + + // Start polling + factory.start(); + } +} +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `GreetingWorker.main()` (e.g., `./gradlew run` or `mvn compile exec:java -Dexec.mainClass="greetingapp.GreetingWorker"`). + +**Starter.java** - Start a workflow execution: + +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import io.temporal.serviceclient.WorkflowServiceStubs; + +import java.util.UUID; + +public class Starter { + + public static void main(String[] args) { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + + GreetingWorkflow workflow = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") + .build() + ); + + String result = workflow.greet("my name"); + System.out.println("Result: " + result); + } +} +``` + +**Run the workflow:** Run `Starter.main()`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition + +- Annotate interface with `@WorkflowInterface` +- Put any state initialization logic in the workflow constructor to guarantee that it happens before signals/updates arrive. 
If your state initialization logic requires the workflow parameters, then add the `@WorkflowInit` annotation and parameters to your constructor. +- Annotate entry point method with `@WorkflowMethod` (exactly one per interface) +- Use `@SignalMethod` for signal handlers +- Use `@QueryMethod` for query handlers +- Use `@UpdateMethod` for update handlers +- Implementation class implements the interface + +### Activity Definition + +- Annotate interface with `@ActivityInterface` +- Optionally annotate methods with `@ActivityMethod` (for custom names) +- Implementation class can throw any exception +- Call from workflow via `Workflow.newActivityStub()` + +### Worker Setup + +- `WorkflowServiceStubs` -- gRPC connection to Temporal Server +- `WorkflowClient` -- client used by worker to communicate with server +- `WorkerFactory` -- creates Worker instances +- `Worker` -- polls a single Task Queue; register workflows and activities on it +- Call `factory.start()` to begin polling + +For Spring Boot apps, `temporal-spring-boot-starter` handles all of the above automatically via auto-configuration. See `references/java/spring-boot.md`. + +## File Organization Best Practice + +**Keep Workflow and Activity definitions in separate files.** Separating them is good practice for clarity and maintainability. + +``` +greetingapp/ +├── GreetActivities.java # Activity interface +├── GreetActivitiesImpl.java # Activity implementation +├── GreetingWorkflow.java # Workflow interface +├── GreetingWorkflowImpl.java # Workflow implementation +├── GreetingWorker.java # Worker setup +└── Starter.java # Client code to start workflows +``` + +## Determinism Rules + +The Java SDK has **no sandbox**. The developer is fully responsible for writing deterministic workflow code. All non-deterministic operations must happen in Activities. 
+ +**Do not use in workflow code:** + +- `Thread` / `new Thread()` -- use `Workflow.newTimer()` or `Async.function()` +- `synchronized` / `Lock` -- workflow code is single-threaded +- `UUID.randomUUID()` -- use `Workflow.randomUUID()` +- `Math.random()` -- use `Workflow.newRandom()` +- `System.currentTimeMillis()` / `Instant.now()` -- use `Workflow.currentTimeMillis()` +- File I/O, network calls, database access -- use Activities +- `Thread.sleep()` -- use `Workflow.sleep()` +- Mutable static fields -- workflow instances must not share state + +**Use `Workflow.*` APIs instead:** + +- `Workflow.sleep()` for timers +- `Workflow.currentTimeMillis()` for current time +- `Workflow.randomUUID()` for UUIDs +- `Workflow.newRandom()` for random numbers +- `Workflow.getLogger()` for replay-safe logging + +See `references/core/determinism.md` for detailed determinism rules. + +## Common Pitfalls + +1. **Non-deterministic code in workflows** - Use `Workflow.*` APIs instead of standard Java APIs; perform I/O in Activities +2. **Forgetting `@WorkflowInterface` or `@ActivityInterface`** - Annotations are required on interfaces for registration +3. **Multiple `@WorkflowMethod` on one interface** - Only one `@WorkflowMethod` is allowed per `@WorkflowInterface` +4. **Using `Thread.sleep()` in workflows** - Use `Workflow.sleep()` for deterministic timers +5. **Forgetting to heartbeat** - Long-running activities need `Activity.getExecutionContext().heartbeat()` +6. **Using `System.out.println()` in workflows** - Use `Workflow.getLogger()` for replay-safe logging +7. **Not registering activities as instances** - `registerActivitiesImplementations()` takes object instances (`new MyActivitiesImpl()`), not classes +8. **Blocking the workflow thread** - Never perform I/O or long computations in workflow code; use Activities +9. 
**Sharing mutable state between workflow instances** - Each workflow execution must be independent + +## Writing Tests + +See `references/java/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files + +- **`references/java/spring-boot.md`** - Spring Boot integration: auto-discovery, dependency injection, worker lifecycle, testing +- **`references/java/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/java/determinism.md`** - Determinism rules and safe alternatives for Java +- **`references/java/gotchas.md`** - Java-specific mistakes and anti-patterns +- **`references/java/error-handling.md`** - ApplicationFailure, retry policies, non-retryable errors +- **`references/java/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/java/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/java/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/java/data-handling.md`** - Data converters, Jackson, payload encryption +- **`references/java/versioning.md`** - Patching API, workflow type versioning, Worker Versioning diff --git a/references/java/observability.md b/references/java/observability.md new file mode 100644 index 0000000..338fcb7 --- /dev/null +++ b/references/java/observability.md @@ -0,0 +1,135 @@ +# Java SDK Observability + +## Overview + +The Java SDK provides observability through replay-safe logging, Micrometer-based metrics, and visibility (Search Attributes). 
+ +## Logging + +### Workflow Logging (Replay-Safe) + +Use `Workflow.getLogger()` for replay-safe logging that suppresses duplicate messages during replay: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + private static final Logger logger = Workflow.getLogger(OrderWorkflowImpl.class); + + @Override + public String run(Order order) { + logger.info("Workflow started for order {}", order.getId()); + + String result = Workflow.newActivityStub(OrderActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build() + ).processOrder(order); + + logger.info("Activity completed with result {}", result); + return result; + } +} +``` + +The workflow logger automatically: + +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) +- Uses SLF4J under the hood + +### Activity Logging + +Use standard SLF4J loggers in activities. Activity context is available via `Activity.getExecutionContext()`: + +```java +public class OrderActivitiesImpl implements OrderActivities { + private static final Logger logger = + LoggerFactory.getLogger(OrderActivitiesImpl.class); + + @Override + public String processOrder(Order order) { + logger.info("Processing order {}", order.getId()); + + // Access activity context for metadata + ActivityExecutionContext ctx = Activity.getExecutionContext(); + logger.info("Activity ID: {}, attempt: {}", + ctx.getInfo().getActivityId(), + ctx.getInfo().getAttempt()); + + // Perform work... + logger.info("Order processed successfully"); + return "completed"; + } +} +``` + +## Customizing the Logger + +The Java SDK uses SLF4J. Configure your preferred backend: + +### Logback (logback.xml) + +```xml + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + +``` + +Log4j2 is also supported as an SLF4J backend with equivalent configuration. 
+ +## Metrics + +### Micrometer with Prometheus + +The Java SDK uses Micrometer for metrics collection. Configure with `MicrometerClientStatsReporter`: + +```java +import io.micrometer.prometheus.PrometheusConfig; +import io.micrometer.prometheus.PrometheusMeterRegistry; +import io.temporal.common.reporter.MicrometerClientStatsReporter; +import com.uber.m3.tally.RootScopeBuilder; +import com.uber.m3.tally.Scope; +import com.uber.m3.util.Duration; + +// Set up Prometheus registry +PrometheusMeterRegistry registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + +// Create the Temporal metrics scope +Scope scope = new RootScopeBuilder() + .reporter(new MicrometerClientStatsReporter(registry)) + .reportEvery(Duration.ofSeconds(10)); + +// Apply to service stubs +WorkflowServiceStubs service = WorkflowServiceStubs.newServiceStubs( + WorkflowServiceStubsOptions.newBuilder() + .setMetricsScope(scope) + .build() +); + +// Expose Prometheus endpoint (e.g., via HTTP server) +// registry.scrape() returns the metrics in Prometheus format +``` + +### Key SDK Metrics + +- `temporal_request` — Client requests to server +- `temporal_workflow_task_execution_latency` — Workflow task processing time +- `temporal_activity_execution_latency` — Activity execution time +- `temporal_workflow_task_replay_latency` — Replay duration + +## Best Practices + +1. Use `Workflow.getLogger()` in workflows, standard SLF4J loggers in activities +2. Do not use `System.out.println()` in workflows — it produces duplicate output on replay +3. Configure Micrometer metrics for production monitoring +4. 
Use Search Attributes for business-level visibility — see `references/java/data-handling.md` diff --git a/references/java/patterns.md b/references/java/patterns.md new file mode 100644 index 0000000..e6428a9 --- /dev/null +++ b/references/java/patterns.md @@ -0,0 +1,511 @@ +# Java SDK Patterns + +## Signals + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(); + + @SignalMethod + void approve(); + + @SignalMethod + void addItem(String item); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + private boolean approved = false; + private final List items = new ArrayList<>(); + + @Override + public void approve() { + this.approved = true; + } + + @Override + public void addItem(String item) { + this.items.add(item); + } + + @Override + public String run() { + Workflow.await(() -> this.approved); + return "Processed " + this.items.size() + " items"; + } +} +``` + +### Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```java +public class DynamicSignalWorkflowImpl implements DynamicSignalWorkflow { + private final Map> signals = new HashMap<>(); + + @Override + public String run() { + Workflow.registerListener( + (DynamicSignalHandler) (signalName, encodedArgs) -> { + signals.computeIfAbsent(signalName, k -> new ArrayList<>()) + .add(encodedArgs.get(0, String.class)); + }); + // ... workflow logic ... + } +} +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects. 
+ +```java +@WorkflowInterface +public interface StatusWorkflow { + @WorkflowMethod + String run(); + + @QueryMethod + String getStatus(); + + @QueryMethod + int getProgress(); +} + +public class StatusWorkflowImpl implements StatusWorkflow { + private String status = "pending"; + private int progress = 0; + + @Override + public String getStatus() { + return this.status; + } + + @Override + public int getProgress() { + return this.progress; + } + + @Override + public String run() { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(1)) + .build()); + + this.status = "running"; + for (int i = 0; i < 100; i++) { + this.progress = i; + activities.processItem(i); + } + this.status = "completed"; + return "done"; + } +} +``` + +### Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. 
+ +```java +Workflow.registerListener( + (DynamicQueryHandler) (queryName, encodedArgs) -> { + if (queryName.equals("getField")) { + String fieldName = encodedArgs.get(0, String.class); + return fields.get(fieldName); + } + return null; + }); +``` + +## Updates + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(); + + @UpdateMethod + int addItem(String item); + + @UpdateValidatorMethod(updateName = "addItem") + void validateAddItem(String item); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + private final List items = new ArrayList<>(); + + @Override + public int addItem(String item) { + this.items.add(item); + return this.items.size(); // Returns new count to caller + } + + @Override + public void validateAddItem(String item) { + if (item == null || item.isEmpty()) { + throw new IllegalArgumentException("Item cannot be empty"); + } + if (this.items.size() >= 100) { + throw new IllegalArgumentException("Order is full"); + } + } + + // ... run() ... +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Throw an exception to reject the update; return normally to accept. 
+ +## Child Workflows + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public List run(List orders) { + List results = new ArrayList<>(); + for (Order order : orders) { + ProcessOrderWorkflow child = Workflow.newChildWorkflowStub( + ProcessOrderWorkflow.class, + ChildWorkflowOptions.newBuilder() + .setWorkflowId("order-" + order.getId()) + .build()); + results.add(child.run(order)); + } + return results; + } +} +``` + +## Child Workflow Options + +```java +ChildWorkflowOptions options = ChildWorkflowOptions.newBuilder() + .setWorkflowId("child-workflow-id") + // Control what happens to child when parent closes + .setParentClosePolicy(ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON) + // Control what happens to child when parent is cancelled + .setCancellationType(ChildWorkflowCancellationType.WAIT_CANCELLATION_COMPLETED) + .setWorkflowExecutionTimeout(Duration.ofMinutes(10)) + .build(); + +ProcessOrderWorkflow child = Workflow.newChildWorkflowStub( + ProcessOrderWorkflow.class, options); +``` + +## Handles to External Workflows + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public void run(String targetWorkflowId) { + // Get handle to external workflow + TargetWorkflow external = Workflow.newExternalWorkflowStub( + TargetWorkflow.class, targetWorkflowId); + + // Signal the external workflow + external.dataReady(dataPayload); + + // Or cancel it using untyped stub + ExternalWorkflowStub untypedExternal = + Workflow.newUntypedExternalWorkflowStub(targetWorkflowId); + untypedExternal.cancel(); + } +} +``` + +## Parallel Execution + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public List run(List items) { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + + // Execute activities in parallel + List> promises = new ArrayList<>(); + for (String item : items) { + 
promises.add(Async.function(activities::processItem, item)); + } + + // Wait for all to complete + Promise.allOf(promises).get(); + + // Collect results + List results = new ArrayList<>(); + for (Promise promise : promises) { + results.add(promise.get()); + } + return results; + } +} +``` + +## Continue-as-New + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run(WorkflowState state) { + while (true) { + state = processBatch(state); + + if (state.isComplete()) { + return "done"; + } + + // Continue with fresh history before hitting limits + if (Workflow.getInfo().isContinueAsNewSuggested()) { + Workflow.continueAsNew(state); + } + } + } +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent — they may be retried (as with ALL activities). + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run(Order order) { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + + List compensations = new ArrayList<>(); + + try { + // Note - we save the compensation BEFORE running the activity, + // because the following could happen: + // 1. reserveInventory starts running + // 2. it does successfully reserve inventory + // 3. but then fails for some other reason (timeout, reporting metrics, etc.) + // 4. in that case, the activity would have failed, but the effect still happened + // So, the compensation needs to handle both reserved and unreserved states. 
+ compensations.add(() -> activities.releaseInventoryIfReserved(order)); + activities.reserveInventory(order); + + compensations.add(() -> activities.refundPaymentIfCharged(order)); + activities.chargePayment(order); + + activities.shipOrder(order); + + return "Order completed"; + + } catch (Exception e) { + Workflow.getLogger(MyWorkflowImpl.class) + .error("Order failed, running compensations", e); + // Use a detached cancellation scope so compensations run even if + // the workflow itself was cancelled. + CancellationScope compensationScope = Workflow.newDetachedCancellationScope(() -> { + Collections.reverse(compensations); + for (Runnable compensate : compensations) { + try { + compensate.run(); + } catch (Exception compErr) { + Workflow.getLogger(MyWorkflowImpl.class) + .error("Compensation failed", compErr); + } + } + }); + compensationScope.run(); + throw Workflow.wrap(e); + } + } +} +``` + +## Cancellation Scopes + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + try { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofHours(1)) + .build()); + + activities.longRunningActivity(); + return "completed"; + + } catch (CanceledFailure e) { + // Workflow was cancelled - perform cleanup + Workflow.getLogger(MyWorkflowImpl.class) + .info("Workflow cancelled, running cleanup"); + + // Use a detached cancellation scope so cleanup activities still run + // even though the workflow's own scope has already been cancelled + CancellationScope cleanupScope = Workflow.newDetachedCancellationScope( + () -> { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + activities.cleanupActivity(); + }); + cleanupScope.run(); + throw e; // Re-throw to mark workflow as cancelled + } + } +} +``` + +Timeout scope: + +```java +CancellationScope timeoutScope = Workflow.newCancellationScope( + () -> { + // 
This scope will be cancelled after 30 minutes + activities.longRunningActivity(); + }); +// Arm the timer BEFORE running the scope: run() blocks until the scope's +// body finishes, so a timer created after run() could never cancel it. +Workflow.newTimer(Duration.ofMinutes(30)).thenApply(r -> { + timeoutScope.cancel(); + return null; +}); +timeoutScope.run(); +``` + +## Wait Condition with Timeout + +```java +public class MyWorkflowImpl implements MyWorkflow { + private boolean approved = false; + + @Override + public String run() { + // Wait for approval with 24-hour timeout + boolean received = Workflow.await(Duration.ofHours(24), () -> this.approved); + if (received) { + return "approved"; + } + return "auto-rejected due to timeout"; + } +} +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. However, making handlers non-async sometimes requires workarounds that add complexity. + +When handlers do run async operations, call `Workflow.await(() -> Workflow.isEveryHandlerFinished())` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + // ... main workflow logic ... 
+ + // Before exiting, wait for all handlers to finish + Workflow.await(() -> Workflow.isEveryHandlerFinished()); + return "done"; + } +} +``` + +## Activity Heartbeat Details + +### WHY: + +- **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** — Heartbeat details persist across retries + +### WHEN: + +- **Cancellable activities** — Any activity that should respond to cancellation +- **Long-running activities** — Track progress for resumability +- **Checkpointing** — Save progress periodically + +```java +@ActivityInterface +public interface MyActivities { + @ActivityMethod + String processLargeFile(String filePath); +} + +public class MyActivitiesImpl implements MyActivities { + @Override + public String processLargeFile(String filePath) { + ActivityExecutionContext ctx = Activity.getExecutionContext(); + + // Get heartbeat details from previous attempt (if any) + Optional lastLine = ctx.getHeartbeatDetails(Integer.class); + int startLine = lastLine.orElse(0); + + try { + List lines = readFile(filePath); + for (int i = startLine; i < lines.size(); i++) { + processLine(lines.get(i)); + + // Heartbeat with progress + // If cancelled, heartbeat() throws CanceledFailure + ctx.heartbeat(i + 1); + } + return "completed"; + } catch (ActivityCompletionException e) { + // CanceledFailure extends ActivityCompletionException + cleanup(); + throw e; + } + } +} +``` + +Set `heartbeatTimeout` in `ActivityOptions` to enable heartbeat-based failure detection: + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofHours(1)) + .setHeartbeatTimeout(Duration.ofSeconds(30)) + .build(); +``` + +## Timers + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + Workflow.sleep(Duration.ofHours(1)); + + return "Timer fired"; + } +} +``` + +## Local Activities + 
+**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + MyActivities localActivities = Workflow.newLocalActivityStub( + MyActivities.class, + LocalActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(5)) + .build()); + + String result = localActivities.quickLookup("key"); + return result; + } +} +``` diff --git a/references/java/spring-boot.md b/references/java/spring-boot.md new file mode 100644 index 0000000..ceaaaec --- /dev/null +++ b/references/java/spring-boot.md @@ -0,0 +1,287 @@ +# Temporal Spring Boot Integration + +## Overview + +`temporal-spring-boot-starter` auto-configures workers, registers workflow/activity implementations, and exposes `WorkflowClient` as a Spring bean. This eliminates the manual `WorkflowServiceStubs` → `WorkflowClient` → `WorkerFactory` setup required without Spring. + +## Dependency Setup + +Maven: +```xml + + io.temporal + temporal-spring-boot-starter + [1.0,) + +``` + +Gradle: +```groovy +implementation 'io.temporal:temporal-spring-boot-starter:1.+' +``` + +The starter transitively includes `temporal-sdk` and the autoconfigure module. You can declare both `temporal-sdk` and `temporal-spring-boot-starter` explicitly, but the starter alone is sufficient. 
+ +## Minimal Configuration + +`application.properties`: +```properties +spring.temporal.connection.target=local +spring.temporal.start-workers=true +spring.temporal.workersAutoDiscovery.packages=greetingapp +``` + +`application.yml` equivalent: +```yaml +spring: + temporal: + connection: + target: local # shorthand for localhost:7233 + start-workers: true + workersAutoDiscovery: + packages: + - greetingapp + workers: + - task-queue: greeting-queue + name: greeting-worker +``` + +For self-hosted Temporal, replace `local` with the server address: +```properties +spring.temporal.connection.target=temporal.internal:7233 +``` + +## Interface Design + Spring Annotation Layering + +The key concept: Temporal SDK annotations go on **interfaces**, Spring Boot autoconfigure annotations go on **implementation classes**. This is identical to non-Spring usage at the interface level. + +### Workflow Interface (unchanged from non-Spring) +```java +package greetingapp; + +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +@WorkflowInterface +public interface GreetingWorkflow { + @WorkflowMethod + String greet(String name); +} +``` + +### Workflow Implementation +```java +package greetingapp; + +import io.temporal.activity.ActivityOptions; +import io.temporal.spring.boot.WorkflowImpl; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +// @WorkflowImpl replaces manual worker.registerWorkflowImplementationTypes() +// No @Component — workflows are NOT Spring beans; Temporal creates a new instance per execution +@WorkflowImpl(taskQueues = "greeting-queue") +public class GreetingWorkflowImpl implements GreetingWorkflow { + + // Activity stubs created via Workflow.newActivityStub() as usual + private final GreetActivities activities = Workflow.newActivityStub( + GreetActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setTaskQueue("greeting-queue") + .build() + ); + + 
@Override + public String greet(String name) { + return activities.greet(name); + } +} +``` + +### Activity Interface (unchanged from non-Spring) +```java +package greetingapp; + +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; + +@ActivityInterface +public interface GreetActivities { + @ActivityMethod + String greet(String name); +} +``` + +### Activity Implementation +```java +package greetingapp; + +import io.temporal.spring.boot.ActivityImpl; +import org.springframework.stereotype.Component; + +// @Component makes this a Spring bean — dependencies can be injected normally +// @ActivityImpl replaces manual worker.registerActivitiesImplementations() +@Component +@ActivityImpl(taskQueues = "greeting-queue") +public class GreetActivitiesImpl implements GreetActivities { + + private final GreetingService greetingService; + + // Constructor injection works because this is a Spring bean + public GreetActivitiesImpl(GreetingService greetingService) { + this.greetingService = greetingService; + } + + @Override + public String greet(String name) { + return greetingService.composeGreeting(name); + } +} +``` + +## Auto-Discovery + +Auto-discovery is how the autoconfigure finds and registers implementations without explicit configuration. It requires **both** of the following: + +1. `@WorkflowImpl(taskQueues = "...")` or `@ActivityImpl(taskQueues = "...")` on the implementation class +2. `spring.temporal.workersAutoDiscovery.packages` pointing to a package that contains those classes + +Missing either one results in silent non-registration — no error, nothing polls the task queue. + +The `taskQueues` attribute routes implementations to the right worker when multiple task queues exist. A worker configured with task queue `"greeting-queue"` only picks up implementations annotated with `taskQueues = "greeting-queue"`. + +**Important:** `@ActivityImpl(taskQueues = "greeting-queue")` only registers the activity bean with that worker. 
It does not route individual activity task executions. Inside the workflow, `ActivityOptions.setTaskQueue("greeting-queue")` must also be set on the activity stub to route activity tasks to the correct queue. + +### Comparison: Auto-Discovery vs Explicit YAML Registration + +Auto-discovery via annotations: +```properties +spring.temporal.workersAutoDiscovery.packages=greetingapp +``` +```java +@Component +@ActivityImpl(taskQueues = "greeting-queue") +public class GreetActivitiesImpl implements GreetActivities { ... } +``` + +Explicit YAML registration (alternative): +```yaml +spring: + temporal: + workers: + - task-queue: greeting-queue + name: greeting-worker + activity-beans: + - greetActivitiesImpl + workflow-classes: + - greetingapp.GreetingWorkflowImpl +``` + +Use auto-discovery when implementations are colocated in a single package tree (most apps). Use explicit YAML when you need fine-grained control, want to exclude specific classes, or are registering beans defined elsewhere. + +## WorkflowClient Injection + +`WorkflowClient` is automatically registered as a Spring bean by the autoconfigure. 
Inject it into any `@Service` or `@RestController`: + +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import org.springframework.stereotype.Service; + +import java.util.UUID; + +@Service +public class GreetingStarter { + + private final WorkflowClient client; + + public GreetingStarter(WorkflowClient client) { + this.client = client; + } + + public String startGreeting(String name) { + var stub = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") // must match the worker's task queue + .build() + ); + // Synchronous — blocks until workflow completes + return stub.greet(name); + } + + public void startGreetingAsync(String name) { + var stub = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") + .build() + ); + // Fire-and-forget — returns immediately + WorkflowClient.start(stub::greet, name); + } +} +``` + +## Worker Lifecycle + +Workers start on `ApplicationReadyEvent` — after the full Spring context is initialized (DB migrations run, all beans wired). This means activity beans are fully ready before any workflow tasks are processed. + +To run a client-only app (one that submits workflows but does not execute them): +```properties +spring.temporal.start-workers=false +``` + +## Testing Strategies + +See `references/java/testing.md` for full details on both approaches. 
+ +**Spring integration tests** — uses an embedded Temporal test server wired into the Spring context: +```properties +# src/test/resources/application-test.properties +spring.temporal.test-server.enabled=true +``` +```java +@SpringBootTest +@ActiveProfiles("test") +class GreetingIntegrationTest { + @Autowired WorkflowClient client; // points at the embedded test server + + @Test + void testWorkflowThroughSpringContext() { ... } +} +``` + +**Unit tests without Spring** — use `TestWorkflowEnvironment` or `TestWorkflowExtension` directly. No Spring context, faster startup, full time-skipping support: +```java +@RegisterExtension +static final TestWorkflowExtension testWorkflow = TestWorkflowExtension.newBuilder() + .setWorkflowTypes(GreetingWorkflowImpl.class) + .setDoNotStart(true) + .build(); +``` + +Do not mix approaches in the same test class — choose one or the other. + +## Spring-Specific Gotchas + +**Workflow impls must not have `@Component`** +Temporal creates a new workflow instance per execution via `beanFactory.createBean()` (not `getBean()`). Adding `@Component` means Spring also registers it as a singleton bean, which can cause confusing lifecycle behavior. Leave `@WorkflowImpl` classes as plain classes with no Spring annotations. + +**Activity beans are Spring singletons** +Temporal may invoke activity methods concurrently across many workflow executions. Keep activity implementations stateless — no mutable instance fields. Use injected services (which are themselves stateless or thread-safe) for all state. + +**`@WorkflowImpl` / `@ActivityImpl` without `workersAutoDiscovery.packages` → silently ignored** +This is the most common setup mistake. If auto-discovery packages are not configured, the annotations are never scanned and nothing registers with the worker. Verify with the Temporal UI that the worker is registering the expected workflow/activity types. 
+ +**`ActivityOptions.setTaskQueue(...)` is required on activity stubs** +`@ActivityImpl(taskQueues = "greeting-queue")` registers the activity bean with the worker — it does not set the default task queue for activity execution. Inside workflow code, always set `.setTaskQueue(...)` in `ActivityOptions` to explicitly route activity tasks to the correct worker. + +**Multiple `DataConverter` beans** +If you define more than one `DataConverter` bean (e.g., a custom JSON converter and a default), the autoconfigure fails with an ambiguity error. Name one of them `mainDataConverter` to designate it as the primary. diff --git a/references/java/testing.md b/references/java/testing.md new file mode 100644 index 0000000..b46db29 --- /dev/null +++ b/references/java/testing.md @@ -0,0 +1,255 @@ +# Java SDK Testing + +## Overview + +You test Temporal Java Workflows using `TestWorkflowEnvironment` (manual setup) or `TestWorkflowExtension` (JUnit 5). Activity mocking uses Mockito. The SDK provides `WorkflowReplayer` for replay-based compatibility testing. 
+ +## Workflow Test Environment + +```java +import io.temporal.testing.TestWorkflowExtension; +import io.temporal.testing.TestWorkflowEnvironment; +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import io.temporal.worker.Worker; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class MyWorkflowTest { + + @RegisterExtension + public static final TestWorkflowExtension testWorkflowExtension = + TestWorkflowExtension.newBuilder() + .setWorkflowTypes(MyWorkflowImpl.class) + .setDoNotStart(true) + .build(); + + @Test + void testWorkflow(TestWorkflowEnvironment env, Worker worker, WorkflowClient client) { + worker.registerActivitiesImplementations(new MyActivitiesImpl()); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + String result = workflow.run("input"); + assertEquals("expected", result); + } +} +``` + +For manual lifecycle control (e.g., JUnit 4 or custom setups), use `TestWorkflowEnvironment` directly with `@BeforeEach`/`@AfterEach`. 
+ +## Mocking Activities + +```java +import static org.mockito.Mockito.*; + +@Test +void testWithMockedActivities( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + // withoutAnnotations() prevents Mockito from copying Temporal annotations + MyActivities activities = mock(MyActivities.class, withSettings().withoutAnnotations()); + when(activities.composeGreeting("Hello", "World")).thenReturn("mocked result"); + + worker.registerActivitiesImplementations(activities); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + String result = workflow.run("input"); + assertEquals("mocked result", result); + verify(activities).composeGreeting("Hello", "World"); +} +``` + +## Testing Signals and Queries + +```java +@Test +void testSignalsAndQueries( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + worker.registerActivitiesImplementations(new MyActivitiesImpl()); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + // Start workflow asynchronously + WorkflowClient.start(workflow::run, "input"); + + // Send signal + workflow.mySignal("data"); + + // Query state + String status = workflow.getStatus(); + assertEquals("expected", status); + + // Wait for completion + String result = WorkflowStub.fromTyped(workflow).getResult(String.class); +} +``` + +## Testing Failure Cases + +```java +import io.temporal.client.WorkflowException; + +@Test +void testActivityFailure( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + MyActivities activities = mock(MyActivities.class, withSettings().withoutAnnotations()); + when(activities.unreliableAction(anyString())) + .thenThrow(new RuntimeException("Simulated failure")); + + worker.registerActivitiesImplementations(activities); + 
env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + assertThrows(WorkflowException.class, () -> workflow.run("input")); +} +``` + +## Workflow Replay Testing + +```java +import io.temporal.testing.WorkflowReplayer; + +@Test +void testReplayFromHistory() throws Exception { + WorkflowReplayer.replayWorkflowExecutionFromResource( + "my-workflow-history.json", + MyWorkflowImpl.class); +} +``` + +Replay from a `WorkflowExecutionHistory` object: + +```java +import io.temporal.common.WorkflowExecutionHistory; + +@Test +void testReplayFromJsonString() throws Exception { + String historyJson = new String(Files.readAllBytes(Paths.get("history.json"))); + WorkflowReplayer.replayWorkflowExecution( + WorkflowExecutionHistory.fromJson(historyJson), + MyWorkflowImpl.class); +} +``` + +## Activity Testing + +Activity implementations are plain Java classes. Test them directly: + +```java +@Test +void testActivity() { + MyActivitiesImpl activities = new MyActivitiesImpl(); + String result = activities.composeGreeting("Hello", "World"); + assertEquals("Hello World", result); +} +``` + +For activities that use `Activity.getExecutionContext()` or heartbeating, use `TestActivityEnvironment` to provide the activity context. + +## Best Practices + +1. Use `TestWorkflowExtension` with JUnit 5 for concise test setup +2. Always use `withSettings().withoutAnnotations()` when mocking activity interfaces with Mockito +3. Mock external dependencies in activities, not in workflows +4. Test replay compatibility when changing workflow code (see `references/java/determinism.md`) +5. Test signal/query handlers explicitly +6. Use unique task queues per test to avoid conflicts (handled automatically by `TestWorkflowExtension`) + +## Spring Boot Testing + +Two strategies — choose one per test class, do not mix them. 
+ +### Embedded test server in Spring context + +For full integration tests that exercise the Spring context (DB, beans, config): + +```properties +# src/test/resources/application-test.properties +spring.temporal.test-server.enabled=true +``` + +```java +@SpringBootTest +@ActiveProfiles("test") +class TeeTimeMonitorIntegrationTest { + + @Autowired + WorkflowClient client; // auto-configured to point at the embedded test server + + @Test + void testWorkflow() { + var stub = client.newWorkflowStub( + TeeTimeMonitorWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId("test-" + UUID.randomUUID()) + .setTaskQueue("golfnow") + .build() + ); + var result = stub.monitorTeeTimes(new TTMonitorRequest(...)); + assertNotNull(result); + } +} +``` + +The embedded server does not support time-skipping. Use this when you need Spring beans (real DB, email service, etc.) wired alongside Temporal. + +### Unit tests without Spring context + +For faster, isolated tests with time-skipping support, use `TestWorkflowExtension` or `TestWorkflowEnvironment` directly. 
No Spring context starts, so activity dependencies must be provided manually (real instances or Mockito mocks): + +```java +public class TeeTimeMonitorWorkflowTest { + + @RegisterExtension + static final TestWorkflowExtension testWorkflow = TestWorkflowExtension.newBuilder() + .setWorkflowTypes(TeeTimeMonitorWorkflowImpl.class) + .setDoNotStart(true) + .build(); + + @Test + void testWorkflow(TestWorkflowEnvironment env, Worker worker, WorkflowClient client) { + GolfNowActivities activities = mock(GolfNowActivities.class, withSettings().withoutAnnotations()); + when(activities.searchTeeTimes(any())).thenReturn(List.of()); + + worker.registerActivitiesImplementations(activities); + env.start(); + + var stub = client.newWorkflowStub( + TeeTimeMonitorWorkflow.class, + WorkflowOptions.newBuilder().setTaskQueue(worker.getTaskQueue()).build() + ); + stub.monitorTeeTimes(new TTMonitorRequest(...)); + verify(activities).searchTeeTimes(any()); + } +} +``` + +See the sections above for more detail on mocking, signals/queries, and replay testing. diff --git a/references/java/versioning.md b/references/java/versioning.md new file mode 100644 index 0000000..0e520f2 --- /dev/null +++ b/references/java/versioning.md @@ -0,0 +1,282 @@ +# Java SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. 
+ +## Patching API + +### Workflow.getVersion() + +`Workflow.getVersion(String changeId, int minSupported, int maxSupported)` returns the version to use for a given change: + +```java +import io.temporal.workflow.Workflow; + +@WorkflowInterface +public interface ShippingWorkflow { + @WorkflowMethod + void run(); +} + +public class ShippingWorkflowImpl implements ShippingWorkflow { + @Override + public void run() { + int version = Workflow.getVersion( + "send-email-instead-of-fax", + Workflow.DEFAULT_VERSION, // minSupported (no change) + 1 // maxSupported (current version) + ); + + if (version == 1) { + // New code path + Workflow.newActivityStub(MyActivities.class, options).sendEmail(); + } else { + // Old code path (for replay of existing workflows) + Workflow.newActivityStub(MyActivities.class, options).sendFax(); + } + } +} +``` + +**How it works:** + +- For new executions: returns `maxSupported` and records a marker in history +- For replay with the marker: returns the recorded version +- For replay without the marker: returns `DEFAULT_VERSION` (-1) + +### Three-Step Patching Process + +**Step 1: Patch in New Code** + +Add the version check with both old and new code paths: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + int version = Workflow.getVersion( + "add-fraud-check", + Workflow.DEFAULT_VERSION, + 1); + + if (version >= 1) { + activities.checkFraud(order); + } + + return activities.processPayment(order); + } +} +``` + +**Step 2: Remove Old Code Path** + +Once all pre-patch Workflow Executions have completed, remove the old branch and set `minSupported` to `1`: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + Workflow.getVersion("add-fraud-check", 1, 1); + + activities.checkFraud(order); + return activities.processPayment(order); + } +} +``` + +**Step 3: Remove the Patch** + +After all workflows with the patch 
marker have completed, remove the `getVersion` call entirely: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + activities.checkFraud(order); + return activities.processPayment(order); + } +} +``` + +### Recording TemporalChangeVersion Search Attribute + +Unlike the Python and TypeScript SDKs, the Java SDK does **not** automatically record the `TemporalChangeVersion` search attribute. You must manually upsert it: + +```java +import io.temporal.workflow.Workflow; +import io.temporal.common.SearchAttributeKey; +import java.util.List; + +public class OrderWorkflowImpl implements OrderWorkflow { + private static final SearchAttributeKey<List<String>> TEMPORAL_CHANGE_VERSION = + SearchAttributeKey.forKeywordList("TemporalChangeVersion"); + + @Override + public String run(Order order) { + int version = Workflow.getVersion("add-fraud-check", Workflow.DEFAULT_VERSION, 1); + + // Manually record for query filtering + Workflow.upsertTypedSearchAttributes( + TEMPORAL_CHANGE_VERSION.valueSet(List.of("add-fraud-check-1"))); + + if (version >= 1) { + activities.checkFraud(order); + } + return activities.processPayment(order); + } +} +``` + +Query with: + +```bash +temporal workflow list --query \ + 'TemporalChangeVersion = "add-fraud-check-1" AND ExecutionStatus = "Running"' +``` + +## Workflow Type Versioning + +For incompatible changes, create a new Workflow Type: + +```java +@WorkflowInterface +public interface PizzaWorkflow { + @WorkflowMethod + String run(PizzaOrder order); +} + +// Original implementation +public class PizzaWorkflowImpl implements PizzaWorkflow { + @Override + public String run(PizzaOrder order) { + return processOrderV1(order); + } +} + +// New workflow type for incompatible changes +@WorkflowInterface +public interface PizzaWorkflowV2 { + @WorkflowMethod + String run(PizzaOrder order); +} + +public class PizzaWorkflowV2Impl implements PizzaWorkflowV2 { + @Override + public String run(PizzaOrder 
order) { + return processOrderV2(order); + } +} +``` + +Register both with the Worker: + +```java +worker.registerWorkflowImplementationTypes( + PizzaWorkflowImpl.class, + PizzaWorkflowV2Impl.class); +``` + +Start new workflows with the new type: + +```java +PizzaWorkflowV2 workflow = client.newWorkflowStub( + PizzaWorkflowV2.class, + WorkflowOptions.newBuilder() + .setTaskQueue("pizza-task-queue") + .build()); +workflow.run(order); +``` + +Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "PizzaWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level. Available since Java SDK v1.29. + +### Key Concepts + +- **Worker Deployment**: A logical group of Workers processing the same Task Queue, identified by a deployment name (e.g., `"order-service"`). +- **Worker Deployment Version**: A specific version within a deployment, identified by the combination of deployment name and Build ID (e.g., `"order-service:v1.0.0"`). Each version corresponds to a particular code revision. 
+ +### Configuring Workers + +```java +import io.temporal.common.WorkerDeploymentVersion; +import io.temporal.worker.Worker; +import io.temporal.worker.WorkerDeploymentOptions; +import io.temporal.worker.WorkerFactory; +import io.temporal.worker.WorkerOptions; + +// Deployment name + Build ID (e.g. a release tag or git commit hash) +WorkerDeploymentVersion version = + new WorkerDeploymentVersion("order-service", "v1.0.0"); + +WorkerDeploymentOptions deploymentOptions = WorkerDeploymentOptions.newBuilder() + .setVersion(version) + .setUseVersioning(true) + .build(); + +WorkerFactory factory = WorkerFactory.newInstance(client); +Worker worker = factory.newWorker( + "my-task-queue", + WorkerOptions.newBuilder() + .setDeploymentOptions(deploymentOptions) + .build()); + +worker.registerWorkflowImplementationTypes(MyWorkflowImpl.class); +worker.registerActivitiesImplementations(new MyActivitiesImpl()); +factory.start(); +``` + +### PINNED vs AUTO_UPGRADE Behaviors + +Set the versioning behavior on the workflow definition by annotating the workflow method implementation: + +```java +import io.temporal.common.VersioningBehavior; +import io.temporal.workflow.WorkflowVersioningBehavior; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + @WorkflowVersioningBehavior(VersioningBehavior.PINNED) + public String run(String input) { + // ... workflow logic + } +} +``` + +**PINNED**: Workflow stays on the Worker version that started it. Use for short-running workflows or when consistency within a single execution is critical. New workflows start on the current version; existing ones stay put. + +**AUTO_UPGRADE**: Workflow moves to the latest Worker version on the next Workflow Task. Use for long-running workflows that need bug fixes or feature updates. Combine with `Workflow.getVersion()` patching to handle version transitions safely. + +### Deployment Strategies + +**Blue-Green**: Run two deployment versions simultaneously. Set the new version as the current deployment. 
PINNED workflows finish on the old version; new workflows start on the new version. Shut down the old version's Workers once it has drained, i.e., once all of its workflows have completed. + +**Rainbow**: Run multiple versions concurrently for gradual rollouts. Each version handles its own workflows. Useful when you have many long-running PINNED workflows across several code revisions. + +### Querying Workflows by Worker Version + +```bash +# List workflows running on a specific version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "order-service:v1.0.0" AND ExecutionStatus = "Running"' + +# Count workflows still running on that version to monitor drain progress +temporal workflow count --query \ + 'TemporalWorkerDeploymentVersion = "order-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Check for open executions** before removing old code paths +2. **Use descriptive change IDs** that explain the change (e.g., `"add-fraud-check"` not `"patch-1"`) +3. **Deploy patches incrementally**: first add the `getVersion` branch, then remove the old code path, and finally remove the `getVersion` call +4. **Manually upsert the `TemporalChangeVersion` search attribute** when using `getVersion` if you need to filter workflows by change version +5. **Use PINNED for short workflows** to simplify version management +6. **Use AUTO_UPGRADE with patching** for long-running workflows that need updates +7.
**Generate Build IDs from code** (git hash) to ensure changes produce new versions diff --git a/references/python/advanced-features.md b/references/python/advanced-features.md index e0d3297..c5ec1b3 100644 --- a/references/python/advanced-features.md +++ b/references/python/advanced-features.md @@ -62,6 +62,7 @@ async def request_approval(request_id: str) -> None: # Later, complete the activity from another process async def complete_approval(request_id: str, approved: bool): client = await Client.connect("localhost:7233", namespace="default") + # Retrieve the task token from external storage (e.g., database) task_token = await get_task_token(request_id) handle = client.get_async_activity_handle(task_token=task_token) @@ -85,6 +86,7 @@ The Python SDK runs workflows in a sandbox to help you ensure determinism. You c **The Python SDK is NOT compatible with gevent.** Gevent's monkey patching modifies Python's asyncio event loop in ways that break the SDK's deterministic execution model. If your application uses gevent: + - You cannot run Temporal workers in the same process - Consider running workers in a separate process without gevent - Use a message queue or HTTP API to communicate between gevent and Temporal processes @@ -114,9 +116,9 @@ worker = Worker( ## Workflow Init Decorator -Use `@workflow.init` to run initialization code when a workflow is first created. +You should always put state initialization logic in the `__init__` of your workflow class, so that it happens before signals/updates arrive. -**Purpose:** Execute some setup code before signal/update happens or run is invoked. +Normally, your `__init__` must have no arguments. 
However, if you add the `@workflow.init` decorator, then your `__init__` instead receives the same workflow arguments that `@workflow.run` receives: ```python @workflow.defn @@ -128,11 +130,13 @@ class MyWorkflow: self._items: list[str] = [] @workflow.run - async def run(self) -> str: + async def run(self, initial_value: str) -> str: # self._value and self._items are already initialized return self._value ``` +`__init__` (with `@workflow.init`) and `@workflow.run` must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the `__init__`. + ## Workflow Failure Exception Types Control which exceptions cause workflow task failures vs workflow failures. @@ -163,4 +167,3 @@ worker = Worker( workflow_failure_exception_types=[ValueError, CustomBusinessError], ) ``` - diff --git a/references/python/ai-patterns.md b/references/python/ai-patterns.md index a07e30a..6a45272 100644 --- a/references/python/ai-patterns.md +++ b/references/python/ai-patterns.md @@ -2,7 +2,7 @@ ## Overview -This document provides Python-specific implementation details for integrating LLMs with Temporal. For conceptual patterns, see `references/core/ai-integration.md`. +This document provides Python-specific implementation details for integrating LLMs with Temporal. For conceptual patterns, see `references/core/ai-patterns.md`. ## Pydantic Data Converter Setup diff --git a/references/python/data-handling.md b/references/python/data-handling.md index 662101e..65f4a99 100644 --- a/references/python/data-handling.md +++ b/references/python/data-handling.md @@ -7,6 +7,7 @@ The Python SDK uses data converters to serialize/deserialize workflow inputs, ou ## Default Data Converter The default converter handles: + - `None` - `bytes` (as binary) - Protobuf messages @@ -59,6 +60,7 @@ client = await Client.connect( ## Custom Data Conversion Usually the easiest way to do this is via implementing an EncodingPayloadConverter and CompositePayloadConverter. 
See: + - https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/shared.py - https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/starter.py diff --git a/references/python/determinism-protection.md b/references/python/determinism-protection.md index 1376ced..2eba418 100644 --- a/references/python/determinism-protection.md +++ b/references/python/determinism-protection.md @@ -7,14 +7,15 @@ The Python SDK runs workflows in a sandbox that provides automatic protection ag ## How the Sandbox Works The sandbox: + - Isolates global state via `exec` compilation - Restricts non-deterministic library calls via proxy objects - Passes through standard library with restrictions - Reloads workflow files on each execution -## Forbidden Operations +## Forbidden Operations in Workflows -These operations will fail in the sandbox: +These operations are forbidden inside workflow code (appropriate in activities) and will fail in the sandbox: - **Direct I/O**: Network calls, file reads/writes - **Threading**: `threading` module operations @@ -35,6 +36,7 @@ with workflow.unsafe.imports_passed_through(): ``` **When to use pass-through:** + - Data classes and models (Pydantic, dataclasses) - Serialization libraries - Type definitions diff --git a/references/python/determinism.md b/references/python/determinism.md index 7276360..2be8f75 100644 --- a/references/python/determinism.md +++ b/references/python/determinism.md @@ -8,7 +8,9 @@ The Python SDK runs workflows in a sandbox that provides automatic protection ag Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. 
-## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. - Direct I/O (network, filesystem) - Threading operations @@ -23,7 +25,7 @@ Temporal provides durable execution through **History Replay**. When a Worker ne |-----------|------------------| | `datetime.now()` | `workflow.now()` | | `datetime.utcnow()` | `workflow.now()` | -| `random.random()` | `rng = workflow.new_random() ; rng.randint(1, 100)` | +| `random.random()` | `rng = workflow.random() ; rng.randint(1, 100)` | | `uuid.uuid4()` | `workflow.uuid4()` | | `time.time()` | `workflow.now().timestamp()` | @@ -34,6 +36,7 @@ Use the `Replayer` class to verify your code changes are compatible with existin ## Sandbox Behavior The sandbox: + - Isolates global state via `exec` compilation - Restricts non-deterministic library calls via proxy objects - Passes through standard library with restrictions diff --git a/references/python/error-handling.md b/references/python/error-handling.md index 19460cb..ed9e69d 100644 --- a/references/python/error-handling.md +++ b/references/python/error-handling.md @@ -47,7 +47,7 @@ async def charge_card(input: ChargeCardInput) -> str: ```python from datetime import timedelta from temporalio import workflow -from temporalio.exceptions import ActivityError, ApplicationError +from temporalio.exceptions import ActivityError, ApplicationError, is_cancelled_exception @workflow.defn class MyWorkflow: @@ -59,6 +59,9 @@ class MyWorkflow: start_to_close_timeout=timedelta(minutes=5), ) except ActivityError as e: + # Let cancellation propagate so the workflow is canceled, not failed + if is_cancelled_exception(e): + raise workflow.logger.error(f"Activity failed: {e}") # Handle or re-raise raise ApplicationError("Workflow failed due to activity error") diff --git a/references/python/gotchas.md b/references/python/gotchas.md index 95ebe8a..a32b045 100644 --- a/references/python/gotchas.md +++ 
b/references/python/gotchas.md @@ -211,10 +211,12 @@ class GoodWorkflow: ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. **Catching the cancellation exception** - Exception is raised when heartbeat detects cancellation **Cancellation exceptions:** + - Async activities: `asyncio.CancelledError` - Sync threaded activities: `temporalio.exceptions.CancelledError` diff --git a/references/python/observability.md b/references/python/observability.md index 26296c3..0130d89 100644 --- a/references/python/observability.md +++ b/references/python/observability.md @@ -27,6 +27,7 @@ class MyWorkflow: ``` The workflow logger automatically: + - Suppresses duplicate logs during replay - Includes workflow context (workflow ID, run ID, etc.) @@ -46,6 +47,7 @@ async def process_order(order_id: str) -> str: ``` Activity logger includes: + - Activity ID, type, and task queue - Workflow ID and run ID - Attempt number (for retries) @@ -92,7 +94,6 @@ Runtime.set_default(runtime, error_if_already_set=True) - `temporal_activity_execution_latency` - Activity execution time - `temporal_workflow_task_replay_latency` - Replay duration - ## Search Attributes (Visibility) See the Search Attributes section of `references/python/data-handling.md` diff --git a/references/python/patterns.md b/references/python/patterns.md index 762977b..ae70757 100644 --- a/references/python/patterns.md +++ b/references/python/patterns.md @@ -106,6 +106,8 @@ class OrderWorkflow: raise ValueError("Order is full") ``` +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Raise an exception to reject the update; return `None` to accept. 
+ ## Child Workflows ```python @@ -243,11 +245,14 @@ class MyWorkflow: except Exception as e: workflow.logger.error(f"Order failed: {e}, running compensations") - for compensate in reversed(compensations): - try: - await compensate() - except Exception as comp_err: - workflow.logger.error(f"Compensation failed: {comp_err}") + # asyncio.shield ensures compensations run even if the workflow is cancelled. + async def run_compensations(): + for compensate in reversed(compensations): + try: + await compensate() + except Exception as comp_err: + workflow.logger.error(f"Compensation failed: {comp_err}") + await asyncio.shield(asyncio.ensure_future(run_compensations())) raise ``` @@ -316,14 +321,17 @@ class MyWorkflow: ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** - Heartbeat details persist across retries **Cancellation exceptions:** + - Async activities: `asyncio.CancelledError` - Sync threaded activities: `temporalio.exceptions.CancelledError` ### WHEN: + - **Cancellable activities** - Any activity that should respond to cancellation - **Long-running activities** - Track progress for resumability - **Checkpointing** - Save progress periodically diff --git a/references/python/python.md b/references/python/python.md index 130b1eb..d3c0e9c 100644 --- a/references/python/python.md +++ b/references/python/python.md @@ -9,6 +9,7 @@ The Temporal Python SDK (`temporalio`) provides a fully async, type-safe approac **Add Dependency on Temporal:** In the package management system of the Python project you are working on, add a dependency on `temporalio`. 
**activities/greet.py** - Activity definitions (separate file for performance): + ```python from temporalio import activity @@ -18,6 +19,7 @@ def greet(name: str) -> str: ``` **workflows/greeting.py** - Workflow definition (import activities through sandbox): + ```python from datetime import timedelta from temporalio import workflow @@ -34,7 +36,8 @@ class GreetingWorkflow: ) ``` -**worker.py** - Worker setup (imports activity and workflow, runs indefinitely and processes tasks): +**worker.py** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): + ```python import asyncio import concurrent.futures @@ -70,6 +73,7 @@ if __name__ == "__main__": **Start the worker:** Start `python worker.py` in the background (appropriately adjust command for your project, like `uv run python worker.py`) **starter.py** - Start a workflow execution: + ```python import asyncio from temporalio.client import Client @@ -93,16 +97,18 @@ if __name__ == "__main__": **Run the workflow:** Run `python starter.py` (or uv run, etc.). Should output: `Result: Hello, my-name!`. - ## Key Concepts ### Workflow Definition + - Use `@workflow.defn` decorator on class +- Put any state initialization logic in the `__init__` of your workflow class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `@workflow.init` decorator and parameters to your `__init__`. - Use `@workflow.run` on the entry point method - Must be async (`async def`) - Use `@workflow.signal`, `@workflow.query`, `@workflow.update` for handlers ### Activity Definition + - Use `@activity.defn` decorator - Can be sync or async functions - **Default to sync activities** - safer and easier to debug @@ -112,6 +118,7 @@ if __name__ == "__main__": See `sync-vs-async.md` for detailed guidance on choosing between sync and async. 
### Worker Setup + - Connect client, create Worker with workflows and activities - Run the worker - Activities can specify custom executor @@ -135,6 +142,7 @@ my_temporal_app/ ``` **In the Workflow file, import Activities through the sandbox:** + ```python # workflows/greeting.py from temporalio import workflow @@ -161,6 +169,7 @@ See `references/python/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/python/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. - **`references/python/determinism.md`** - Sandbox behavior, safe alternatives, pass-through pattern, history replay - **`references/python/gotchas.md`** - Python-specific mistakes and anti-patterns diff --git a/references/python/sync-vs-async.md b/references/python/sync-vs-async.md index 7875582..247b0e5 100644 --- a/references/python/sync-vs-async.md +++ b/references/python/sync-vs-async.md @@ -19,6 +19,7 @@ Activities should be synchronous by default. Use async only when certain the cod The Python async event loop runs in a single thread. When any task runs, no other tasks can execute until an `await` is reached. If code makes a blocking call (file I/O, synchronous HTTP, etc.), the entire event loop freezes. 
**Consequences of blocking the event loop:** + - Worker cannot communicate with Temporal Server - Workflow progress blocks across the worker - Potential deadlocks and unpredictable behavior @@ -73,6 +74,7 @@ async def my_async_activity(name: str) -> str: | `httpx` | Both | Yes (use async mode) | **Example: Wrong way (blocks event loop)** + ```python @activity.defn async def bad_activity(url: str) -> str: @@ -82,6 +84,7 @@ async def bad_activity(url: str) -> str: ``` **Example: Correct way (async-safe)** + ```python @activity.defn async def good_activity(url: str) -> str: @@ -150,6 +153,7 @@ For CPU-bound work and multi-core usage: ### Separate Workers for Workflows vs Activities Some teams deploy: + - Workflow-only workers (CPU-bound, need deadlock detection) - Activity-only workers (I/O-bound, may need more parallelism) diff --git a/references/python/testing.md b/references/python/testing.md index 63a0d14..71a47b1 100644 --- a/references/python/testing.md +++ b/references/python/testing.md @@ -136,11 +136,10 @@ async def test_replay(): # From JSON file await replayer.replay_workflow( - WorkflowHistory.from_json(workflow_id=str(uuid.uuid4()), history_json) + WorkflowHistory.from_json(str(uuid.uuid4()), history_json) ) ``` - ## Activity Testing ```python diff --git a/references/python/versioning.md b/references/python/versioning.md index abd4445..c1ad39a 100644 --- a/references/python/versioning.md +++ b/references/python/versioning.md @@ -30,6 +30,7 @@ class ShippingWorkflow: ``` **How it works:** + - For new executions: `patched()` returns `True` and records a marker in the Workflow history - For replay with the marker: `patched()` returns `True` (history includes this patch) - For replay without the marker: `patched()` returns `False` (history predates this patch) @@ -213,6 +214,7 @@ worker = Worker( ``` **Configuration parameters:** + - `use_worker_versioning`: Enables Worker Versioning - `version`: Identifies the Worker Deployment Version (deployment name + 
build ID) - Build ID: Typically a git commit hash, version number, or timestamp @@ -224,13 +226,13 @@ worker = Worker( Workflows stay locked to their original Worker version: ```python -from temporalio.workflow import VersioningBehavior +from temporalio import workflow +from temporalio.common import VersioningBehavior -@workflow.defn +@workflow.defn(versioning_behavior=VersioningBehavior.PINNED) class StableWorkflow: @workflow.run async def run(self) -> str: - # This workflow will always run on its assigned version return await workflow.execute_activity( process_order, start_to_close_timeout=timedelta(minutes=5), @@ -238,6 +240,7 @@ class StableWorkflow: ``` **When to use PINNED:** + - Short-running workflows (minutes to hours) - Consistency is critical (e.g., financial transactions) - You want to eliminate version compatibility complexity @@ -247,7 +250,22 @@ class StableWorkflow: Workflows can move to newer versions: +```python +from temporalio import workflow +from temporalio.common import VersioningBehavior + +@workflow.defn(versioning_behavior=VersioningBehavior.AUTO_UPGRADE) +class UpgradableWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_activity( + process_order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + **When to use AUTO_UPGRADE:** + - Long-running workflows (weeks or months) - Workflows need to benefit from bug fixes during execution - Migrating from traditional rolling deployments @@ -258,7 +276,6 @@ Workflows can move to newer versions: ### Worker Configuration with Default Behavior ```python -# For short-running workflows, prefer PINNED worker = Worker( client, task_queue="orders-task-queue", @@ -270,7 +287,7 @@ worker = Worker( build_id=os.environ["BUILD_ID"], ), use_worker_versioning=True, - # default_versioning_behavior=VersioningBehavior.PINNED, + default_versioning_behavior=VersioningBehavior.PINNED, ), ) ``` @@ -280,6 +297,7 @@ worker = Worker( **Blue-Green Deployments** Maintain two 
environments and switch traffic between them: + 1. Deploy new code to idle environment 2. Run tests and validation 3. Switch traffic to new environment @@ -288,6 +306,7 @@ Maintain two environments and switch traffic between them: **Rainbow Deployments** Multiple versions run simultaneously: + - New workflows use latest version - Existing workflows complete on their original version - Add new versions alongside existing ones diff --git a/references/typescript/advanced-features.md b/references/typescript/advanced-features.md index 17b7e61..ed9817d 100644 --- a/references/typescript/advanced-features.md +++ b/references/typescript/advanced-features.md @@ -39,6 +39,7 @@ await handle.delete(); Complete an activity asynchronously from outside the activity function. Useful when the activity needs to wait for an external event. **In the activity - return the task token:** + ```typescript import { CompleteAsyncError, activityInfo } from '@temporalio/activity'; @@ -50,6 +51,7 @@ export async function doSomethingAsync(): Promise { ``` **External completion (from another process, machine, etc.):** + ```typescript import { Client } from '@temporalio/client'; @@ -61,6 +63,7 @@ async function doSomeWork(taskToken: Uint8Array): Promise { ``` **When to use:** + - Waiting for human approval - Waiting for external webhook callback - Long-polling external systems @@ -93,6 +96,7 @@ const worker = await Worker.create({ ``` **Key settings:** + - `maxConcurrentWorkflowTaskExecutions`: Max workflows running simultaneously (default: 40) - `maxConcurrentActivityTaskExecutions`: Max activities running simultaneously (default: 100) - `shutdownGraceTime`: Time to wait for in-progress work before forced shutdown diff --git a/references/typescript/data-handling.md b/references/typescript/data-handling.md index bfd4925..c8be6f8 100644 --- a/references/typescript/data-handling.md +++ b/references/typescript/data-handling.md @@ -7,6 +7,7 @@ The TypeScript SDK uses data converters to 
serialize/deserialize workflow inputs ## Default Data Converter The default converter handles: + - `undefined` and `null` - `Uint8Array` (as binary) - JSON-serializable types diff --git a/references/typescript/determinism-protection.md b/references/typescript/determinism-protection.md index 54303ba..81c513a 100644 --- a/references/typescript/determinism-protection.md +++ b/references/typescript/determinism-protection.md @@ -29,7 +29,6 @@ const worker = await Worker.create({ Use this with *extreme caution*. - ## Function Replacement Functions like `Math.random()`, `Date`, and `setTimeout()` are replaced by deterministic versions. diff --git a/references/typescript/determinism.md b/references/typescript/determinism.md index 47f8948..dfd3464 100644 --- a/references/typescript/determinism.md +++ b/references/typescript/determinism.md @@ -28,7 +28,9 @@ The Temporal workflow sandbox will use the same random seed when replaying a wor See `references/typescript/determinism-protection.md` for more information about the sandbox. -## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. ```typescript // DO NOT do these in workflows: diff --git a/references/typescript/gotchas.md b/references/typescript/gotchas.md index d234f74..61763b3 100644 --- a/references/typescript/gotchas.md +++ b/references/typescript/gotchas.md @@ -145,6 +145,7 @@ export async function workflowWithCleanup(): Promise { ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. 
**Checking for cancellation** - Either await `Context.current().cancelled` or use `cancellationSignal()` diff --git a/references/typescript/observability.md b/references/typescript/observability.md index 10244d7..211fbc6 100644 --- a/references/typescript/observability.md +++ b/references/typescript/observability.md @@ -100,6 +100,10 @@ Runtime.install({ }); ``` +## Search Attributes (Visibility) + +See the Search Attributes section of `references/typescript/data-handling.md` + ## Best Practices 1. Use `log` from `@temporalio/workflow` for production observability. For temporary print debugging, `console.log()` is fine—it's direct and immediate, whereas `log` goes through sinks which may lose messages on workflow errors diff --git a/references/typescript/patterns.md b/references/typescript/patterns.md index 878f9f0..6dc2b32 100644 --- a/references/typescript/patterns.md +++ b/references/typescript/patterns.md @@ -132,6 +132,8 @@ export async function orderWorkflow(): Promise { } ``` +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Throw an error to reject the update; return normally to accept. + ## Child Workflows ```typescript @@ -224,7 +226,7 @@ export async function longRunningWorkflow(state: State): Promise { **Important:** Compensation activities should be idempotent. 
```typescript -import { log } from '@temporalio/workflow'; +import { CancellationScope, log } from '@temporalio/workflow'; export async function sagaWorkflow(order: Order): Promise { const compensations: Array<() => Promise> = []; @@ -233,22 +235,25 @@ export async function sagaWorkflow(order: Order): Promise { // IMPORTANT: Save compensation BEFORE calling the activity // If activity fails after completing but before returning, // compensation must still be registered - await reserveInventory(order); compensations.push(() => releaseInventory(order)); + await reserveInventory(order); - await chargePayment(order); compensations.push(() => refundPayment(order)); + await chargePayment(order); await shipOrder(order); return 'Order completed'; } catch (err) { - for (const compensate of compensations.reverse()) { - try { - await compensate(); - } catch (compErr) { - log.warn('Compensation failed', { error: compErr }); + // nonCancellable ensures compensations run even if the workflow is cancelled + await CancellationScope.nonCancellable(async () => { + for (const compensate of compensations.reverse()) { + try { + await compensate(); + } catch (compErr) { + log.warn('Compensation failed', { error: compErr }); + } } - } + }); throw err; } } @@ -284,6 +289,7 @@ export async function scopedWorkflow(): Promise { **WHY**: Triggers provide a one-shot promise that resolves when a signal is received. Cleaner than condition() for single-value signals. 
**WHEN to use**: + - Waiting for a single response (approval, completion notification) - Converting signal-based events into awaitable promises @@ -346,10 +352,12 @@ export async function handlerAwareWorkflow(): Promise { ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** - Heartbeat details persist across retries ### WHEN: + - **Cancellable activities** - Any activity that should respond to cancellation - **Long-running activities** - Track progress for resumability - **Checkpointing** - Save progress periodically diff --git a/references/typescript/typescript.md b/references/typescript/typescript.md index 9918ee7..96fc089 100644 --- a/references/typescript/typescript.md +++ b/references/typescript/typescript.md @@ -13,13 +13,15 @@ Temporal workflows are durable through history replay. For details on how this w ## Quick Start **Add Dependencies:** Install the Temporal SDK packages (use the package manager appropriate for your project): + ```bash npm install @temporalio/client @temporalio/worker @temporalio/workflow @temporalio/activity ``` -Note: if you are working in production, it is strongly advised to use ~ version constraints, i.e. `npm install ... --save-prefix='~'` if using NPM. +Note: if you are working in production, it is strongly advised to use ~ version constraints, i.e. `npm install ... --save-prefix='~'` if using NPM. 
**activities.ts** - Activity definitions (separate file to distinguish workflow vs activity code): + ```typescript export async function greet(name: string): Promise<string> { return `Hello, ${name}!`; @@ -27,6 +29,7 @@ export async function greet(name: string): Promise<string> { ``` **workflows.ts** - Workflow definition (use type-only imports for activities): + ```typescript import { proxyActivities } from '@temporalio/workflow'; import type * as activities from './activities'; @@ -40,7 +43,8 @@ export async function greetingWorkflow(name: string): Promise { } ``` -**worker.ts** - Worker setup (imports activities and workflows, runs indefinitely): +**worker.ts** - Worker setup (registers activities and workflows, runs indefinitely and processes tasks): + ```typescript import { Worker } from '@temporalio/worker'; import * as activities from './activities'; @@ -62,6 +66,7 @@ run().catch(console.error); **Start the worker:** Run `npx ts-node worker.ts` in the background. **client.ts** - Start a workflow execution: + ```typescript import { Client } from '@temporalio/client'; import { greetingWorkflow } from './workflows'; @@ -87,16 +92,19 @@ run().catch(console.error); ## Key Concepts ### Workflow Definition + - Async functions exported from workflow file - Use `proxyActivities()` with type-only imports - Use `defineSignal()`, `defineQuery()`, `defineUpdate()`, `setHandler()` for handlers ### Activity Definition + - Regular async functions - Can perform I/O, network calls, etc.
- Use `heartbeat()` for long operations ### Worker Setup + - Use `Worker.create()` with `workflowsPath` (dev) or `workflowBundle` (production) - see `references/typescript/gotchas.md` - Import activities directly (not via proxy) @@ -115,6 +123,7 @@ my_temporal_app/ ``` **In the Workflow file, use type-only imports for activities:** + ```typescript // workflows/greeting.ts import { proxyActivities } from '@temporalio/workflow'; @@ -130,11 +139,13 @@ const { translate } = proxyActivities({ The TypeScript SDK runs workflows in an isolated V8 sandbox. **Automatic replacements:** + - `Math.random()` → deterministic seeded PRNG - `Date.now()` → workflow start time - `setTimeout` → deterministic timer **Safe to use:** + - `sleep()` from `@temporalio/workflow` - `condition()` for waiting - Standard JavaScript operations @@ -160,6 +171,7 @@ See `references/typescript/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/typescript/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. 
- **`references/typescript/determinism.md`** - Essentials of determinism in TypeScript - **`references/typescript/gotchas.md`** - TypeScript-specific mistakes and anti-patterns diff --git a/references/typescript/versioning.md b/references/typescript/versioning.md index a9f57a2..b4b8e19 100644 --- a/references/typescript/versioning.md +++ b/references/typescript/versioning.md @@ -25,6 +25,7 @@ export async function myWorkflow(): Promise { ``` **How it works:** + - If the Workflow is running for the first time, `patched()` returns `true` and inserts a marker into the Event History - During replay, if the history contains a marker with the same `patchId`, `patched()` returns `true` - During replay, if no matching marker exists, `patched()` returns `false` @@ -175,6 +176,7 @@ const worker = await Worker.create({ ``` **Configuration options:** + - `useWorkerVersioning`: Enables Worker Versioning - `version.deploymentName`: Logical name for your service (consistent across versions) - `version.buildId`: Unique identifier for this build @@ -195,6 +197,7 @@ const worker = await Worker.create({ ### When to Use Worker Versioning Worker Versioning is best suited for: + - **Short-running Workflows**: Old Workers only need to run briefly during deployment transitions - **Frequent deployments**: Eliminates the need for code-level patching on every change - **Blue-green deployments**: Run old and new versions simultaneously with traffic control