diff --git a/.github/workflows/agentex-tutorials-test.yml b/.github/workflows/agentex-tutorials-test.yml index f19c58d4d..41b495d71 100644 --- a/.github/workflows/agentex-tutorials-test.yml +++ b/.github/workflows/agentex-tutorials-test.yml @@ -49,6 +49,29 @@ jobs: curl -LsSf https://astral.sh/uv/install.sh | sh echo "$HOME/.local/bin" >> $GITHUB_PATH + # Subprocess-CLI harnesses: install the relevant CLI only for the + # claude-code / codex tutorials (no-op for every other tutorial). npm is + # preinstalled on ubuntu runners. Versions mirror the golden agent's + # sandbox image (teams/sgp/agents/golden_agent/sandbox/Dockerfile): claude-code + # is pinned to the same CLAUDE_CODE_VERSION; codex is left unpinned there, + # so it is left unpinned here too. Bump CLAUDE_CODE_VERSION in lockstep + # with the sandbox Dockerfile. + - name: Install harness CLI (claude-code / codex only) + if: ${{ contains(matrix.tutorial, 'claude_code') || contains(matrix.tutorial, 'codex') }} + env: + CLAUDE_CODE_VERSION: "2.1.142" + run: | + if [[ "${{ matrix.tutorial }}" == *claude_code* ]]; then + echo "📦 Installing Claude Code CLI (v${CLAUDE_CODE_VERSION})..." + npm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}" + claude --version || true + fi + if [[ "${{ matrix.tutorial }}" == *codex* ]]; then + echo "📦 Installing Codex CLI..." + npm install -g @openai/codex + codex --version || true + fi + - name: Pull latest AgentEx image run: | echo "🐳 Pulling latest Scale AgentEx Docker image..." @@ -136,6 +159,11 @@ jobs: working-directory: ./examples/tutorials env: OPENAI_API_KEY: ${{ secrets.TUTORIAL_OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.TUTORIAL_ANTHROPIC_API_KEY }} + # Enable the gated live tests only for the matching subprocess-CLI + # harness tutorial (the CLI is installed for it in the step above). + CLAUDE_LIVE_TESTS: ${{ contains(matrix.tutorial, 'claude_code') && '1' || '' }} + CODEX_LIVE_TESTS: ${{ contains(matrix.tutorial, 'codex') && '1' || '' }} HEALTH_CHECK_PORT: 8080 # Use non-privileged port for temporal worker health checks run: | echo "Testing tutorial: ${{ matrix.tutorial }}" diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml new file mode 100644 index 000000000..075ee5cf3 --- /dev/null +++ b/.github/workflows/harness-integration.yml @@ -0,0 +1,61 @@ +name: Harness Integration + +on: + push: + branches: [main] + pull_request: + paths: + - "src/agentex/lib/core/harness/**" + - "src/agentex/lib/adk/_modules/**" + - "tests/lib/core/harness/test_harness_pydantic_ai_*.py" + - "tests/lib/core/harness/test_harness_langgraph_*.py" + - ".github/workflows/harness-integration.yml" + +jobs: + conformance: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + # Defer to scripts/test so the harness suite runs under the exact same + # invocation as the main CI test job: DEFER_PYDANTIC_BUILD=false and + # `uv run --isolated --all-packages --all-extras pytest`, across the + # min/max supported Python versions. Running `uv run pytest` directly + # would risk an all-extras-only dep passing locally but failing in CI. + - name: Conformance suite + run: ./scripts/test tests/lib/core/harness/ -v + + # Offline harness integration tests (sync / async / temporal channels) for each + # migrated harness. These use fake streams / TestModel + fake streaming/tracing + # and require no live infrastructure. Future harness migration PRs (6-8) add + # their harness to the matrix below and their test paths to the triggers above. + live-matrix: + runs-on: ubuntu-latest + strategy: + matrix: + harness: [pydantic_ai, langgraph] + channel: [sync, async, temporal] + fail-fast: false + name: ${{ matrix.harness }}-${{ matrix.channel }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + - name: ${{ matrix.harness }} ${{ matrix.channel }} integration tests (offline) + run: | + ./scripts/test tests/lib/core/harness/test_harness_${{ matrix.harness }}_${{ matrix.channel }}.py -v diff --git a/.release-please-manifest.json b/.release-please-manifest.json index be44cf037..9a40fa434 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { - ".": "0.14.0", - "adk": "0.13.2" + ".": "0.15.0", + "adk": "0.14.0" } diff --git a/.stats.yml b/.stats.yml index 5375d17e3..60af41b79 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ -configured_endpoints: 64 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/sgp/agentex-sdk-ae2571b5ac5d337ba5ced527cec0ff6e3088296fa67c3c836ed5a06544b25cb8.yml -openapi_spec_hash: 962a2f20444c7823fd3a34f95365146e -config_hash: 138b7c0b394e7393133c8ff16a6d0eb3 +configured_endpoints: 65 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/sgp/agentex-sdk-cd43ba4b554ca024dd7ee7b74e4f4700a743282c17def704a0967e6ff251c09b.yml +openapi_spec_hash: 9369ccc9c0289e9d6f641a526d244d1c +config_hash: 1ae003838971335aac550f3ad5872f54 diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f81295a9..a9b0590c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,38 @@ * **tracing:** emit OTel metrics for async span queue depth, batch drain, and SGP export success/failure (HTTP status labels). Disable SDK-side recording with ``AGENTEX_TRACING_METRICS=0``. +## 0.15.0 (2026-06-23) + +Full Changelog: [agentex-client-v0.14.0...agentex-client-v0.15.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-client-v0.14.0...agentex-client-v0.15.0) + +### Features + +* **api:** add webhook endpoint ([37c7d9d](https://github.com/scaleapi/scale-agentex-python/commit/37c7d9d465943184ab84922ba1079b939516d534)) +* **claude-code:** stream-json parser tap for the unified harness surface ([#420](https://github.com/scaleapi/scale-agentex-python/issues/420)) ([904339c](https://github.com/scaleapi/scale-agentex-python/commit/904339c21b8cd641a02d903c03d4a8730b4d7e84)) +* **codex:** event-stream parser tap for the unified harness surface ([#421](https://github.com/scaleapi/scale-agentex-python/issues/421)) ([9b2b031](https://github.com/scaleapi/scale-agentex-python/commit/9b2b03144cc67bb497e0a301686207aba2629758)) +* **harness:** public adk facade + docs for the unified harness surface (PR 9) ([#423](https://github.com/scaleapi/scale-agentex-python/issues/423)) ([fa60632](https://github.com/scaleapi/scale-agentex-python/commit/fa60632f9be84315a3fdc627745ae5b605994bd8)) +* **harness:** unified harness surface — foundation (span derivation, delivery adapters, emitter) ([#412](https://github.com/scaleapi/scale-agentex-python/issues/412)) ([a9cacf4](https://github.com/scaleapi/scale-agentex-python/commit/a9cacf4eb71697351ee658a570636f04bbf31ad5)) +* **langgraph:** migrate LangGraph harness onto unified surface ([#417](https://github.com/scaleapi/scale-agentex-python/issues/417)) ([d344228](https://github.com/scaleapi/scale-agentex-python/commit/d34422845de4b80ed69d2dccfdb0c680ef2fbca3)) +* **openai-agents:** migrate onto the unified harness surface ([#416](https://github.com/scaleapi/scale-agentex-python/issues/416)) ([d10e151](https://github.com/scaleapi/scale-agentex-python/commit/d10e1510bd5da44ad5acc5cac638750122083fce)) +* **pydantic-ai:** migrate onto unified harness surface (PR4) ([#415](https://github.com/scaleapi/scale-agentex-python/issues/415)) ([5ec62c2](https://github.com/scaleapi/scale-agentex-python/commit/5ec62c20781d24fc3e0b92734fcd444b1e791d70)) +* **sdk:** add webhook helper for forward-route handlers ([#419](https://github.com/scaleapi/scale-agentex-python/issues/419)) ([514075d](https://github.com/scaleapi/scale-agentex-python/commit/514075de2189f33be4ade0ac84368019e55ed7ea)) +* **streaming:** stream tool call argument deltas in TemporalStreamingModel ([#355](https://github.com/scaleapi/scale-agentex-python/issues/355)) ([c8de1d4](https://github.com/scaleapi/scale-agentex-python/commit/c8de1d4c9c3b5b3c16ad4aaf9644c1ba0d618757)) +* **tracing:** skip Agentex span-start write by default (end-only ingest) ([#438](https://github.com/scaleapi/scale-agentex-python/issues/438)) ([10d22a2](https://github.com/scaleapi/scale-agentex-python/commit/10d22a27091c9c410ae808dab9cfce5dab3816a8)) + + +### Bug Fixes + +* **harness:** assert cross-channel (yield vs auto-send) conformance equivalence [AGX1-373] ([#414](https://github.com/scaleapi/scale-agentex-python/issues/414)) ([694960f](https://github.com/scaleapi/scale-agentex-python/commit/694960f913b8ba521d9236e876e5e00f57a3a3ff)) +* **harness:** correct codex & openai reasoning stream envelopes ([#441](https://github.com/scaleapi/scale-agentex-python/issues/441)) ([1d86e8a](https://github.com/scaleapi/scale-agentex-python/commit/1d86e8a47a369814540b6e853cd20240c6098f27)) +* **tests:** use relative import for assert_matches_type in webhooks test ([#440](https://github.com/scaleapi/scale-agentex-python/issues/440)) ([5954a9f](https://github.com/scaleapi/scale-agentex-python/commit/5954a9fc8c7961ef5ceb41abf3ca32e6e78590c5)) +* **tracing:** fail open temporal span activities ([#437](https://github.com/scaleapi/scale-agentex-python/issues/437)) ([2d63eef](https://github.com/scaleapi/scale-agentex-python/commit/2d63eef53bdb919bb6568e04708e3b7abcb8075b)) + + +### Refactors + +* **cli:** migrate existing langgraph/pydantic-ai templates to unified surface ([#429](https://github.com/scaleapi/scale-agentex-python/issues/429)) ([ee41408](https://github.com/scaleapi/scale-agentex-python/commit/ee41408c420eba5c6b8fe8719c8ebd445dcd220c)) +* **tutorials:** migrate to the unified harness surface + renumber ([#428](https://github.com/scaleapi/scale-agentex-python/issues/428)) ([ebaf617](https://github.com/scaleapi/scale-agentex-python/commit/ebaf617256c7971dde12fd7e25f02b05f2f42fca)) + ## 0.14.0 (2026-06-22) Full Changelog: [agentex-client-v0.13.1...agentex-client-v0.14.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-client-v0.13.1...agentex-client-v0.14.0) diff --git a/adk/CHANGELOG.md b/adk/CHANGELOG.md index 8c15355d9..ac7404e6b 100644 --- a/adk/CHANGELOG.md +++ b/adk/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.14.0 (2026-06-23) + +Full Changelog: [agentex-sdk-v0.13.2...agentex-sdk-v0.14.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-sdk-v0.13.2...agentex-sdk-v0.14.0) + +### Features + +* **harness:** public adk facade + docs for the unified harness surface (PR 9) ([#423](https://github.com/scaleapi/scale-agentex-python/issues/423)) ([fa60632](https://github.com/scaleapi/scale-agentex-python/commit/fa60632f9be84315a3fdc627745ae5b605994bd8)) + ## 0.13.2 (2026-06-22) Full Changelog: [agentex-sdk-v0.13.1...agentex-sdk-v0.13.2](https://github.com/scaleapi/scale-agentex-python/compare/agentex-sdk-v0.13.1...agentex-sdk-v0.13.2) diff --git a/adk/docs/harness.md b/adk/docs/harness.md new file mode 100644 index 000000000..6a9d8947a --- /dev/null +++ b/adk/docs/harness.md @@ -0,0 +1,196 @@ +# Unified Harness Surface + +The unified harness surface gives every agent harness (pydantic-ai, LangGraph, OpenAI Agents, and future parsers) a single, shared path to streaming, message persistence, and tracing. The Agentex `StreamTaskMessage*` event stream is the canonical wire format. A harness tap produces that stream once; the shared machinery delivers it and derives spans from it. + +All public names are re-exported from `agentex.lib.adk`: + +```python +from agentex.lib.adk import ( + UnifiedEmitter, + SpanTracer, + TurnUsage, + TurnResult, + HarnessTurn, + StreamTaskMessage, + OpenSpan, + CloseSpan, + SpanSignal, +) +``` + +The implementation lives at `src/agentex/lib/core/harness/`. + +--- + +## The canonical stream: `StreamTaskMessage` + +`StreamTaskMessage` is a union of the four wire-protocol update types: + +``` +StreamTaskMessageStart - opens a content slot (text, reasoning, tool request, ...) +StreamTaskMessageDelta - appends a token/fragment to an open slot +StreamTaskMessageFull - posts a complete message in one shot (tool response, ...) +StreamTaskMessageDone - closes an open slot +``` + +Every harness tap produces a sequence of these. Everything downstream (delivery, tracing) reads the same sequence. + +--- + +## Per-harness taps: `convert__to_agentex_events` + +A tap is an async generator that translates the harness's native event stream into `StreamTaskMessage*` events. The currently shipped taps are: + +| Harness | Tap function | Exported from | +|---|---|---| +| pydantic-ai | `convert_pydantic_ai_to_agentex_events` | `agentex.lib.adk` | +| LangGraph | `convert_langgraph_to_agentex_events` | `agentex.lib.adk` | + +Taps for claude-code and codex will be added in subsequent PRs (AGX1-420, AGX1-421) and exported from `agentex.lib.adk` in the same way. + +--- + +## `HarnessTurn` protocol + +`HarnessTurn` is the interface a harness turn object must satisfy to plug into `UnifiedEmitter`: + +```python +@runtime_checkable +class HarnessTurn(Protocol): + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... +``` + +`events` is the canonical stream for this turn. `usage()` is valid only after `events` is exhausted (async generators cannot cleanly return a value to the consumer, so usage travels out-of-band). + +--- + +## `TurnUsage` + +Token counts and cost for one turn, harness-independent: + +```python +class TurnUsage(BaseModel): + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + num_llm_calls: int = 0 + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 +``` + +Field names align with `agentex.lib.core.observability.llm_metrics` for easy conversion. + +--- + +## `UnifiedEmitter` + +`UnifiedEmitter` ties a turn's canonical stream, tracing context, and delivery mode together. Construct one per turn with the task/trace context from the request: + +```python +emitter = UnifiedEmitter( + task_id=params.task.id, + trace_id=params.task.id, # or None to disable tracing + parent_span_id=turn_span.id if turn_span else None, +) +``` + +**Tracing is on by default** when `trace_id` is provided. To disable it explicitly, pass `tracer=False`. To inject a custom `SpanTracer` (e.g. in tests), pass it as `tracer=`. + +### Delivery mode 1: `yield_turn` (sync HTTP ACP) + +For sync ACP agents that return events directly over the HTTP response: + +```python +@acp.on_message_send +async def handle(params): + turn = MyHarnessTurn(params) # implements HarnessTurn + async for event in emitter.yield_turn(turn): + yield event +``` + +`yield_turn` forwards each event to the caller and traces spans as a side effect. It is a passthrough when `tracer` is `None`. + +### Delivery mode 2: `auto_send_turn` (async/Temporal) + +For async or Temporal agents that push to the task stream via Redis: + +```python +result: TurnResult = await emitter.auto_send_turn(turn, created_at=workflow.now()) +``` + +`auto_send_turn` drives `adk.streaming` contexts for every message in the stream, derives and records spans, and returns a `TurnResult` with the final text and usage. Pass `created_at` under Temporal to back-date message timestamps deterministically. + +--- + +## `TurnResult` + +```python +class TurnResult(BaseModel): + final_text: str = "" + usage: TurnUsage = TurnUsage() +``` + +Returned by `auto_send_turn`. `final_text` is the last text segment of the turn (multi-step runs return only the final segment, matching `stream_langgraph_events` / `stream_pydantic_ai_events` semantics). + +--- + +## Tracing: span derivation + +Spans are derived from the canonical stream by `SpanDeriver` (pure, no `adk` dependency) and dispatched to `adk.tracing` by `SpanTracer`. The mapping: + +- `StreamTaskMessageStart(ToolRequestContent)` + `StreamTaskMessageDone` on that index -> tool span open (keyed by `tool_call_id`) +- `StreamTaskMessageFull(ToolResponseContent)` whose `tool_call_id` was opened -> tool span close +- `StreamTaskMessageFull(ToolRequestContent)` (harnesses that emit tool calls as Full) -> opens a tool span; matching `Full(ToolResponseContent)` closes it +- `StreamTaskMessageStart(ReasoningContent)` + `StreamTaskMessageDone` -> reasoning span + +`SpanTracer` is `SpanDeriver`'s consumer. You can inject a custom `SpanTracer` via `UnifiedEmitter(tracer=)` for advanced use or testing. + +--- + +## Usage examples by channel + +### Sync ACP (pydantic-ai tap) + +```python +import agentex.lib.adk as adk +from agentex.lib.adk import UnifiedEmitter, convert_pydantic_ai_to_agentex_events + +@acp.on_message_send +async def handle(params): + task_id = params.task.id + async with adk.tracing.span(trace_id=task_id, name="message", ...) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + tap = convert_pydantic_ai_to_agentex_events(pydantic_stream) + # wrap tap in a HarnessTurn then yield_turn, or yield directly: + async for event in tap: + yield event +``` + +For the pre-unified sync path the tap is still yielded directly; `UnifiedEmitter.yield_turn` is the forward-looking integration point when a `HarnessTurn` wrapper is available. + +### Async Temporal (auto-send) + +```python +from agentex.lib.adk import UnifiedEmitter + +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=parent_span_id, +) +result = await emitter.auto_send_turn(turn, created_at=workflow.now()) +# result.final_text — last text segment +# result.usage — TurnUsage (tokens, cost, ...) +``` diff --git a/adk/pyproject.toml b/adk/pyproject.toml index 946367d7f..1d8c00a40 100644 --- a/adk/pyproject.toml +++ b/adk/pyproject.toml @@ -4,7 +4,7 @@ # (agentex/{__init__.py, _*.py, types/, resources/}) ships from the slim # sibling package `agentex-client` which is pinned as a runtime dep. name = "agentex-sdk" -version = "0.13.2" +version = "0.14.0" description = "Agent Development Kit (ADK) overlay for the Agentex API — FastACP server, Temporal workflows, LLM provider integrations, observability" license = "Apache-2.0" authors = [ diff --git a/api.md b/api.md index 4c0d9b9c1..7c1b4eb68 100644 --- a/api.md +++ b/api.md @@ -245,3 +245,15 @@ Methods: - client.checkpoints.get_tuple(\*\*params) -> Optional[CheckpointGetTupleResponse] - client.checkpoints.put(\*\*params) -> CheckpointPutResponse - client.checkpoints.put_writes(\*\*params) -> None + +# Webhooks + +Types: + +```python +from agentex.types import WebhookCreateWebhookTriggerResponse +``` + +Methods: + +- client.webhooks.create_webhook_trigger(\*\*params) -> WebhookCreateWebhookTriggerResponse diff --git a/examples/tutorials/00_sync/030_langgraph/README.md b/examples/tutorials/00_sync/030_langgraph/README.md index e5b1db0f7..5a68792cc 100644 --- a/examples/tutorials/00_sync/030_langgraph/README.md +++ b/examples/tutorials/00_sync/030_langgraph/README.md @@ -1,43 +1,50 @@ -# Tutorial 030: Sync LangGraph Agent +# Tutorial: Sync LangGraph Agent -This tutorial demonstrates how to build a **synchronous** LangGraph agent on AgentEx with: -- Tool calling (ReAct pattern) -- Streaming token output -- Multi-turn conversation memory via AgentEx checkpointer -- Tracing integration +This tutorial demonstrates how to build a **synchronous** LangGraph agent on AgentEx +using the **unified harness surface**: -## Graph Structure +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +async for event in emitter.yield_turn(turn): + yield event +``` -![Graph](graph.png) +The `LangGraphTurn` + `UnifiedEmitter` path replaces calling the lower-level +``convert_langgraph_to_agentex_events`` helper directly. ## Key Concepts -### Sync ACP -The sync ACP model uses HTTP request/response for communication. The `@acp.on_message_send` handler receives a message and yields streaming events back to the client. +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). + +`UnifiedEmitter.yield_turn(turn)` iterates the turn's events and yields them +to the sync ACP handler unchanged. The same `LangGraphTurn` object can also be +passed to `UnifiedEmitter.auto_send_turn` in the async/temporal channels. -### LangGraph Integration -- **StateGraph**: Defines the agent's state machine with `AgentState` (message history) -- **ToolNode**: Automatically executes tool calls from the LLM -- **tools_condition**: Routes between tool execution and final response -- **Checkpointer**: Uses AgentEx's HTTP checkpointer for cross-request memory +### AGX1-377 Note -### Streaming -The agent streams tokens as they're generated using `convert_langgraph_to_agentex_events()`, which converts LangGraph's stream events into AgentEx `TaskMessageUpdate` events. +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. ## Files | File | Description | |------|-------------| -| `project/acp.py` | ACP server and message handler | -| `project/graph.py` | LangGraph state graph definition | +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + yield_turn) | +| `project/graph.py` | LangGraph state graph (weather example) | | `project/tools.py` | Tool definitions (weather example) | | `tests/test_agent.py` | Integration tests | -| `manifest.yaml` | Agent configuration | +| `manifest.yaml` | Agent configuration (name: s030-langgraph) | ## Running Locally ```bash -# From this directory agentex agents run ``` diff --git a/examples/tutorials/00_sync/030_langgraph/graph.png b/examples/tutorials/00_sync/030_langgraph/graph.png deleted file mode 100644 index 16d22a1e7..000000000 Binary files a/examples/tutorials/00_sync/030_langgraph/graph.png and /dev/null differ diff --git a/examples/tutorials/00_sync/030_langgraph/manifest.yaml b/examples/tutorials/00_sync/030_langgraph/manifest.yaml index bfe005626..9a52a3dce 100644 --- a/examples/tutorials/00_sync/030_langgraph/manifest.yaml +++ b/examples/tutorials/00_sync/030_langgraph/manifest.yaml @@ -17,7 +17,7 @@ local_development: agent: acp_type: sync name: s030-langgraph - description: A sync LangGraph agent with tool calling and streaming + description: A sync LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) temporal: enabled: false @@ -47,7 +47,7 @@ deployment: global: agent: name: "s030-langgraph" - description: "A sync LangGraph agent with tool calling and streaming" + description: "A sync LangGraph agent using the unified harness surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/00_sync/030_langgraph/project/acp.py b/examples/tutorials/00_sync/030_langgraph/project/acp.py index 517a00322..e42b0f4ea 100644 --- a/examples/tutorials/00_sync/030_langgraph/project/acp.py +++ b/examples/tutorials/00_sync/030_langgraph/project/acp.py @@ -1,8 +1,20 @@ -""" -ACP (Agent Communication Protocol) handler for Agentex. - -This is the API layer — it manages the graph lifecycle and streams -tokens and tool calls from the LangGraph graph to the Agentex frontend. +"""ACP handler for the sync LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.yield_turn`` converts it into +the AgentEx ``TaskMessageUpdate`` event stream expected by the sync ACP. + +Properties of the unified surface: +- Tracing is wired through the tracing manager (no bespoke handler boilerplate). +- No manual text-delta accumulation for the span output. +- Tool calls are emitted as ``StreamTaskMessageFull`` (not Start+Delta+Done) + via the same code path as the async/temporal channels. +- Usage data (token counts) is captured on the ``LangGraphTurn`` object and + can be read after the turn completes. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. """ from __future__ import annotations @@ -16,29 +28,29 @@ import agentex.lib.adk as adk from project.graph import create_graph -from agentex.lib.adk import create_langgraph_tracing_handler, convert_langgraph_to_agentex_events from agentex.lib.types.acp import SendMessageParams from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import TaskMessageUpdate from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) -# Register the Agentex tracing processor so spans are shipped to the backend add_tracing_processor_config( SGPTracingProcessorConfig( sgp_api_key=os.environ.get("SGP_API_KEY", ""), sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), - )) -# Create ACP server + ) +) + acp = FastACP.create(acp_type="sync") -# Compiled graph (lazy-initialized on first request) _graph = None @@ -54,41 +66,42 @@ async def get_graph(): async def handle_message_send( params: SendMessageParams, ) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: - """Handle incoming messages from Agentex, streaming tokens and tool calls.""" + """Handle incoming messages, streaming tokens and tool calls via unified harness.""" graph = await get_graph() - thread_id = params.task.id + task_id = params.task.id user_message = params.content.content - logger.info(f"Processing message for thread {thread_id}") + logger.info(f"Processing message for task {task_id}") async with adk.tracing.span( - trace_id=thread_id, + trace_id=task_id, + task_id=task_id, name="message", input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - callback = create_langgraph_tracing_handler( - trace_id=thread_id, - parent_span_id=turn_span.id if turn_span else None, - ) - stream = graph.astream( {"messages": [{"role": "user", "content": user_message}]}, - config={ - "configurable": {"thread_id": thread_id}, - "callbacks": [callback], - }, + config={"configurable": {"thread_id": task_id}}, stream_mode=["messages", "updates"], ) + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + final_text = "" - async for event in convert_langgraph_to_agentex_events(stream): - # Accumulate text deltas for span output + async for event in emitter.yield_turn(turn): + # Accumulate text deltas so the span's final_output is the assistant + # text (matching the async tutorial), not the usage metrics. delta = getattr(event, "delta", None) if isinstance(delta, TextDelta) and delta.text_delta: final_text += delta.text_delta yield event if turn_span: - turn_span.output = {"final_output": final_text} + turn_span.output = {"final_output": final_text, "usage": turn.usage().model_dump()} diff --git a/examples/tutorials/00_sync/030_langgraph/project/graph.py b/examples/tutorials/00_sync/030_langgraph/project/graph.py index 53728cd58..6709719e5 100644 --- a/examples/tutorials/00_sync/030_langgraph/project/graph.py +++ b/examples/tutorials/00_sync/030_langgraph/project/graph.py @@ -1,8 +1,7 @@ -""" -LangGraph graph definition. +"""LangGraph graph definition for the 030_langgraph sync agent. -Defines the state, nodes, edges, and compiles the graph. -The compiled graph is the boundary between this module and the API layer. +Identical to ``030_langgraph/project/graph.py`` — the graph definition is not +affected by the harness migration. Only ``acp.py`` changes. """ from __future__ import annotations @@ -35,15 +34,12 @@ class AgentState(TypedDict): """State schema for the agent graph.""" + messages: Annotated[list[Any], add_messages] async def create_graph(): - """Create and compile the agent graph with checkpointer. - - Returns: - A compiled LangGraph StateGraph ready for invocation. - """ + """Create and compile the agent graph with checkpointer.""" llm = ChatOpenAI( model=MODEL_NAME, reasoning={"effort": "high", "summary": "auto"}, @@ -56,9 +52,7 @@ def agent_node(state: AgentState) -> dict[str, Any]: """Process the current state and generate a response.""" messages = state["messages"] if not messages or not isinstance(messages[0], SystemMessage): - system_content = SYSTEM_PROMPT.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ) + system_content = SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) messages = [SystemMessage(content=system_content)] + messages response = llm_with_tools.invoke(messages) return {"messages": [response]} diff --git a/examples/tutorials/00_sync/030_langgraph/project/tools.py b/examples/tutorials/00_sync/030_langgraph/project/tools.py index 1b402a906..b3e5dba34 100644 --- a/examples/tutorials/00_sync/030_langgraph/project/tools.py +++ b/examples/tutorials/00_sync/030_langgraph/project/tools.py @@ -1,9 +1,4 @@ -""" -Tool definitions for the LangGraph agent. - -Add your custom tools here. Each tool should be a function decorated with @tool -or created using the Tool class. -""" +"""Tool definitions for the 030_langgraph sync agent.""" from langchain_core.tools import Tool @@ -17,16 +12,13 @@ def get_weather(city: str) -> str: Returns: A string describing the weather conditions. """ - # TODO: Replace with actual weather API call return f"The weather in {city} is sunny and 72°F" -# Define tools weather_tool = Tool( name="get_weather", func=get_weather, description="Get the current weather for a city. Input should be a city name.", ) -# Export all tools as a list TOOLS = [weather_tool] diff --git a/examples/tutorials/00_sync/030_langgraph/pyproject.toml b/examples/tutorials/00_sync/030_langgraph/pyproject.toml index fc9f99971..33bea16b5 100644 --- a/examples/tutorials/00_sync/030_langgraph/pyproject.toml +++ b/examples/tutorials/00_sync/030_langgraph/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "s030-langgraph" version = "0.1.0" -description = "A sync LangGraph agent with tool calling and streaming" +description = "A sync LangGraph agent using the unified harness surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ diff --git a/examples/tutorials/00_sync/030_langgraph/tests/test_agent.py b/examples/tutorials/00_sync/030_langgraph/tests/test_agent.py index 36fcf418f..dabd83e76 100644 --- a/examples/tutorials/00_sync/030_langgraph/tests/test_agent.py +++ b/examples/tutorials/00_sync/030_langgraph/tests/test_agent.py @@ -1,14 +1,8 @@ """ -Tests for the sync LangGraph agent. +Tests for the sync harness LangGraph agent. -This test suite validates: -- Non-streaming message sending with tool-calling LangGraph agent -- Streaming message sending with token-by-token output - -To run these tests: -1. Make sure the agent is running (via docker-compose or `agentex agents run`) -2. Set the AGENTEX_API_BASE_URL environment variable if not using default -3. Run: pytest test_agent.py -v +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) +end-to-end against a live AgentEx server. Configuration: - AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) @@ -25,26 +19,22 @@ from agentex.types.agent_rpc_params import ParamsCreateTaskRequest, ParamsSendMessageRequest from agentex.lib.sdk.fastacp.base.base_acp_server import uuid -# Configuration from environment variables AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") AGENT_NAME = os.environ.get("AGENT_NAME", "s030-langgraph") @pytest.fixture def client(): - """Create an AgentEx client instance for testing.""" return Agentex(base_url=AGENTEX_API_BASE_URL) @pytest.fixture def agent_name(): - """Return the agent name for testing.""" return AGENT_NAME @pytest.fixture def agent_id(client, agent_name): - """Retrieve the agent ID based on the agent name.""" agents = client.agents.list() for agent in agents: if agent.name == agent_name: @@ -53,10 +43,7 @@ def agent_id(client, agent_name): class TestNonStreamingMessages: - """Test non-streaming message sending with LangGraph agent.""" - def test_send_simple_message(self, client: Agentex, agent_name: str): - """Test sending a simple message and receiving a response.""" response = client.agents.send_message( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -72,7 +59,6 @@ def test_send_simple_message(self, client: Agentex, agent_name: str): assert len(result) >= 1 def test_tool_calling(self, client: Agentex, agent_name: str): - """Test that the agent can use tools (e.g., weather tool).""" response = client.agents.send_message( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -88,12 +74,10 @@ def test_tool_calling(self, client: Agentex, agent_name: str): assert len(result) >= 1 def test_multiturn_conversation(self, client: Agentex, agent_name: str, agent_id: str): - """Test multi-turn conversation with memory via LangGraph checkpointer.""" task_response = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None - # First message response1 = client.agents.send_message( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -107,7 +91,6 @@ def test_multiturn_conversation(self, client: Agentex, agent_name: str, agent_id ) assert response1.result is not None - # Second message - agent should remember the name response2 = client.agents.send_message( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -126,10 +109,7 @@ def test_multiturn_conversation(self, client: Agentex, agent_name: str, agent_id class TestStreamingMessages: - """Test streaming message sending with LangGraph agent.""" - def test_stream_simple_message(self, client: Agentex, agent_name: str): - """Test streaming a simple message response.""" stream = client.agents.send_message_stream( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -140,14 +120,11 @@ def test_stream_simple_message(self, client: Agentex, agent_name: str): ) ), ) - aggregated_content, chunks = collect_streaming_response(stream) - assert aggregated_content is not None assert len(chunks) > 1, "No chunks received in streaming response." def test_stream_tool_calling(self, client: Agentex, agent_name: str): - """Test streaming with tool calls.""" stream = client.agents.send_message_stream( agent_name=agent_name, params=ParamsSendMessageRequest( @@ -158,9 +135,7 @@ def test_stream_tool_calling(self, client: Agentex, agent_name: str): ) ), ) - aggregated_content, chunks = collect_streaming_response(stream) - assert aggregated_content is not None assert len(chunks) > 0, "No chunks received in streaming response." diff --git a/examples/tutorials/00_sync/040_pydantic_ai/README.md b/examples/tutorials/00_sync/040_pydantic_ai/README.md index 02c3b57c7..ef52c7c77 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/README.md +++ b/examples/tutorials/00_sync/040_pydantic_ai/README.md @@ -1,46 +1,52 @@ -# Tutorial 040: Sync Pydantic AI Agent +# Sync Pydantic AI Agent -This tutorial demonstrates how to build a **synchronous** Pydantic AI agent on AgentEx with: -- Tool calling (Pydantic AI handles the tool loop internally) -- Streaming token output (including token-by-token tool-call argument streaming) +A minimal **synchronous** Pydantic AI agent that drives the **unified harness +surface** (`UnifiedEmitter.yield_turn` + `PydanticAITurn`) on the sync +(HTTP-yield) channel. -## Key Concepts +## Why this agent exists -### Sync ACP -The sync ACP model uses HTTP request/response for communication. The `@acp.on_message_send` handler receives a message and yields streaming events back to the client. +This agent is the sync coverage for the unified surface: it shows an agent +author wiring the sync channel through `UnifiedEmitter.yield_turn` and getting +automatic span derivation (tool spans nested under the per-turn span) for free, +exactly like the async/temporal channels. -### Pydantic AI Integration -- **Agent**: A single `pydantic_ai.Agent` that owns the model and tools. No graph required — Pydantic AI runs its own tool-call loop until the model is done. -- **`@agent.tool_plain`**: Registers a Python function as a tool. Pydantic AI infers the schema from type hints and docstring. -- **`agent.run_stream_events(...)`**: Yields `AgentStreamEvent`s (PartStartEvent / PartDeltaEvent / PartEndEvent / FunctionToolResultEvent) as the model produces them. +## How it wires the unified surface -### Streaming -The agent streams tokens and tool-call arguments as they're generated using `convert_pydantic_ai_to_agentex_events()`, which adapts Pydantic AI's stream into AgentEx `TaskMessageUpdate` events. Notably, **tool-call arguments stream as `ToolRequestDelta` tokens** rather than arriving as a single complete payload — a richer experience than what OpenAI Agents SDK currently exposes. +In `project/acp.py`: -## Files +```python +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, +) +async with agent.run_stream_events(user_message) as stream: + turn = PydanticAITurn(stream, model=MODEL_NAME) # coalesce off: stream tool-call arg tokens + async for ev in emitter.yield_turn(turn): + yield ev +``` -| File | Description | -|------|-------------| -| `project/acp.py` | ACP server and message handler | -| `project/agent.py` | Pydantic AI agent + tool registration | -| `project/tools.py` | Tool definitions (weather example) | -| `tests/test_agent.py` | Integration tests | -| `manifest.yaml` | Agent configuration | +- `coalesce_tool_requests=False` (the default) preserves token-by-token + tool-call argument streaming on the sync channel. +- The `UnifiedEmitter` is constructed from the ACP/streaming context + (`task_id` + `trace_id` + `parent_span_id`) so tool spans nest under the + per-turn `AGENT_WORKFLOW` span automatically. -## Running Locally +## Files -```bash -# From this directory -agentex agents run -``` +- `project/acp.py` — sync ACP handler using `emitter.yield_turn(...)`. +- `project/agent.py` — builds the `pydantic_ai.Agent` with one tool. +- `project/tools.py` — `get_weather(city)` returning a constant. +- `tests/test_agent.py` — live integration test (requires a running agent). -## Running Tests +## Tools -```bash -pytest tests/test_agent.py -v -``` +- `get_weather(city: str) -> str`: returns a fixed "sunny and 72°F" string so a + run deterministically exercises text + a tool call + a tool response. -## Notes +## Offline coverage -- Multi-turn conversation memory is not wired in this tutorial. Pydantic AI does not ship a checkpointer like LangGraph; to add memory, load prior messages via `adk.messages.list(task_id=...)` and pass them to `agent.run_stream_events(..., message_history=...)`. -- Reasoning/thinking tokens are not exercised here because `gpt-4o-mini` does not emit `ThinkingPart`s. Swap to a reasoning-capable model (e.g. `openai:o1-mini` via Pydantic AI's appropriate provider) if you want to test that branch end-to-end. +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no network) live in the SDK repo under +`tests/lib/core/harness/` (the pydantic-ai sync suite). diff --git a/examples/tutorials/00_sync/040_pydantic_ai/manifest.yaml b/examples/tutorials/00_sync/040_pydantic_ai/manifest.yaml index 68d3b4a00..9563de39c 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/manifest.yaml +++ b/examples/tutorials/00_sync/040_pydantic_ai/manifest.yaml @@ -17,7 +17,7 @@ local_development: agent: acp_type: sync name: s040-pydantic-ai - description: A sync Pydantic AI agent with tool calling and streaming + description: A sync Pydantic AI harness test agent using the unified emitter surface temporal: enabled: false @@ -47,7 +47,7 @@ deployment: global: agent: name: "s040-pydantic-ai" - description: "A sync Pydantic AI agent with tool calling and streaming" + description: "A sync Pydantic AI harness test agent using the unified emitter surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/00_sync/040_pydantic_ai/project/acp.py b/examples/tutorials/00_sync/040_pydantic_ai/project/acp.py index 0c096893f..f23cd7960 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/project/acp.py +++ b/examples/tutorials/00_sync/040_pydantic_ai/project/acp.py @@ -1,7 +1,17 @@ -"""ACP (Agent Communication Protocol) handler for Agentex. - -This is the API layer — it owns the agent lifecycle and streams tokens -and tool calls from the Pydantic AI agent to the Agentex frontend. +"""ACP handler for the sync harness Pydantic AI test agent. + +This agent exercises the UNIFIED HARNESS SURFACE on the sync (HTTP-yield) +channel — ``UnifiedEmitter.yield_turn(PydanticAITurn(...))`` — rather than the +bare ``convert_pydantic_ai_to_agentex_events`` converter used by the +``040_pydantic_ai`` tutorial. The unified surface gives the sync channel the +same tracing (span derivation) the async/temporal channels get for free. + +Flow: +1. Open a per-turn AGENT_WORKFLOW span via ``adk.tracing.span``. +2. Construct a ``UnifiedEmitter`` from the ACP/streaming context (task_id + + trace_id + parent_span_id) so tool spans nest under the turn span. +3. Wrap ``agent.run_stream_events(...)`` in a ``PydanticAITurn`` and forward + events with ``emitter.yield_turn(turn)`` — yielding each to the client. """ from __future__ import annotations @@ -14,17 +24,15 @@ load_dotenv() import agentex.lib.adk as adk -from project.agent import create_agent -from agentex.lib.adk import ( - create_pydantic_ai_tracing_handler, - convert_pydantic_ai_to_agentex_events, -) +from project.agent import MODEL_NAME, create_agent from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.sdk.fastacp.fastacp import FastACP from agentex.types.task_message_update import TaskMessageUpdate from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) @@ -54,7 +62,7 @@ def get_agent(): async def handle_message_send( params: SendMessageParams, ) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: - """Handle incoming messages from Agentex, streaming tokens and tool calls.""" + """Handle incoming messages, streaming events through the unified surface.""" agent = get_agent() task_id = params.task.id @@ -68,11 +76,17 @@ async def handle_message_send( input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - tracing_handler = create_pydantic_ai_tracing_handler( + # Construct the UnifiedEmitter from the ACP/streaming context so tracing + # is automatic: tool spans nest under this turn's span. + emitter = UnifiedEmitter( + task_id=task_id, trace_id=task_id, parent_span_id=turn_span.id if turn_span else None, - task_id=task_id, ) + async with agent.run_stream_events(user_message) as stream: - async for event in convert_pydantic_ai_to_agentex_events(stream, tracing_handler=tracing_handler): - yield event + # PydanticAITurn preserves token-by-token tool-call argument + # streaming (Start+Delta+Done) on the sync/HTTP channel. + turn = PydanticAITurn(stream, model=MODEL_NAME) + async for ev in emitter.yield_turn(turn): + yield ev diff --git a/examples/tutorials/00_sync/040_pydantic_ai/project/agent.py b/examples/tutorials/00_sync/040_pydantic_ai/project/agent.py index 2c0f6f10c..72fd74173 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/project/agent.py +++ b/examples/tutorials/00_sync/040_pydantic_ai/project/agent.py @@ -1,4 +1,4 @@ -"""Pydantic AI agent definition. +"""Pydantic AI agent definition for the sync harness test agent. The Agent is the boundary between this module and the API layer (acp.py). Pydantic AI handles its own tool-call loop internally — no graph required. @@ -12,6 +12,8 @@ from project.tools import get_weather +__all__ = ["create_agent", "MODEL_NAME"] + MODEL_NAME = "openai:gpt-4o-mini" SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. @@ -29,9 +31,7 @@ def create_agent() -> Agent: """Build and return the Pydantic AI agent with tools registered.""" agent = Agent( MODEL_NAME, - system_prompt=SYSTEM_PROMPT.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ), + system_prompt=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), ) agent.tool_plain(get_weather) diff --git a/examples/tutorials/00_sync/040_pydantic_ai/project/tools.py b/examples/tutorials/00_sync/040_pydantic_ai/project/tools.py index bab87942a..d649c75f1 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/project/tools.py +++ b/examples/tutorials/00_sync/040_pydantic_ai/project/tools.py @@ -1,8 +1,8 @@ -"""Tool definitions for the Pydantic AI agent. +"""Tool definitions for the sync harness Pydantic AI agent. Pydantic AI tools are registered directly on the Agent via decorators -(see project.agent). This module hosts the bare functions so they're -easy to unit-test in isolation. +(see project.agent). This module hosts the bare function so it is easy to +unit-test in isolation. """ from __future__ import annotations diff --git a/examples/tutorials/00_sync/040_pydantic_ai/pyproject.toml b/examples/tutorials/00_sync/040_pydantic_ai/pyproject.toml index 3e645fa15..748a9f3cb 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/pyproject.toml +++ b/examples/tutorials/00_sync/040_pydantic_ai/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "s040-pydantic-ai" version = "0.1.0" -description = "A sync Pydantic AI agent with tool calling and streaming" +description = "A sync Pydantic AI harness test agent using the unified emitter surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ diff --git a/examples/tutorials/00_sync/040_pydantic_ai/tests/test_agent.py b/examples/tutorials/00_sync/040_pydantic_ai/tests/test_agent.py index d3deed1c7..4aad12a56 100644 --- a/examples/tutorials/00_sync/040_pydantic_ai/tests/test_agent.py +++ b/examples/tutorials/00_sync/040_pydantic_ai/tests/test_agent.py @@ -1,8 +1,10 @@ -"""Tests for the sync Pydantic AI agent. +"""Live tests for the sync Pydantic AI agent. -This test suite validates: -- Non-streaming message sending with tool-calling Pydantic AI agent -- Streaming message sending with token-by-token output +These tests require a running agent (server + deployed agent) and exercise the +unified-surface sync handler end-to-end over the wire. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in the SDK repo under ``tests/lib/core/harness/`` (the pydantic-ai sync suite). To run these tests: 1. Make sure the agent is running (via docker-compose or `agentex agents run`) @@ -50,7 +52,7 @@ def agent_id(client, agent_name): class TestNonStreamingMessages: - """Test non-streaming message sending with Pydantic AI agent.""" + """Test non-streaming message sending with the unified-surface sync agent.""" def test_send_simple_message(self, client: Agentex, agent_name: str): """Test sending a simple message and receiving a response.""" @@ -86,7 +88,7 @@ def test_tool_calling(self, client: Agentex, agent_name: str): class TestStreamingMessages: - """Test streaming message sending with Pydantic AI agent.""" + """Test streaming message sending through the unified yield_turn path.""" def test_stream_simple_message(self, client: Agentex, agent_name: str): """Test streaming a simple message response.""" @@ -107,10 +109,10 @@ def test_stream_simple_message(self, client: Agentex, agent_name: str): assert len(chunks) > 1, "No chunks received in streaming response." def test_stream_tool_calling(self, client: Agentex, agent_name: str): - """Test streaming with tool calls. + """Test streaming with tool calls through the unified surface. - This exercises the headline Pydantic AI converter feature: - tool-call argument tokens streaming through as ToolRequestDelta. + Exercises token-by-token tool-call argument streaming (coalesce off), + which the unified yield_turn path preserves on the sync channel. """ stream = client.agents.send_message_stream( agent_name=agent_name, diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/.dockerignore b/examples/tutorials/00_sync/050_openai_agents/.dockerignore similarity index 100% rename from examples/tutorials/00_sync/050_openai_agents_local_sandbox/.dockerignore rename to examples/tutorials/00_sync/050_openai_agents/.dockerignore diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/Dockerfile b/examples/tutorials/00_sync/050_openai_agents/Dockerfile similarity index 65% rename from examples/tutorials/00_sync/050_openai_agents_local_sandbox/Dockerfile rename to examples/tutorials/00_sync/050_openai_agents/Dockerfile index 8e0ec22df..c9ccd6f54 100644 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/Dockerfile +++ b/examples/tutorials/00_sync/050_openai_agents/Dockerfile @@ -23,16 +23,16 @@ RUN uv pip install --system --upgrade pip setuptools wheel ENV UV_HTTP_TIMEOUT=1000 # Copy pyproject.toml and README.md to install dependencies -COPY 00_sync/050_openai_agents_local_sandbox/pyproject.toml /app/050_openai_agents_local_sandbox/pyproject.toml -COPY 00_sync/050_openai_agents_local_sandbox/README.md /app/050_openai_agents_local_sandbox/README.md +COPY 00_sync/050_openai_agents/pyproject.toml /app/050_openai_agents/pyproject.toml +COPY 00_sync/050_openai_agents/README.md /app/050_openai_agents/README.md -WORKDIR /app/050_openai_agents_local_sandbox +WORKDIR /app/050_openai_agents # Copy the project code -COPY 00_sync/050_openai_agents_local_sandbox/project /app/050_openai_agents_local_sandbox/project +COPY 00_sync/050_openai_agents/project /app/050_openai_agents/project # Copy the test files -COPY 00_sync/050_openai_agents_local_sandbox/tests /app/050_openai_agents_local_sandbox/tests +COPY 00_sync/050_openai_agents/tests /app/050_openai_agents/tests # Copy shared test utilities COPY test_utils /app/test_utils @@ -44,7 +44,7 @@ RUN uv pip install --system .[dev] ENV PYTHONPATH=/app # Set test environment variables -ENV AGENT_NAME=s050-openai-agents-local-sandbox +ENV AGENT_NAME=s050-openai-agents # Run the agent using uvicorn CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/050_openai_agents/README.md b/examples/tutorials/00_sync/050_openai_agents/README.md new file mode 100644 index 000000000..98cec3f9a --- /dev/null +++ b/examples/tutorials/00_sync/050_openai_agents/README.md @@ -0,0 +1,35 @@ +# Sync OpenAI Agents on the unified harness surface + +A sync (HTTP) Agentex agent that runs the OpenAI Agents SDK and delivers its +output through the **unified harness surface**. + +## What this demonstrates + +The OpenAI Agents SDK produces native streaming events. This tutorial wraps a +`Runner.run_streamed` result in an `OpenAITurn` — the provider -> canonical +`StreamTaskMessage*` adapter — and forwards the canonical stream to the frontend +via `UnifiedEmitter.yield_turn`. The same `OpenAITurn` flows unchanged through +`auto_send_turn` in the async (`10_async/00_base/120_openai_agents`) and temporal +(`10_async/10_temporal/120_openai_agents`) variants; only the delivery method differs. + +```python +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, parent_span_id=parent_span_id) +async for event in emitter.yield_turn(turn): + yield event +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +## Test it + +The offline test exercises the harness wiring without a server or API key: + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/manifest.yaml b/examples/tutorials/00_sync/050_openai_agents/manifest.yaml similarity index 66% rename from examples/tutorials/00_sync/050_openai_agents_local_sandbox/manifest.yaml rename to examples/tutorials/00_sync/050_openai_agents/manifest.yaml index 8ae5b98a1..bdb47e8d8 100644 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/manifest.yaml +++ b/examples/tutorials/00_sync/050_openai_agents/manifest.yaml @@ -2,10 +2,10 @@ build: context: root: ../../ include_paths: - - 00_sync/050_openai_agents_local_sandbox + - 00_sync/050_openai_agents - test_utils - dockerfile: 00_sync/050_openai_agents_local_sandbox/Dockerfile - dockerignore: 00_sync/050_openai_agents_local_sandbox/.dockerignore + dockerfile: 00_sync/050_openai_agents/Dockerfile + dockerignore: 00_sync/050_openai_agents/.dockerignore local_development: agent: @@ -16,8 +16,8 @@ local_development: agent: acp_type: sync - name: s050-openai-agents-local-sandbox - description: A sync OpenAI Agents SDK agent using a local (unix_local) sandbox + name: s050-openai-agents + description: A sync OpenAI Agents SDK agent on the unified harness surface temporal: enabled: false @@ -39,9 +39,6 @@ agent: secret_name: sgp-client-base-url secret_key: url - env: - OPENAI_AGENTS_DISABLE_TRACING: "1" - deployment: image: repository: "" @@ -49,8 +46,8 @@ deployment: global: agent: - name: "s050-openai-agents-local-sandbox" - description: "A sync OpenAI Agents SDK agent using a local (unix_local) sandbox" + name: "s050-openai-agents" + description: "A sync OpenAI Agents SDK agent on the unified harness surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/__init__.py b/examples/tutorials/00_sync/050_openai_agents/project/__init__.py similarity index 100% rename from examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/__init__.py rename to examples/tutorials/00_sync/050_openai_agents/project/__init__.py diff --git a/examples/tutorials/00_sync/050_openai_agents/project/acp.py b/examples/tutorials/00_sync/050_openai_agents/project/acp.py new file mode 100644 index 000000000..caaa0b132 --- /dev/null +++ b/examples/tutorials/00_sync/050_openai_agents/project/acp.py @@ -0,0 +1,87 @@ +"""ACP handler for the sync OpenAI Agents harness tutorial. + +This is the API layer. It runs the OpenAI Agents SDK via ``Runner.run_streamed``, +wraps the streamed run in an ``OpenAITurn`` (the provider -> canonical +``StreamTaskMessage*`` adapter), and forwards the canonical stream to the +Agentex frontend via ``UnifiedEmitter.yield_turn`` — the same harness surface +used by the async and temporal variants of this tutorial. +""" + +from __future__ import annotations + +import os +from typing import AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +from agents import Runner + +from agentex.lib import adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client +# compatibility, so the same example works behind the Scale LiteLLM gateway. +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + +_agent = None + + +def get_agent(): + """Get or create the OpenAI Agents SDK agent instance.""" + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle incoming messages, streaming tokens and tool calls via the harness.""" + agent = get_agent() + task_id = params.task.id + user_message = params.content.content + logger.info(f"Processing message for task {task_id}") + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + result = Runner.run_streamed(starting_agent=agent, input=user_message) + turn = OpenAITurn(result=result, model=MODEL_NAME) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + async for event in emitter.yield_turn(turn): + yield event diff --git a/examples/tutorials/00_sync/050_openai_agents/project/agent.py b/examples/tutorials/00_sync/050_openai_agents/project/agent.py new file mode 100644 index 000000000..3611012fe --- /dev/null +++ b/examples/tutorials/00_sync/050_openai_agents/project/agent.py @@ -0,0 +1,47 @@ +"""OpenAI Agents SDK agent definition for the harness tutorial. + +The agent is the boundary between this module and the API layer (acp.py). +The OpenAI Agents SDK runs its own tool-call loop internally; acp.py wraps a +``Runner.run_streamed`` result with ``OpenAITurn`` so it flows through the +unified harness surface. +""" + +from __future__ import annotations + +from datetime import datetime + +from agents import Agent, function_tool, set_tracing_disabled + +from project.tools import get_weather + +# Disable the openai-agents SDK's native tracer so it doesn't ship traces to +# api.openai.com (the key may be a gateway/proxy key). Agentex tracing still +# runs via the harness + tracing manager configured in acp.py. +set_tracing_disabled(True) + +MODEL_NAME = "gpt-4o" +INSTRUCTIONS = """You are a helpful AI assistant with access to tools. + +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use the weather tool when the user asks about the weather +- Always report the real tool output back to the user +""" + + +@function_tool +def weather(city: str) -> str: + """Get the current weather for a city.""" + return get_weather(city) + + +def create_agent() -> Agent: + """Build and return the OpenAI Agents SDK agent with the weather tool.""" + return Agent( + name="Harness OpenAI Assistant", + model=MODEL_NAME, + instructions=INSTRUCTIONS.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + tools=[weather], + ) diff --git a/examples/tutorials/00_sync/050_openai_agents/project/tools.py b/examples/tutorials/00_sync/050_openai_agents/project/tools.py new file mode 100644 index 000000000..b03aa7c31 --- /dev/null +++ b/examples/tutorials/00_sync/050_openai_agents/project/tools.py @@ -0,0 +1,19 @@ +"""Tool definitions for the OpenAI Agents harness tutorial. + +The bare function lives here so it's easy to unit-test; it's wrapped as an +OpenAI Agents SDK ``function_tool`` in ``project.agent``. +""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/pyproject.toml b/examples/tutorials/00_sync/050_openai_agents/pyproject.toml similarity index 75% rename from examples/tutorials/00_sync/050_openai_agents_local_sandbox/pyproject.toml rename to examples/tutorials/00_sync/050_openai_agents/pyproject.toml index 472a6bef7..48d2481dd 100644 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/pyproject.toml +++ b/examples/tutorials/00_sync/050_openai_agents/pyproject.toml @@ -3,15 +3,15 @@ requires = ["hatchling"] build-backend = "hatchling.build" [project] -name = "s050-openai-agents-local-sandbox" +name = "s050-openai-agents" version = "0.1.0" -description = "A sync OpenAI Agents SDK agent using a local (unix_local) sandbox" +description = "A sync OpenAI Agents SDK agent on the unified harness surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ "agentex-sdk", "scale-gp", - "openai-agents>=0.14.3,<0.15", + "openai-agents", ] [project.optional-dependencies] diff --git a/examples/tutorials/00_sync/050_openai_agents/tests/test_agent.py b/examples/tutorials/00_sync/050_openai_agents/tests/test_agent.py new file mode 100644 index 000000000..960b232b7 --- /dev/null +++ b/examples/tutorials/00_sync/050_openai_agents/tests/test_agent.py @@ -0,0 +1,48 @@ +"""Offline test for the sync OpenAI Agents harness tutorial. + +This test does NOT require a running Agentex server or an OpenAI API key. It +verifies the harness wiring this tutorial demonstrates: an ``OpenAITurn`` built +from an injected canonical ``StreamTaskMessage*`` stream, forwarded through +``UnifiedEmitter.yield_turn`` (the sync HTTP ACP delivery path), passes the +events through unchanged. + +To run: ``pytest tests/test_agent.py -v`` +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_turn_forwards_canonical_stream(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + # trace_id=None disables tracing, so no Agentex server is needed. + emitter = UnifiedEmitter(task_id="task-1", trace_id=None, parent_span_id=None) + + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/README.md b/examples/tutorials/00_sync/050_openai_agents_local_sandbox/README.md deleted file mode 100644 index 9c2c81d7d..000000000 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/README.md +++ /dev/null @@ -1,113 +0,0 @@ -# Tutorial 050: Sync OpenAI Agents SDK with a Local Sandbox - -This tutorial demonstrates how to build a **synchronous** agent on AgentEx using the -[OpenAI Agents SDK](https://developers.openai.com/api/docs/guides/agents) and its -**sandbox** runtime, running with the **local** (`unix_local`) backend. - -The agent is a "local sandbox assistant": it answers questions by actually running -real shell commands (e.g. `python3 --version`, `ls /tmp`, `python3 -c "..."`) -instead of guessing. - -## Key Concepts - -### Sync ACP -The sync ACP model uses HTTP request/response for communication. The -`@acp.on_message_send` handler receives a message, runs the agent, and returns the -agent's final answer as a `TextContent`. - -### OpenAI Agents SDK Sandbox -The OpenAI Agents SDK ships `agents.sandbox`, which lets you give an agent -**capabilities** (instead of hand-written tools) that the runtime turns into real -tools backed by a sandbox: - -- **`SandboxAgent`**: an `Agent` that is granted sandbox capabilities. -- **Capabilities** (`from agents.sandbox.capabilities import Shell, Filesystem, Memory`): - each capability expands into a set of real tools. This tutorial uses `Shell`, which - lets the model run real shell commands. -- **`SandboxRunConfig`** + a sandbox **client**: tells the runtime *where* the tools - actually execute. - -### The LOCAL sandbox (`UnixLocalSandboxClient`) -This tutorial uses the local backend -(`from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClient, UnixLocalSandboxClientOptions`), -`backend_id="unix_local"`. The local sandbox runs shell commands **ON THE HOST** — -the agent's own container/process. There is **no Docker, no Temporal, and no remote -sandbox infrastructure** involved. This makes it the simplest way to give an agent a -real shell. - -The sandbox is wired up through the SDK's `RunConfig`: - -```python -from agents import Runner, set_tracing_disabled -from agents.run_config import RunConfig -from agents.sandbox import SandboxAgent, SandboxRunConfig -from agents.sandbox.capabilities import Shell -from agents.sandbox.sandboxes.unix_local import ( - UnixLocalSandboxClient, - UnixLocalSandboxClientOptions, -) - -set_tracing_disabled(True) # avoid api.openai.com tracing 401 behind a gateway - -agent = SandboxAgent( - name="Local Sandbox Assistant", - instructions="...use the shell tools to actually run commands...", - capabilities=[Shell()], -) -run_config = RunConfig( - sandbox=SandboxRunConfig( - client=UnixLocalSandboxClient(), - options=UnixLocalSandboxClientOptions(), - ) -) -result = await Runner.run(agent, input="what's the python version?", run_config=run_config) -print(result.final_output) -``` - -`Runner.run` drives the full tool-call loop internally: the model issues shell -commands, the local sandbox runs them on the host, the output is fed back, and the -loop continues until the model produces a final answer. - -## Files - -| File | Description | -|------|-------------| -| `project/acp.py` | ACP server and message handler (runs the sandbox agent) | -| `project/agent.py` | `SandboxAgent` + `RunConfig(sandbox=...)` wiring + `run_agent` | -| `project/tools.py` | Sandbox capability factory (`Shell`) | -| `tests/test_agent.py` | Integration tests | -| `manifest.yaml` | Agent configuration | - -## Running Locally - -```bash -# From this directory -agentex agents run -``` - -Set `OPENAI_API_KEY` (or `LITELLM_API_KEY` if you're behind the Scale LiteLLM -gateway) in your environment or in a `.env` file in `project/` so the agent can call -the model. - -## Running Tests - -```bash -pytest tests/test_agent.py -v -``` - -## Notes - -- **No infra required.** Because this uses the `unix_local` backend, the shell tools - run directly in the agent's process — no Docker daemon, no Temporal, no remote - sandbox. Swap the client for a remote/containerized backend to isolate execution. -- **Tracing.** `set_tracing_disabled(True)` turns off the OpenAI Agents SDK's native - tracer (which would otherwise try to ship traces to `api.openai.com`). The manifest - also sets `OPENAI_AGENTS_DISABLE_TRACING=1`. AgentEx/SGP tracing still runs via the - tracing manager configured in `acp.py` when SGP credentials are present. -- **Capabilities are the tools.** To let the agent do more, add capabilities in - `project/tools.py` (e.g. `Filesystem()`, `Memory()`). - -## Further Reading - -- OpenAI Agents SDK guide: https://developers.openai.com/api/docs/guides/agents -- The next evolution of the Agents SDK: https://openai.com/index/the-next-evolution-of-the-agents-sdk/ diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/acp.py b/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/acp.py deleted file mode 100644 index 005d679bf..000000000 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/acp.py +++ /dev/null @@ -1,77 +0,0 @@ -"""ACP (Agent Communication Protocol) handler for Agentex. - -This is the API layer — it owns the agent lifecycle and runs the OpenAI Agents -SDK *sandbox* agent for each incoming message, returning the agent's final -answer to the Agentex frontend. - -The agent uses the LOCAL sandbox backend (``UnixLocalSandboxClient``), which runs -shell commands on the host (this process/container). The OpenAI Agents SDK runs -its tool-call loop internally via ``Runner.run`` and returns the final output, so -this sync handler returns a single ``TextContent`` rather than streaming tokens. -""" - -from __future__ import annotations - -import os - -from dotenv import load_dotenv - -load_dotenv() - -from agentex.lib import adk -from project.agent import run_agent -from agentex.lib.types.acp import SendMessageParams -from agentex.lib.types.tracing import SGPTracingProcessorConfig -from agentex.lib.utils.logging import make_logger -from agentex.types.text_content import TextContent -from agentex.lib.sdk.fastacp.fastacp import FastACP -from agentex.types.task_message_content import TaskMessageContent -from agentex.lib.core.tracing.tracing_processor_manager import ( - add_tracing_processor_config, -) - -logger = make_logger(__name__) - -# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client -# compatibility, so the same example works behind the Scale LiteLLM gateway. -_litellm_key = os.environ.get("LITELLM_API_KEY") -if _litellm_key and not os.environ.get("OPENAI_API_KEY"): - os.environ["OPENAI_API_KEY"] = _litellm_key - -SGP_API_KEY = os.environ.get("SGP_API_KEY", "") -SGP_ACCOUNT_ID = os.environ.get("SGP_ACCOUNT_ID", "") -SGP_CLIENT_BASE_URL = os.environ.get("SGP_CLIENT_BASE_URL", "") - -if SGP_API_KEY and SGP_ACCOUNT_ID: - add_tracing_processor_config( - SGPTracingProcessorConfig( - sgp_api_key=SGP_API_KEY, - sgp_account_id=SGP_ACCOUNT_ID, - sgp_base_url=SGP_CLIENT_BASE_URL, - ) - ) - -acp = FastACP.create(acp_type="sync") - - -@acp.on_message_send -async def handle_message_send( - params: SendMessageParams, -) -> TaskMessageContent: - """Handle incoming messages by running the local-sandbox agent.""" - task_id = params.task.id - user_message = params.content.content - logger.info(f"Processing message for task {task_id}") - - async with adk.tracing.span( - trace_id=task_id, - task_id=task_id, - name="message", - input={"message": user_message}, - data={"__span_type__": "AGENT_WORKFLOW"}, - ) as turn_span: - final_output = await run_agent(user_message) - if turn_span: - turn_span.output = {"final_output": final_output} - - return TextContent(author="agent", content=final_output) diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/agent.py b/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/agent.py deleted file mode 100644 index d674d14c9..000000000 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/agent.py +++ /dev/null @@ -1,92 +0,0 @@ -"""OpenAI Agents SDK local-sandbox agent definition. - -This mirrors the Pydantic AI tutorial (040): the agent is the boundary between -this module and the API layer (acp.py). The difference is the runtime — here we -use the OpenAI Agents SDK ``SandboxAgent`` together with the **local** sandbox -backend (``UnixLocalSandboxClient``). - -The local sandbox runs shell commands ON THE HOST — the agent's own -container/process. There is no Docker, no Temporal, and no remote sandbox -infrastructure. The OpenAI Agents SDK runs its own tool-call loop internally: -when the model decides to run a shell command, the sandbox executes it locally -and feeds the output back to the model until it produces a final answer. -""" - -from __future__ import annotations - -from datetime import datetime - -from agents import Runner, set_tracing_disabled -from agents.sandbox import SandboxAgent, SandboxRunConfig -from agents.run_config import RunConfig -from agents.sandbox.sandboxes.unix_local import ( - UnixLocalSandboxClient, - UnixLocalSandboxClientOptions, -) - -from project.tools import get_capabilities - -# Disable the openai-agents SDK's native tracer so it doesn't ship traces to -# api.openai.com using OPENAI_API_KEY (which may be a gateway/proxy key and would -# 401). Agentex tracing still runs via the tracing manager configured in acp.py. -set_tracing_disabled(True) - -MODEL_NAME = "gpt-4o-mini" -INSTRUCTIONS = """You are a local sandbox assistant. - -Current date and time: {timestamp} - -You have access to shell tools that run real commands on the local machine. - -Guidelines: -- ALWAYS use the shell tools to actually run commands — never guess or make up - output. If the user asks for the Python version, run `python3 --version`. If - they ask to list files, run `ls`. If they ask you to compute something, use - `python3 -c "..."`. -- Run the minimal command(s) needed to answer the question. -- Report the real command output back to the user, concisely. -""" - - -def create_agent() -> SandboxAgent: - """Build and return the OpenAI Agents SDK sandbox agent. - - The agent is granted shell capabilities (see ``project.tools``). The actual - sandbox backend (where the shell commands run) is supplied at run time via - the ``RunConfig`` returned by ``create_run_config``. - """ - return SandboxAgent( - name="Local Sandbox Assistant", - model=MODEL_NAME, - instructions=INSTRUCTIONS.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ), - capabilities=get_capabilities(), - ) - - -def create_run_config() -> RunConfig: - """Build the RunConfig that points the agent at the LOCAL sandbox backend. - - ``UnixLocalSandboxClient`` (backend_id="unix_local") runs shell commands on - the host — the agent's own process — so no Docker or remote infra is needed. - """ - return RunConfig( - sandbox=SandboxRunConfig( - client=UnixLocalSandboxClient(), - options=UnixLocalSandboxClientOptions(), - ) - ) - - -async def run_agent(user_message: str) -> str: - """Run the sandbox agent on a single user message and return the final text. - - The OpenAI Agents SDK handles the full tool-call loop internally: the model - issues shell commands, the local sandbox runs them on the host, and the - output is fed back until the model produces a final answer. - """ - agent = create_agent() - run_config = create_run_config() - result = await Runner.run(agent, input=user_message, run_config=run_config, max_turns=10) - return result.final_output diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/tools.py b/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/tools.py deleted file mode 100644 index 0ad8f25ac..000000000 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/project/tools.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Sandbox capabilities for the OpenAI Agents SDK local-sandbox agent. - -Unlike the Pydantic AI tutorial (040), this agent does not register hand-written -Python functions as tools. Instead it is given *capabilities* — the OpenAI Agents -SDK sandbox runtime turns each capability into a real set of tools (run a shell -command, read a file, etc.) backed by an actual sandbox backend. - -Here we use the ``Shell`` capability, which lets the model run real shell commands. -With the local (``unix_local``) backend those commands execute ON THE HOST — the -agent's own process/container — so there is no Docker, Temporal, or remote infra -involved. This module hosts the capability factory so the agent wiring in -``project.agent`` stays readable and the capability set is easy to extend -(e.g. add ``Filesystem()`` or ``Memory()``). -""" - -from __future__ import annotations - -from agents.sandbox.capabilities import Shell - - -def get_capabilities() -> list: - """Return the sandbox capabilities the agent is allowed to use. - - Returns: - A list of OpenAI Agents SDK sandbox capabilities. We grant ``Shell`` so - the agent can run real shell commands on the local machine. Add - ``Filesystem()`` or ``Memory()`` here to expand what the agent can do. - """ - return [Shell()] diff --git a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/tests/test_agent.py b/examples/tutorials/00_sync/050_openai_agents_local_sandbox/tests/test_agent.py deleted file mode 100644 index 52ed1bf2f..000000000 --- a/examples/tutorials/00_sync/050_openai_agents_local_sandbox/tests/test_agent.py +++ /dev/null @@ -1,148 +0,0 @@ -"""Tests for the sync OpenAI Agents SDK local-sandbox agent. - -This test suite validates: -- Sending a message that requires the agent to actually run a shell command in - the LOCAL sandbox (unix_local backend) and receiving a non-empty response. - -To run these tests: -1. Make sure the agent is running (via docker-compose or `agentex agents run`) -2. Set the AGENTEX_API_BASE_URL environment variable if not using default -3. Run: pytest test_agent.py -v - -Configuration: -- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) -- AGENT_NAME: Name of the agent to test (default: s050-openai-agents-local-sandbox) -""" - -import os - -import pytest -from test_utils.sync import validate_text_in_string - -from agentex import Agentex -from agentex.types import TextContentParam -from agentex.types.agent_rpc_params import ParamsSendMessageRequest - -AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") -AGENT_NAME = os.environ.get("AGENT_NAME", "s050-openai-agents-local-sandbox") - - -@pytest.fixture -def client(): - """Create an AgentEx client instance for testing.""" - return Agentex(base_url=AGENTEX_API_BASE_URL) - - -@pytest.fixture -def agent_name(): - """Return the agent name for testing.""" - return AGENT_NAME - - -@pytest.fixture -def agent_id(client, agent_name): - """Retrieve the agent ID based on the agent name.""" - agents = client.agents.list() - for agent in agents: - if agent.name == agent_name: - return agent.id - raise ValueError(f"Agent with name {agent_name} not found.") - - -def _response_text(result) -> str: - """Flatten a send_message result into a single string for assertions. - - Result items may be a bare string, a ``TextContent`` (``.content`` is the - string), or a ``TaskMessage`` wrapping a ``TextContent`` (``.content`` is the - ``TextContent``, whose ``.content`` is the string). Dig through ``.content`` - until we reach a string. - """ - - def _text_of(obj, _depth: int = 0) -> str: - if isinstance(obj, str): - return obj - if _depth > 5: - return "" - inner = getattr(obj, "content", None) - if inner is None: - return "" - return _text_of(inner, _depth + 1) - - parts = [t for t in (_text_of(item) for item in result) if t] - return "\n".join(parts) - - -class TestLocalSandboxMessages: - """Test the local-sandbox OpenAI Agents SDK agent.""" - - def test_send_simple_message(self, client: Agentex, agent_name: str): - """Test sending a simple message and receiving a response.""" - response = client.agents.send_message( - agent_name=agent_name, - params=ParamsSendMessageRequest( - content=TextContentParam( - author="user", - content="Hello! What can you help me with?", - type="text", - ) - ), - ) - result = response.result - assert result is not None - assert len(result) >= 1 - - def test_shell_python_version(self, client: Agentex, agent_name: str): - """Test that the agent uses its shell to run a real command. - - We ask it to print the Python version. The agent should run - `python3 --version` in the local sandbox and report the real output, - which always starts with "Python 3". - """ - response = client.agents.send_message( - agent_name=agent_name, - params=ParamsSendMessageRequest( - content=TextContentParam( - author="user", - content=( - "Use your shell to print the Python version on this " - "machine, then tell me what it is." - ), - type="text", - ) - ), - ) - result = response.result - assert result is not None - assert len(result) >= 1 - - text = _response_text(result) - assert text, "Expected a non-empty response from the sandbox agent." - # The sandbox runs on Python 3.12, so the real output contains "Python 3". - validate_text_in_string("Python 3", text) - - def test_shell_compute(self, client: Agentex, agent_name: str): - """Test that the agent uses python3 in the sandbox to compute a value.""" - response = client.agents.send_message( - agent_name=agent_name, - params=ParamsSendMessageRequest( - content=TextContentParam( - author="user", - content=( - "Use python3 in your shell to compute 21 * 2 and tell me " - "the result." - ), - type="text", - ) - ), - ) - result = response.result - assert result is not None - assert len(result) >= 1 - - text = _response_text(result) - assert text, "Expected a non-empty response from the sandbox agent." - validate_text_in_string("42", text) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/.dockerignore b/examples/tutorials/00_sync/060_claude_code/.dockerignore similarity index 100% rename from examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/.dockerignore rename to examples/tutorials/00_sync/060_claude_code/.dockerignore diff --git a/examples/tutorials/00_sync/060_claude_code/Dockerfile b/examples/tutorials/00_sync/060_claude_code/Dockerfile new file mode 100644 index 000000000..ec22d7e0b --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/Dockerfile @@ -0,0 +1,46 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies including Node.js (required by the claude CLI) +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +# Install the claude CLI (requires Node.js) +# NOTE: live runs require ANTHROPIC_API_KEY in the environment. +RUN npm install -g @anthropic-ai/claude-code || true + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 00_sync/060_claude_code/pyproject.toml /app/060_claude_code/pyproject.toml +COPY 00_sync/060_claude_code/README.md /app/060_claude_code/README.md + +WORKDIR /app/060_claude_code + +COPY 00_sync/060_claude_code/project /app/060_claude_code/project +COPY 00_sync/060_claude_code/tests /app/060_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=s060-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/060_claude_code/README.md b/examples/tutorials/00_sync/060_claude_code/README.md new file mode 100644 index 000000000..e9c724732 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 060: Sync Claude Code Agent + +This tutorial demonstrates how to build a **synchronous** agent that spawns the +Claude Code CLI as a local subprocess and streams its output through the Agentex +unified harness surface via ``ClaudeCodeTurn`` and ``UnifiedEmitter``. + +## Key Concepts + +### ClaudeCodeTurn + UnifiedEmitter + +``ClaudeCodeTurn`` wraps ``convert_claude_code_to_agentex_events``, which +parses the newline-delimited JSON envelopes emitted by +``claude -p --output-format stream-json``. It implements the ``HarnessTurn`` +protocol: an ``events`` async iterator of canonical ``StreamTaskMessage*`` +objects and a ``usage()`` method (populated once the stream is exhausted). + +``UnifiedEmitter.yield_turn(turn)`` is the sync delivery path: it forwards +events as HTTP yield chunks while tracing as a side effect. + +### Local subprocess spawn + +The ``_spawn_claude`` function in ``project/acp.py`` uses +``asyncio.create_subprocess_exec`` to run: + +``` +claude -p --output-format stream-json --verbose +``` + +The prompt is written to stdin. Stdout is read line by line and fed into +``ClaudeCodeTurn``. This is purely local -- no Scale sandbox is involved. + +Production isolation (Scale sandbox, secret injection, MCP configuration) +is the golden agent's concern at +``teams/sgp/agents/golden_agent/project/harness/providers/claude.py``. + +### Injectable spawn seam + +``_spawn_claude`` is a top-level async generator in ``project/acp.py``. +Tests monkeypatch it to inject pre-recorded stream-json lines instead of +spawning the real process, so offline unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | ACP server, ``_spawn_claude`` seam, and message handler | +| ``tests/test_agent.py`` | Live integration tests (needs CLI + API key) | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires the ``claude`` CLI installed and ``ANTHROPIC_API_KEY`` set: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. + This tutorial runs the CLI directly to keep the code as simple as possible. +- Multi-turn session resumption (``claude -r ``) is out of scope + for this tutorial. See the golden agent for that pattern. +- The ``--verbose`` flag is included to match the golden agent's invocation; + it causes the CLI to emit ``stream_event`` triples for incremental streaming. diff --git a/examples/tutorials/00_sync/060_claude_code/manifest.yaml b/examples/tutorials/00_sync/060_claude_code/manifest.yaml new file mode 100644 index 000000000..56b9fd9e4 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/manifest.yaml @@ -0,0 +1,55 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/060_claude_code + - test_utils + dockerfile: 00_sync/060_claude_code/Dockerfile + dockerignore: 00_sync/060_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s060-claude-code + description: A sync Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s060-claude-code" + description: "A sync Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/__init__.py b/examples/tutorials/00_sync/060_claude_code/project/__init__.py similarity index 100% rename from examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/__init__.py rename to examples/tutorials/00_sync/060_claude_code/project/__init__.py diff --git a/examples/tutorials/00_sync/060_claude_code/project/acp.py b/examples/tutorials/00_sync/060_claude_code/project/acp.py new file mode 100644 index 000000000..aad53801a --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/project/acp.py @@ -0,0 +1,137 @@ +"""ACP handler for the sync Claude Code tutorial. + +Spawns ``claude -p --output-format stream-json --verbose`` as a LOCAL +asyncio subprocess (no Scale sandbox -- that is the golden agent's +production concern). Stdout lines are fed into ``ClaudeCodeTurn``, which +wraps ``convert_claude_code_to_agentex_events``. Events are delivered via +``UnifiedEmitter.yield_turn``, the sync HTTP yield path. + +Live runs require the ``claude`` CLI to be installed and an +ANTHROPIC_API_KEY (or equivalent credential) to be in the environment. +For offline testing, see ``tests/test_agent_offline.py``, which injects a +fake subprocess. +""" + +from __future__ import annotations + +import os +import asyncio +from typing import AsyncIterator, AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + + +async def _spawn_claude(prompt: str) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + This is a seam: tests replace it with a fake async iterator of + pre-recorded lines so no real CLI invocation is needed offline. + """ + proc = await asyncio.create_subprocess_exec( + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. + stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle an incoming message: run Claude Code locally and stream events.""" + task_id = params.task.id + prompt = params.content.content + logger.info("Processing message for task %s", task_id) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": prompt}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + async for event in emitter.yield_turn(turn): + yield event diff --git a/examples/tutorials/00_sync/060_claude_code/pyproject.toml b/examples/tutorials/00_sync/060_claude_code/pyproject.toml new file mode 100644 index 000000000..e5c1c4ea6 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s060-claude-code" +version = "0.1.0" +description = "A sync Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "python-dotenv>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py b/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py new file mode 100644 index 000000000..954a520f3 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py @@ -0,0 +1,162 @@ +"""Tests for the sync Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require the ``claude`` CLI on PATH and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline yields events, + populates usage, and satisfies the ``HarnessTurn`` protocol. + - Always run -- no CLI or API key needed. +""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-offline-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.0001, + "duration_ms": 250, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI or network.""" + + @pytest.mark.asyncio + async def test_yields_stream_events(self): + """ClaudeCodeTurn drives UnifiedEmitter and yields StreamTaskMessage* events.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageStart + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0, "No events yielded" + assert any(isinstance(e, StreamTaskMessageStart) for e in events) + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear after stream exhaustion.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """ClaudeCodeTurn.usage() returns correct tokens after stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + _ = [e async for e in turn.events] + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_protocol_compliance(self): + """ClaudeCodeTurn satisfies the HarnessTurn protocol.""" + from agentex.lib.adk import ClaudeCodeTurn + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + assert hasattr(turn, "events"), "ClaudeCodeTurn missing .events" + assert hasattr(turn, "usage"), "ClaudeCodeTurn missing .usage()" + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s060-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live streaming tests -- needs the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + def test_stream_simple_message(self, client, agent_name: str): + """Stream a simple prompt through the local Claude Code subprocess.""" + from test_utils.sync import collect_streaming_response + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendMessageRequest + + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ) + ), + ) + aggregated_content, chunks = collect_streaming_response(stream) + assert aggregated_content is not None + assert len(chunks) >= 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py b/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..23ac52a57 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py @@ -0,0 +1,210 @@ +"""Offline unit tests for the sync Claude Code tutorial agent. + +These tests do NOT require the ``claude`` CLI or an ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in +place of the real subprocess spawn, and a fake streaming backend in place +of the real Redis/AGP layer, then assert that the handler correctly drives +the unified surface (``UnifiedEmitter.yield_turn``). + +The injection seam is the ``_spawn_claude`` function in ``project/acp.py``. +Tests monkeypatch it with a coroutine that returns a pre-recorded async +iterator, so the handler code runs in full without any subprocess. +""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageStart, +) + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.0001, + "duration_ms": 250, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-2"}), + json.dumps( + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "tool_abc", + "name": "Bash", + "input": {"command": "echo hello"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_abc", + "content": "hello\n", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Done."}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 20, "output_tokens": 8}, + "cost_usd": 0.0002, + "duration_ms": 400, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _collect_yield_turn(lines: list[str]) -> list: + """Run a ClaudeCodeTurn through UnifiedEmitter.yield_turn and collect events.""" + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter(task_id="t1", trace_id=None, parent_span_id=None) + return [e async for e in emitter.yield_turn(turn)] + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_text_only_produces_start_and_done(): + events = await _collect_yield_turn(_TEXT_ONLY_LINES) + types = [type(e).__name__ for e in events] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDone" in types + + +@pytest.mark.asyncio +async def test_text_only_content(): + events = await _collect_yield_turn(_TEXT_ONLY_LINES) + starts = [e for e in events if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 1 + assert starts[0].content.type == "text" + + +@pytest.mark.asyncio +async def test_usage_is_populated_after_stream(): + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + _ = [e async for e in turn.events] + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.cost_usd == pytest.approx(0.0001, rel=1e-4) + assert usage.num_llm_calls == 1 + + +@pytest.mark.asyncio +async def test_tool_call_produces_tool_request_and_response(): + events = await _collect_yield_turn(_TOOL_CALL_LINES) + content_types = { + getattr(e, "content", None) and getattr(e.content, "type", None) for e in events if hasattr(e, "content") + } + assert "tool_request" in content_types + assert "tool_response" in content_types + + +@pytest.mark.asyncio +async def test_tool_call_has_one_text_block(): + """The tool_use block is not text; only 'Done.' is the text block.""" + events = await _collect_yield_turn(_TOOL_CALL_LINES) + text_starts = [ + e for e in events if isinstance(e, StreamTaskMessageStart) and getattr(e.content, "type", None) == "text" + ] + assert len(text_starts) == 1 + + +@pytest.mark.asyncio +async def test_empty_lines_are_skipped(): + """Inserting blank lines in the stream must not crash the parser.""" + lines_with_blanks = ["", " "] + _TEXT_ONLY_LINES + [""] + events = await _collect_yield_turn(lines_with_blanks) + assert any(isinstance(e, StreamTaskMessageStart) for e in events) + + +@pytest.mark.asyncio +async def test_spawn_seam_concept(): + """Demonstrate the injectable spawn seam pattern used in project/acp.py. + + The ``_spawn_claude`` function in ``project/acp.py`` is a top-level async + generator. Production code calls it like:: + + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + + In tests, a replacement function is injected (e.g. via monkeypatch) to + return pre-recorded lines. This test proves the pattern works end-to-end + without importing the full ACP module (which has module-level env-var + checks that only pass in a running agent environment). + """ + recorded_lines = _TEXT_ONLY_LINES + + async def _fake_spawn(prompt: str) -> AsyncIterator[str]: # noqa: ARG001 + """Drop-in replacement for _spawn_claude.""" + for line in recorded_lines: + yield line + + called_with: list[str] = [] + + async def _wrapped_spawn(prompt: str) -> AsyncIterator[str]: + called_with.append(prompt) + async for line in _fake_spawn(prompt): + yield line + + turn = ClaudeCodeTurn(_wrapped_spawn("test prompt")) + emitter = UnifiedEmitter(task_id="t2", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + + assert called_with == ["test prompt"] + assert any(isinstance(e, StreamTaskMessageStart) for e in events) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/.dockerignore b/examples/tutorials/00_sync/070_codex/.dockerignore similarity index 100% rename from examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/.dockerignore rename to examples/tutorials/00_sync/070_codex/.dockerignore diff --git a/examples/tutorials/00_sync/070_codex/Dockerfile b/examples/tutorials/00_sync/070_codex/Dockerfile new file mode 100644 index 000000000..fb500b221 --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/Dockerfile @@ -0,0 +1,56 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install the codex CLI: the agent spawns `codex exec --json`, so the binary +# must be present on PATH in the image. +RUN npm install -g @openai/codex + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/070_codex/pyproject.toml /app/070_codex/pyproject.toml +COPY 00_sync/070_codex/README.md /app/070_codex/README.md + +WORKDIR /app/070_codex + +# Copy the project code +COPY 00_sync/070_codex/project /app/070_codex/project + +# Copy the test files +COPY 00_sync/070_codex/tests /app/070_codex/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s070-codex + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/070_codex/README.md b/examples/tutorials/00_sync/070_codex/README.md new file mode 100644 index 000000000..3abb2766f --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/README.md @@ -0,0 +1,40 @@ +# 070_codex (sync) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, +`CodexTurn`, and `UnifiedEmitter` for a **sync** (HTTP-yield) ACP agent. + +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox). +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to the HTTP caller via + `UnifiedEmitter.yield_turn` (tracing as a side-effect). + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. + +## Running offline unit tests + +The offline tests inject a fake subprocess and never invoke the real CLI: + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/00_sync/070_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +# Start the agent server first, then: +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/070_codex/conftest.py b/examples/tutorials/00_sync/070_codex/conftest.py new file mode 100644 index 000000000..bdd78994b --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/conftest.py @@ -0,0 +1,12 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so the FastACP and tracing modules +can be imported without a running agent server. +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +os.environ.setdefault("ACP_URL", "http://localhost:8000") diff --git a/examples/tutorials/00_sync/070_codex/manifest.yaml b/examples/tutorials/00_sync/070_codex/manifest.yaml new file mode 100644 index 000000000..87dad2847 --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/070_codex + - test_utils + dockerfile: 00_sync/070_codex/Dockerfile + dockerignore: 00_sync/070_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s070-codex + description: Sync tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s070-codex" + description: "Sync tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/__init__.py b/examples/tutorials/00_sync/070_codex/project/__init__.py similarity index 100% rename from examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/__init__.py rename to examples/tutorials/00_sync/070_codex/project/__init__.py diff --git a/examples/tutorials/00_sync/070_codex/project/acp.py b/examples/tutorials/00_sync/070_codex/project/acp.py new file mode 100644 index 000000000..bcb5e10df --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/project/acp.py @@ -0,0 +1,175 @@ +"""Sync ACP handler for the Codex CLI harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for a sync (HTTP-yield) ACP agent. + +The handler: +1. Spawns ``codex exec --json`` as a LOCAL asyncio subprocess (no sandbox). + This is correct for tutorials and local development; production isolation + is handled by the golden agent's Scale sandbox at + ``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. +2. Wraps the stdout line stream in a ``CodexTurn``. +3. Delivers every canonical ``StreamTaskMessage*`` event via + ``UnifiedEmitter.yield_turn``, which traces + yields each event back to + the HTTP caller in one pass. + +Live runs require: +- ``codex`` CLI on PATH (``npm install -g @openai/codex``) +- ``OPENAI_API_KEY`` set in the environment +""" + +from __future__ import annotations + +import os +import time +import codecs +import asyncio +from typing import AsyncGenerator +from collections.abc import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import CodexTurn +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +async def _spawn_codex(model: str) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + The flags mirror the golden agent (codex.py in the golden agent repo): + --json machine-readable newline-delimited events + --skip-git-repo-check safe to run outside a git repo + --dangerously-bypass-approvals-and-sandbox + skip interactive approval prompts in a + non-interactive (server) context + --model which OpenAI model to use + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. + """ + cmd = [ + "codex", + "exec", + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + "-", # read prompt from stdin + ] + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. + """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle each message by running ``codex exec`` locally and streaming events.""" + task_id = params.task.id + user_message = params.content.content + logger.info("Processing message for task %s", task_id) + + start_ms = int(time.monotonic() * 1000) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + process = await _spawn_codex(MODEL) + + # Write prompt to stdin then close it so codex knows input is done. + assert process.stdin is not None + process.stdin.write(user_message.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn( + events=_process_stdout(process), + model=MODEL, + ) + + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + async for event in emitter.yield_turn(turn): + yield event + + await process.wait() + + # Record the real wall-clock duration AFTER streaming completes; setting + # it before the stream ran would capture only subprocess spawn overhead. + turn.duration_ms = int(time.monotonic() * 1000) - start_ms + + if turn_span: + usage = turn.usage() + turn_span.output = { + "model": usage.model, + "input_tokens": usage.input_tokens, + "output_tokens": usage.output_tokens, + } diff --git a/examples/tutorials/00_sync/070_codex/pyproject.toml b/examples/tutorials/00_sync/070_codex/pyproject.toml new file mode 100644 index 000000000..88bbb9cca --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s070-codex" +version = "0.1.0" +description = "Sync tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/00_sync/070_codex/tests/test_agent.py b/examples/tutorials/00_sync/070_codex/tests/test_agent.py new file mode 100644 index 000000000..94aa2aaf2 --- /dev/null +++ b/examples/tutorials/00_sync/070_codex/tests/test_agent.py @@ -0,0 +1,176 @@ +"""Tests for the sync Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH and ``OPENAI_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexHandler``): + - Inject a fake async iterator of pre-recorded codex event lines. + - Assert the ``CodexTurn`` + ``UnifiedEmitter`` pipeline yields events, + populates usage, and satisfies the ``HarnessTurn`` protocol. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-abc"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hello, world!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class TestOfflineCodexHandler: + """Unit tests that run without a real codex CLI or network.""" + + @pytest.mark.asyncio + async def test_codex_turn_yields_stream_events(self): + """CodexTurn drives the unified surface and yields StreamTaskMessage* events.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0, "No events yielded" + + types_seen = {type(e).__name__ for e in events} + known_types = { + "StreamTaskMessageStart", + "StreamTaskMessageDelta", + "StreamTaskMessageFull", + "StreamTaskMessageDone", + } + assert bool(types_seen & known_types), f"Unexpected event types: {types_seen}" + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """CodexTurn.usage() returns correct tokens after stream is exhausted.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + collected = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_codex_turn_protocol_compliance(self): + """CodexTurn satisfies the HarnessTurn protocol.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness.types import HarnessTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + assert isinstance(turn, HarnessTurn), "CodexTurn does not satisfy HarnessTurn protocol" + + @pytest.mark.asyncio + async def test_unified_emitter_yield_passes_through_events(self): + """UnifiedEmitter.yield_turn passes events through unchanged in sync mode.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0 + + @pytest.mark.asyncio + async def test_convert_codex_to_agentex_events_direct(self): + """convert_codex_to_agentex_events tap produces text start/done events.""" + from agentex.lib.adk import convert_codex_to_agentex_events + from agentex.types.task_message_update import StreamTaskMessageDone + + events = [e async for e in convert_codex_to_agentex_events(_fake_event_stream())] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_on_result_callback_receives_session_id(self): + """on_result callback receives the session_id from thread.started.""" + from agentex.lib.adk import convert_codex_to_agentex_events + + captured: list[dict] = [] + + events = [ + e + async for e in convert_codex_to_agentex_events( + _fake_event_stream(), + on_result=captured.append, + ) + ] + + assert len(captured) == 1 + assert captured[0]["session_id"] == "thread-abc" + assert captured[0]["tool_call_count"] == 0 + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CODEX_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s070-codex") + + +@pytest.mark.skipif(not LIVE, reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY are available") +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI and a running Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + def test_send_simple_message(self, client): + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendMessageRequest + + response = client.agents.send_message( + agent_name=AGENT_NAME, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What is 2+2? Reply with just the number.", + type="text", + ) + ), + ) + assert response.result is not None + assert len(response.result) >= 1 diff --git a/examples/tutorials/10_async/00_base/100_langgraph/README.md b/examples/tutorials/10_async/00_base/100_langgraph/README.md index 6f6c6a36b..cd2fa6dd6 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/README.md +++ b/examples/tutorials/10_async/00_base/100_langgraph/README.md @@ -1,46 +1,52 @@ -# Tutorial 100: Async LangGraph Agent +# Tutorial: Async LangGraph Agent -This tutorial demonstrates how to build an **asynchronous** LangGraph agent on AgentEx with: -- Task-based event handling via Redis -- Tool calling (ReAct pattern) -- Multi-turn conversation memory via AgentEx checkpointer -- Tracing integration +This tutorial demonstrates how to build an **async** LangGraph agent on AgentEx +using the **unified harness surface**: -## Graph Structure +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +result = await emitter.auto_send_turn(turn) +``` + +The `LangGraphTurn` + `UnifiedEmitter.auto_send_turn` path replaces calling the +lower-level ``stream_langgraph_events`` helper directly. + +## Key Concepts + +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). -![Graph](graph.png) +`UnifiedEmitter.auto_send_turn(turn)` pushes each event to Redis via +`streaming_task_message_context`, accumulates the final text, and returns a +`TurnResult(final_text=..., usage=...)`. -## Sync vs Async: Key Differences +The same `LangGraphTurn` object can also be passed to +`UnifiedEmitter.yield_turn` in the sync channel. -| Aspect | Sync (Tutorial 030) | Async (This Tutorial) | -|--------|--------------------|-----------------------| -| **ACP Type** | `sync` | `async` | -| **Handler** | `@acp.on_message_send` | `@acp.on_task_event_send` | -| **Response** | HTTP streaming (yields) | Redis streaming | -| **Message Echo** | Implicit | Explicit (`adk.messages.create`) | -| **Streaming Helper** | `convert_langgraph_to_agentex_events()` | `stream_langgraph_events()` | -| **Extra Handlers** | None | `on_task_create`, `on_task_cancel` | +### AGX1-377 Note -### When to use Async? -- Long-running tasks that may exceed HTTP timeout -- Agents that need to push updates asynchronously -- Multi-step workflows where the client polls for results -- Production agents that need reliable message delivery via Redis +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. ## Files | File | Description | |------|-------------| -| `project/acp.py` | ACP server with async event handlers | -| `project/graph.py` | LangGraph state graph definition | +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + auto_send_turn) | +| `project/graph.py` | LangGraph state graph (weather example) | | `project/tools.py` | Tool definitions (weather example) | | `tests/test_agent.py` | Integration tests | -| `manifest.yaml` | Agent configuration | +| `manifest.yaml` | Agent configuration (name: ab100-langgraph) | ## Running Locally ```bash -# From this directory agentex agents run ``` diff --git a/examples/tutorials/10_async/00_base/100_langgraph/graph.png b/examples/tutorials/10_async/00_base/100_langgraph/graph.png deleted file mode 100644 index 16d22a1e7..000000000 Binary files a/examples/tutorials/10_async/00_base/100_langgraph/graph.png and /dev/null differ diff --git a/examples/tutorials/10_async/00_base/100_langgraph/manifest.yaml b/examples/tutorials/10_async/00_base/100_langgraph/manifest.yaml index 1b0b5d490..13d64f524 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/manifest.yaml +++ b/examples/tutorials/10_async/00_base/100_langgraph/manifest.yaml @@ -17,7 +17,7 @@ local_development: agent: acp_type: async name: ab100-langgraph - description: An async LangGraph agent with tool calling and Redis streaming + description: An async LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) temporal: enabled: false @@ -47,7 +47,7 @@ deployment: global: agent: name: "ab100-langgraph" - description: "An async LangGraph agent with tool calling and Redis streaming" + description: "An async LangGraph agent using the unified harness surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/10_async/00_base/100_langgraph/project/acp.py b/examples/tutorials/10_async/00_base/100_langgraph/project/acp.py index 2585fefd6..198446607 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/project/acp.py +++ b/examples/tutorials/10_async/00_base/100_langgraph/project/acp.py @@ -1,7 +1,21 @@ -""" -ACP handler for async LangGraph agent. - -Uses the async ACP model with Redis streaming instead of HTTP yields. +"""ACP handler for the async LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.auto_send_turn`` streams events +to Redis and returns a ``TurnResult`` with the accumulated final text. + +Properties of the unified surface: +- Tracing is wired through the tracing manager (no bespoke handler boilerplate). +- A single ``UnifiedEmitter.auto_send_turn(LangGraphTurn(stream))`` call + replaces bespoke event-streaming helpers. +- Tool calls/responses go through ``streaming_task_message_context`` + (same code path as text deltas), making the event stream channel-agnostic. +- Usage data (token counts) is captured on ``LangGraphTurn.usage()`` after + ``auto_send_turn`` returns. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. """ from __future__ import annotations @@ -14,12 +28,13 @@ import agentex.lib.adk as adk from project.graph import create_graph -from agentex.lib.adk import stream_langgraph_events, create_langgraph_tracing_handler from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams from agentex.lib.types.fastacp import AsyncACPConfig from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) @@ -29,7 +44,8 @@ sgp_api_key=os.environ.get("SGP_API_KEY", ""), sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), - )) + ) +) acp = FastACP.create( acp_type="async", @@ -48,40 +64,39 @@ async def get_graph(): @acp.on_task_event_send async def handle_task_event_send(params: SendEventParams): - """Handle incoming events, streaming tokens and tool calls via Redis.""" + """Handle incoming events, streaming tokens and tool calls via unified harness.""" graph = await get_graph() task_id = params.task.id user_message = params.event.content.content logger.info(f"Processing message for thread {task_id}") - # Echo the user's message await adk.messages.create(task_id=task_id, content=params.event.content) async with adk.tracing.span( trace_id=task_id, + task_id=task_id, name="message", input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - callback = create_langgraph_tracing_handler( - trace_id=task_id, - parent_span_id=turn_span.id if turn_span else None, - ) - stream = graph.astream( {"messages": [{"role": "user", "content": user_message}]}, - config={ - "configurable": {"thread_id": task_id}, - "callbacks": [callback], - }, + config={"configurable": {"thread_id": task_id}}, stream_mode=["messages", "updates"], ) - final_output = await stream_langgraph_events(stream, task_id) + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + result = await emitter.auto_send_turn(turn) if turn_span: - turn_span.output = {"final_output": final_output} + turn_span.output = {"final_output": result.final_text} @acp.on_task_create diff --git a/examples/tutorials/10_async/00_base/100_langgraph/project/graph.py b/examples/tutorials/10_async/00_base/100_langgraph/project/graph.py index af6e31313..d63f28390 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/project/graph.py +++ b/examples/tutorials/10_async/00_base/100_langgraph/project/graph.py @@ -1,7 +1,7 @@ -""" -LangGraph graph definition. +"""LangGraph graph definition for the 100_langgraph async agent. -Defines the state, nodes, edges, and compiles the graph. +Identical to ``100_langgraph/project/graph.py`` — the graph definition is not +affected by the harness migration. Only ``acp.py`` changes. """ from __future__ import annotations @@ -34,6 +34,7 @@ class AgentState(TypedDict): """State schema for the agent graph.""" + messages: Annotated[list[Any], add_messages] @@ -51,9 +52,7 @@ def agent_node(state: AgentState) -> dict[str, Any]: """Process the current state and generate a response.""" messages = state["messages"] if not messages or not isinstance(messages[0], SystemMessage): - system_content = SYSTEM_PROMPT.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ) + system_content = SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) messages = [SystemMessage(content=system_content)] + messages response = llm_with_tools.invoke(messages) return {"messages": [response]} diff --git a/examples/tutorials/10_async/00_base/100_langgraph/project/tools.py b/examples/tutorials/10_async/00_base/100_langgraph/project/tools.py index 1b402a906..e421528fc 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/project/tools.py +++ b/examples/tutorials/10_async/00_base/100_langgraph/project/tools.py @@ -1,9 +1,4 @@ -""" -Tool definitions for the LangGraph agent. - -Add your custom tools here. Each tool should be a function decorated with @tool -or created using the Tool class. -""" +"""Tool definitions for the 100_langgraph async agent.""" from langchain_core.tools import Tool @@ -17,16 +12,13 @@ def get_weather(city: str) -> str: Returns: A string describing the weather conditions. """ - # TODO: Replace with actual weather API call return f"The weather in {city} is sunny and 72°F" -# Define tools weather_tool = Tool( name="get_weather", func=get_weather, description="Get the current weather for a city. Input should be a city name.", ) -# Export all tools as a list TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/00_base/100_langgraph/pyproject.toml b/examples/tutorials/10_async/00_base/100_langgraph/pyproject.toml index fecbc6149..715477bac 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/pyproject.toml +++ b/examples/tutorials/10_async/00_base/100_langgraph/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "ab100-langgraph" version = "0.1.0" -description = "An async LangGraph agent with tool calling and Redis streaming" +description = "An async LangGraph agent using the unified harness surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ diff --git a/examples/tutorials/10_async/00_base/100_langgraph/tests/test_agent.py b/examples/tutorials/10_async/00_base/100_langgraph/tests/test_agent.py index 948db1558..b80d7a8f9 100644 --- a/examples/tutorials/10_async/00_base/100_langgraph/tests/test_agent.py +++ b/examples/tutorials/10_async/00_base/100_langgraph/tests/test_agent.py @@ -1,14 +1,8 @@ """ -Tests for the async LangGraph agent. +Tests for the async harness LangGraph agent. -This test suite validates: -- Non-streaming event sending and polling -- Streaming event sending - -To run these tests: -1. Make sure the agent is running (via docker-compose or `agentex agents run`) -2. Set the AGENTEX_API_BASE_URL environment variable if not using default -3. Run: pytest test_agent.py -v +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) +end-to-end against a live AgentEx server. Configuration: - AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) @@ -25,14 +19,12 @@ from agentex.types.agent_rpc_params import ParamsCreateTaskRequest from agentex.lib.sdk.fastacp.base.base_acp_server import uuid -# Configuration from environment variables AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") AGENT_NAME = os.environ.get("AGENT_NAME", "ab100-langgraph") @pytest_asyncio.fixture async def client(): - """Create an AsyncAgentex client instance for testing.""" client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) yield client await client.close() @@ -40,13 +32,11 @@ async def client(): @pytest.fixture def agent_name(): - """Return the agent name for testing.""" return AGENT_NAME @pytest_asyncio.fixture async def agent_id(client, agent_name): - """Retrieve the agent ID based on the agent name.""" agents = await client.agents.list() for agent in agents: if agent.name == agent_name: @@ -55,14 +45,9 @@ async def agent_id(client, agent_name): class TestNonStreamingEvents: - """Test non-streaming event sending and polling.""" - @pytest.mark.asyncio async def test_send_event(self, client: AsyncAgentex, agent_id: str): - """Test sending an event to the async LangGraph agent.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None @@ -78,10 +63,7 @@ async def test_send_event(self, client: AsyncAgentex, agent_id: str): @pytest.mark.asyncio async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): - """Test that the agent can use tools (e.g., weather tool).""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None @@ -97,14 +79,9 @@ async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): class TestStreamingEvents: - """Test streaming event sending.""" - @pytest.mark.asyncio async def test_send_event_and_stream(self, client: AsyncAgentex, agent_id: str): - """Test sending an event and streaming the response.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/README.md b/examples/tutorials/10_async/00_base/110_pydantic_ai/README.md index 6046b579a..db56979cc 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/README.md +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/README.md @@ -1,63 +1,52 @@ -# Tutorial 110 (async/base): Pydantic AI Agent +# Async Pydantic AI Agent -This tutorial demonstrates how to build an **async** Pydantic AI agent on AgentEx with: -- Tool calling (Pydantic AI handles the tool loop internally) -- Streaming token output via Redis (text + reasoning tokens stream as deltas) -- Task lifecycle hooks (create / event-send / cancel) +A minimal **async** (Redis-streaming) Pydantic AI agent that drives the +**unified harness surface** (`UnifiedEmitter.auto_send_turn` + `PydanticAITurn`) +directly. -This is the async counterpart to the sync tutorial at [`00_sync/040_pydantic_ai`](../../../00_sync/040_pydantic_ai/). +## Why this agent exists -## Key Concepts +This agent calls `emitter.auto_send_turn(...)` **explicitly** at the +agent-author level, making the unified-surface wiring visible and giving the +async channel direct coverage. -### Async ACP -Unlike sync ACP (HTTP request/response with chunked streaming back), async ACP uses **Redis** for streaming. The HTTP call returns immediately when an event is acknowledged; the agent then pushes updates to Redis on its own schedule. The UI subscribes to Redis to receive deltas. +## How it wires the unified surface -### Pydantic AI Integration -- **Agent**: A single `pydantic_ai.Agent` that owns the model and tools. No graph required. -- **`@agent.tool_plain`**: Registers a Python function as a tool. Pydantic AI infers the schema from type hints and docstring. -- **`agent.run_stream_events(...)`**: Yields `AgentStreamEvent`s (`PartStartEvent` / `PartDeltaEvent` / `PartEndEvent` / `FunctionToolResultEvent`) as the model produces them. +In `project/acp.py`: -### Streaming -The helper `stream_pydantic_ai_events(stream, task_id)` consumes the Pydantic AI event stream and writes Agentex updates to Redis via `adk.streaming.streaming_task_message_context(...)`: -- **Text and thinking tokens** stream as Redis deltas inside coalesced contexts. -- **Tool requests and tool responses** are emitted as **discrete full messages** (no token-level arg streaming). To stream tool-call argument tokens, use the sync converter — see [`00_sync/040_pydantic_ai`](../../../00_sync/040_pydantic_ai/). - -## Files - -| File | Description | -|------|-------------| -| `project/acp.py` | Async ACP server with task lifecycle handlers | -| `project/agent.py` | Pydantic AI agent + tool registration | -| `project/tools.py` | Tool definitions (weather example) | -| `tests/test_agent.py` | Integration tests | -| `manifest.yaml` | Agent configuration | - -## Running Locally - -```bash -# From this directory -agentex agents run +```python +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, +) +async with agent.run_stream_events(user_message, message_history=previous_messages) as stream: + turn = PydanticAITurn(tee_messages(stream), model=MODEL_NAME, coalesce_tool_requests=True) + result = await emitter.auto_send_turn(turn) ``` -## Running Tests +- `coalesce_tool_requests=True` is required on the async/auto_send path until + AGX1-377 lands: tool requests are delivered as a single `Full(tool_request)` + rather than streamed `Start + Delta + Done`. +- The `UnifiedEmitter` is constructed from the ACP context (`task_id` + + `trace_id` + `parent_span_id`) so messages auto-send to the task stream + (Redis) and tracing is automatic. +- Multi-turn memory is persisted via `adk.state` (pydantic-ai message history + round-tripped through `ModelMessagesTypeAdapter`). -```bash -pytest tests/test_agent.py -v -``` +## Files -## Sync vs Async — How the Code Differs +- `project/acp.py` — async ACP handler using `emitter.auto_send_turn(...)`. +- `project/agent.py` — builds the `pydantic_ai.Agent` with one tool. +- `project/tools.py` — `get_weather(city)` returning a constant. +- `tests/test_agent.py` — live integration test (requires a running agent). -This tutorial uses the same `project/agent.py` and `project/tools.py` as the sync version. The only meaningful differences live in `project/acp.py`: +## Tools -| Concern | Sync (`s040-pydantic-ai`) | Async (`ab110-pydantic-ai`) | -|---|---|---| -| ACP type | `FastACP.create(acp_type="sync")` | `FastACP.create(acp_type="async", config=AsyncACPConfig(type="base"))` | -| Handler hook | `@acp.on_message_send` returns/yields events | `@acp.on_task_event_send` returns nothing | -| Stream output | `yield event` (chunked HTTP) | `await context.stream_update(...)` (Redis) | -| Tool calls | Args stream as `ToolRequestDelta` tokens | Args arrive in one full message | -| Lifecycle | Ephemeral (no task hooks) | `on_task_create` + `on_task_cancel` form a durable task contract | +- `get_weather(city: str) -> str`: returns a fixed "sunny and 72°F" string. -## Notes +## Offline coverage -- Multi-turn conversation memory is not wired here. Pydantic AI does not ship a checkpointer; to add memory, load prior messages via `adk.messages.list(task_id=...)` and pass them to `agent.run_stream_events(..., message_history=...)`. -- Reasoning/thinking tokens are not exercised by `gpt-4o-mini`. Swap to a reasoning-capable model if you want to test that branch end-to-end. +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no network) live in the SDK repo under +`tests/lib/core/harness/` (the pydantic-ai async suite). diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/manifest.yaml b/examples/tutorials/10_async/00_base/110_pydantic_ai/manifest.yaml index 583b07251..4aca13d44 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/manifest.yaml +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/manifest.yaml @@ -17,7 +17,7 @@ local_development: agent: acp_type: async name: ab110-pydantic-ai - description: An async Pydantic AI agent with tool calling and Redis streaming + description: An async Pydantic AI harness test agent using the unified emitter surface temporal: enabled: false @@ -38,7 +38,7 @@ agent: - env_var_name: SGP_CLIENT_BASE_URL secret_name: sgp-client-base-url secret_key: url - + deployment: image: repository: "" @@ -47,7 +47,7 @@ deployment: global: agent: name: "ab110-pydantic-ai" - description: "An async Pydantic AI agent with tool calling and Redis streaming" + description: "An async Pydantic AI harness test agent using the unified emitter surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/acp.py b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/acp.py index dc8a2de21..95b638f8b 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/acp.py +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/acp.py @@ -1,13 +1,14 @@ -"""ACP handler for async Pydantic AI agent. +"""ACP handler for the async harness Pydantic AI test agent. -Uses the async ACP model with Redis streaming instead of HTTP yields. -Text and reasoning tokens stream as Redis deltas; tool requests and -responses are persisted as discrete full messages. +This agent exercises the UNIFIED HARNESS SURFACE on the async (Redis-streaming) +channel — ``UnifiedEmitter.auto_send_turn(PydanticAITurn(...))`` +— calling it directly rather than via the ``stream_pydantic_ai_events`` helper +(which the ``110_pydantic_ai`` tutorial uses). This makes the unified-surface +wiring explicit at the agent-author level. Multi-turn memory is persisted via ``adk.state``: on each turn we load the previous pydantic-ai ``message_history`` from state, run the agent with it, -then save the updated history back. Without this, every turn would be a -fresh stateless run and the agent would forget the prior conversation. +then save the updated history back. """ from __future__ import annotations @@ -23,17 +24,15 @@ from pydantic_ai.messages import ModelMessagesTypeAdapter import agentex.lib.adk as adk -from project.agent import create_agent -from agentex.lib.adk import ( - stream_pydantic_ai_events, - create_pydantic_ai_tracing_handler, -) +from project.agent import MODEL_NAME, create_agent from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.types.fastacp import AsyncACPConfig from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.utils.model_utils import BaseModel from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) @@ -66,9 +65,7 @@ class ConversationState(BaseModel): ``history_json`` holds the pydantic-ai message history serialized by ``ModelMessagesTypeAdapter`` — pydantic-ai's official way to round-trip - ``ModelMessage`` objects through JSON. We can't use a plain - ``list[ModelMessage]`` field because ``ModelMessage`` is a discriminated - union of runtime types, not a stable Pydantic schema. + ``ModelMessage`` objects through JSON. """ history_json: str = "[]" @@ -77,11 +74,7 @@ class ConversationState(BaseModel): @acp.on_task_create async def handle_task_create(params: CreateTaskParams): - """Initialize per-task state on task creation. - - A fresh task starts with no message history; the conversation is built - up by ``handle_task_event_send`` on each subsequent user message. - """ + """Initialize per-task state on task creation.""" logger.info(f"Task created: {params.task.id}") await adk.state.create( task_id=params.task.id, @@ -92,7 +85,7 @@ async def handle_task_create(params: CreateTaskParams): @acp.on_task_event_send async def handle_task_event_send(params: SendEventParams): - """Handle each user message: load prior history, run the agent, save updated history.""" + """Handle each user message through the unified auto_send_turn path.""" agent = get_agent() task_id = params.task.id agent_id = params.agent.id @@ -103,9 +96,7 @@ async def handle_task_event_send(params: SendEventParams): # Echo the user's message into the task history. await adk.messages.create(task_id=task_id, content=params.event.content) - # Load the previous conversation history from state. If state is missing - # (e.g. task wasn't initialised via on_task_create), fall back to a fresh - # one so the agent still responds — just without memory of prior turns. + # Load the previous conversation history from state (fall back to fresh). task_state = await adk.state.get_by_task_and_agent(task_id=task_id, agent_id=agent_id) if task_state is None: state = ConversationState() @@ -123,15 +114,15 @@ async def handle_task_event_send(params: SendEventParams): input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - tracing_handler = create_pydantic_ai_tracing_handler( + # Construct the UnifiedEmitter from the ACP context so tracing is + # automatic and messages are auto-sent to the task stream (Redis). + emitter = UnifiedEmitter( + task_id=task_id, trace_id=task_id, parent_span_id=turn_span.id if turn_span else None, - task_id=task_id, ) - # Wrap the pydantic-ai event stream so we can capture the final - # AgentRunResultEvent (which carries the full message list for the - # next turn) without changing the streaming-helper's signature. + # Capture the terminal AgentRunResultEvent to persist message history. captured_messages: list[Any] = [] async def tee_messages(upstream) -> AsyncIterator[Any]: @@ -141,9 +132,13 @@ async def tee_messages(upstream) -> AsyncIterator[Any]: yield event async with agent.run_stream_events(user_message, message_history=previous_messages) as stream: - final_output = await stream_pydantic_ai_events( - tee_messages(stream), task_id, tracing_handler=tracing_handler + # The unified auto_send path delivers streamed tool requests natively + # (Start+Delta+Done), so no coalescing workaround is needed. + turn = PydanticAITurn( + tee_messages(stream), + model=MODEL_NAME, ) + result = await emitter.auto_send_turn(turn) # Save the updated message history so the next turn picks up here. if captured_messages: @@ -156,7 +151,7 @@ async def tee_messages(upstream) -> AsyncIterator[Any]: ) if turn_span: - turn_span.output = {"final_output": final_output} + turn_span.output = {"final_output": result.final_text} @acp.on_task_cancel diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/agent.py b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/agent.py index 2c0f6f10c..e7b764d82 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/agent.py +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/agent.py @@ -1,4 +1,4 @@ -"""Pydantic AI agent definition. +"""Pydantic AI agent definition for the async harness test agent. The Agent is the boundary between this module and the API layer (acp.py). Pydantic AI handles its own tool-call loop internally — no graph required. @@ -12,6 +12,8 @@ from project.tools import get_weather +__all__ = ["create_agent", "MODEL_NAME"] + MODEL_NAME = "openai:gpt-4o-mini" SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. @@ -29,9 +31,7 @@ def create_agent() -> Agent: """Build and return the Pydantic AI agent with tools registered.""" agent = Agent( MODEL_NAME, - system_prompt=SYSTEM_PROMPT.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ), + system_prompt=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), ) agent.tool_plain(get_weather) diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/tools.py b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/tools.py index 98f65d509..0f16a7cb0 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/project/tools.py +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/project/tools.py @@ -1,8 +1,8 @@ -"""Tool definitions for the async Pydantic AI agent. +"""Tool definitions for the async harness Pydantic AI agent. Pydantic AI tools are registered directly on the Agent via decorators -(see project.agent). This module hosts the bare functions so they're -easy to unit-test in isolation. +(see project.agent). This module hosts the bare function so it is easy to +unit-test in isolation. """ from __future__ import annotations diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/pyproject.toml b/examples/tutorials/10_async/00_base/110_pydantic_ai/pyproject.toml index f5cd32e0a..257918014 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/pyproject.toml +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "ab110-pydantic-ai" version = "0.1.0" -description = "An async Pydantic AI agent with tool calling and Redis streaming" +description = "An async Pydantic AI harness test agent using the unified emitter surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ diff --git a/examples/tutorials/10_async/00_base/110_pydantic_ai/tests/test_agent.py b/examples/tutorials/10_async/00_base/110_pydantic_ai/tests/test_agent.py index a31322d30..ce573a697 100644 --- a/examples/tutorials/10_async/00_base/110_pydantic_ai/tests/test_agent.py +++ b/examples/tutorials/10_async/00_base/110_pydantic_ai/tests/test_agent.py @@ -1,8 +1,10 @@ -"""Tests for the async Pydantic AI agent. +"""Live tests for the async Pydantic AI agent. -This test suite validates: -- Non-streaming event sending and polling -- Streaming event sending +These tests require a running agent (server + deployed agent) and exercise the +unified-surface async handler end-to-end over the wire. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in the SDK repo under ``tests/lib/core/harness/`` (the pydantic-ai async suite). To run these tests: 1. Make sure the agent is running (via docker-compose or `agentex agents run`) @@ -53,14 +55,12 @@ async def agent_id(client, agent_name): class TestNonStreamingEvents: - """Test non-streaming event sending and polling.""" + """Test non-streaming event sending through the unified auto_send_turn path.""" @pytest.mark.asyncio async def test_send_event(self, client: AsyncAgentex, agent_id: str): - """Test sending an event to the async Pydantic AI agent.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + """Test sending an event to the async harness Pydantic AI agent.""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None @@ -77,9 +77,7 @@ async def test_send_event(self, client: AsyncAgentex, agent_id: str): @pytest.mark.asyncio async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): """Test that the agent can use tools (e.g., weather tool).""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None @@ -100,9 +98,7 @@ class TestStreamingEvents: @pytest.mark.asyncio async def test_send_event_and_stream(self, client: AsyncAgentex, agent_id: str): """Test sending an event and streaming the response.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/.dockerignore b/examples/tutorials/10_async/00_base/120_openai_agents/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/Dockerfile b/examples/tutorials/10_async/00_base/120_openai_agents/Dockerfile similarity index 64% rename from examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/Dockerfile rename to examples/tutorials/10_async/00_base/120_openai_agents/Dockerfile index 1272027cf..76fe0fdef 100644 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/Dockerfile +++ b/examples/tutorials/10_async/00_base/120_openai_agents/Dockerfile @@ -23,16 +23,16 @@ RUN uv pip install --system --upgrade pip setuptools wheel ENV UV_HTTP_TIMEOUT=1000 # Copy pyproject.toml and README.md to install dependencies -COPY 10_async/00_base/120_openai_agents_local_sandbox/pyproject.toml /app/120_openai_agents_local_sandbox/pyproject.toml -COPY 10_async/00_base/120_openai_agents_local_sandbox/README.md /app/120_openai_agents_local_sandbox/README.md +COPY 10_async/00_base/120_openai_agents/pyproject.toml /app/120_openai_agents/pyproject.toml +COPY 10_async/00_base/120_openai_agents/README.md /app/120_openai_agents/README.md -WORKDIR /app/120_openai_agents_local_sandbox +WORKDIR /app/120_openai_agents # Copy the project code -COPY 10_async/00_base/120_openai_agents_local_sandbox/project /app/120_openai_agents_local_sandbox/project +COPY 10_async/00_base/120_openai_agents/project /app/120_openai_agents/project # Copy the test files -COPY 10_async/00_base/120_openai_agents_local_sandbox/tests /app/120_openai_agents_local_sandbox/tests +COPY 10_async/00_base/120_openai_agents/tests /app/120_openai_agents/tests # Copy shared test utilities COPY test_utils /app/test_utils @@ -44,7 +44,7 @@ RUN uv pip install --system .[dev] pytest-asyncio httpx ENV PYTHONPATH=/app # Set test environment variables -ENV AGENT_NAME=ab120-openai-agents-local-sandbox +ENV AGENT_NAME=ab120-openai-agents # Run the agent using uvicorn CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/README.md b/examples/tutorials/10_async/00_base/120_openai_agents/README.md new file mode 100644 index 000000000..0b55b00a2 --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/README.md @@ -0,0 +1,33 @@ +# Async OpenAI Agents on the unified harness surface + +An async (Redis-streaming) Agentex agent that runs the OpenAI Agents SDK and +delivers its output through the **unified harness surface**. + +## What this demonstrates + +Same `OpenAITurn` adapter as the sync tutorial (`050_openai_agents`), but the +async ACP pushes the turn to the task stream via +`UnifiedEmitter.auto_send_turn` instead of yielding over HTTP. `auto_send_turn` +returns a `TurnResult` with the accumulated final text and normalized usage. + +```python +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, parent_span_id=parent_span_id) +turn_result = await emitter.auto_send_turn(turn) +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +## Test it + +The offline test exercises the auto-send delivery path with an injected fake +streaming backend (no server, Redis, or API key required): + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/manifest.yaml b/examples/tutorials/10_async/00_base/120_openai_agents/manifest.yaml similarity index 64% rename from examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/manifest.yaml rename to examples/tutorials/10_async/00_base/120_openai_agents/manifest.yaml index e0c3c0596..bd8d5cce5 100644 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/manifest.yaml +++ b/examples/tutorials/10_async/00_base/120_openai_agents/manifest.yaml @@ -2,10 +2,10 @@ build: context: root: ../../../ include_paths: - - 10_async/00_base/120_openai_agents_local_sandbox + - 10_async/00_base/120_openai_agents - test_utils - dockerfile: 10_async/00_base/120_openai_agents_local_sandbox/Dockerfile - dockerignore: 10_async/00_base/120_openai_agents_local_sandbox/.dockerignore + dockerfile: 10_async/00_base/120_openai_agents/Dockerfile + dockerignore: 10_async/00_base/120_openai_agents/.dockerignore local_development: agent: @@ -16,8 +16,8 @@ local_development: agent: acp_type: async - name: ab120-openai-agents-local-sandbox - description: An async OpenAI Agents SDK agent using a local (unix_local) sandbox + name: ab120-openai-agents + description: An async OpenAI Agents SDK agent on the unified harness surface temporal: enabled: false @@ -39,9 +39,6 @@ agent: secret_name: sgp-client-base-url secret_key: url - env: - OPENAI_AGENTS_DISABLE_TRACING: "1" - deployment: image: repository: "" @@ -49,8 +46,8 @@ deployment: global: agent: - name: "ab120-openai-agents-local-sandbox" - description: "An async OpenAI Agents SDK agent using a local (unix_local) sandbox" + name: "ab120-openai-agents" + description: "An async OpenAI Agents SDK agent on the unified harness surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/project/__init__.py b/examples/tutorials/10_async/00_base/120_openai_agents/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/project/acp.py b/examples/tutorials/10_async/00_base/120_openai_agents/project/acp.py new file mode 100644 index 000000000..fcd10cc62 --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/project/acp.py @@ -0,0 +1,98 @@ +"""ACP handler for the async OpenAI Agents harness tutorial. + +Uses the async ACP model with Redis streaming instead of HTTP yields. The +OpenAI Agents SDK run is wrapped in an ``OpenAITurn`` and pushed to the task +stream via ``UnifiedEmitter.auto_send_turn`` — the async/temporal delivery path +of the unified harness surface. ``auto_send_turn`` returns a ``TurnResult`` +carrying the accumulated final text and normalized usage. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agents import Runner + +from agentex.lib import adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + +_agent = None + + +def get_agent(): + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + logger.info(f"Task created: {params.task.id}") + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle each user message: run the agent and auto-send its turn.""" + agent = get_agent() + task_id = params.task.id + user_message = params.event.content.content + + logger.info(f"Processing message for task {task_id}") + + # Echo the user's message into the task history. + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + result = Runner.run_streamed(starting_agent=agent, input=user_message) + turn = OpenAITurn(result=result, model=MODEL_NAME) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn_result = await emitter.auto_send_turn(turn) + if turn_span: + turn_span.output = {"final_output": turn_result.final_text} + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info(f"Task canceled: {params.task.id}") diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/project/agent.py b/examples/tutorials/10_async/00_base/120_openai_agents/project/agent.py new file mode 100644 index 000000000..5b83c5aab --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/project/agent.py @@ -0,0 +1,43 @@ +"""OpenAI Agents SDK agent definition for the async harness tutorial. + +Identical agent shape to the sync tutorial (060). The only difference is the +delivery path in acp.py: the async ACP uses ``UnifiedEmitter.auto_send_turn`` +(Redis streaming) instead of yielding events over an HTTP response. +""" + +from __future__ import annotations + +from datetime import datetime + +from agents import Agent, function_tool, set_tracing_disabled + +from project.tools import get_weather + +set_tracing_disabled(True) + +MODEL_NAME = "gpt-4o" +INSTRUCTIONS = """You are a helpful AI assistant with access to tools. + +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use the weather tool when the user asks about the weather +- Always report the real tool output back to the user +""" + + +@function_tool +def weather(city: str) -> str: + """Get the current weather for a city.""" + return get_weather(city) + + +def create_agent() -> Agent: + """Build and return the OpenAI Agents SDK agent with the weather tool.""" + return Agent( + name="Harness OpenAI Assistant", + model=MODEL_NAME, + instructions=INSTRUCTIONS.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + tools=[weather], + ) diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/project/tools.py b/examples/tutorials/10_async/00_base/120_openai_agents/project/tools.py new file mode 100644 index 000000000..d2e5468c9 --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/project/tools.py @@ -0,0 +1,15 @@ +"""Tool definitions for the async OpenAI Agents harness tutorial.""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/pyproject.toml b/examples/tutorials/10_async/00_base/120_openai_agents/pyproject.toml similarity index 75% rename from examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/pyproject.toml rename to examples/tutorials/10_async/00_base/120_openai_agents/pyproject.toml index 75c6254f3..f48fab49f 100644 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/pyproject.toml +++ b/examples/tutorials/10_async/00_base/120_openai_agents/pyproject.toml @@ -3,15 +3,15 @@ requires = ["hatchling"] build-backend = "hatchling.build" [project] -name = "ab120-openai-agents-local-sandbox" +name = "ab120-openai-agents" version = "0.1.0" -description = "An async OpenAI Agents SDK agent using a local (unix_local) sandbox" +description = "An async OpenAI Agents SDK agent on the unified harness surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ "agentex-sdk", "scale-gp", - "openai-agents>=0.14.3,<0.15", + "openai-agents", ] [project.optional-dependencies] diff --git a/examples/tutorials/10_async/00_base/120_openai_agents/tests/test_agent.py b/examples/tutorials/10_async/00_base/120_openai_agents/tests/test_agent.py new file mode 100644 index 000000000..ceb95dbab --- /dev/null +++ b/examples/tutorials/10_async/00_base/120_openai_agents/tests/test_agent.py @@ -0,0 +1,77 @@ +"""Offline test for the async OpenAI Agents harness tutorial. + +This test does NOT require a running Agentex server, Redis, or an OpenAI API +key. It verifies the async delivery path this tutorial demonstrates: an +``OpenAITurn`` built from an injected canonical stream, pushed through +``UnifiedEmitter.auto_send_turn`` with an injected fake streaming backend, +returns the accumulated final text. + +To run: ``pytest tests/test_agent.py -v`` +""" + +from __future__ import annotations + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + +class _FakeCtx: + def __init__(self, initial_content): + self.task_message = TaskMessage(id="m-1", task_id="task-1", content=initial_content) + + async def __aenter__(self): + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + pass + + async def stream_update(self, update): + return update + + +class _FakeStreaming: + def streaming_task_message_context(self, task_id, initial_content, **_kwargs): # noqa: ARG002 + return _FakeCtx(initial_content) + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_auto_send_turn_returns_final_text(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=_FakeStreaming(), + ) + + result = await emitter.auto_send_turn(turn) + assert result.final_text == "Hello" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/README.md b/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/README.md deleted file mode 100644 index 58d422b39..000000000 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/README.md +++ /dev/null @@ -1,119 +0,0 @@ -# Tutorial 120: Async OpenAI Agents SDK with a Local Sandbox - -This tutorial demonstrates how to build an **async (non-Temporal)** agent on AgentEx -using the [OpenAI Agents SDK](https://developers.openai.com/api/docs/guides/agents) -and its **sandbox** runtime, running with the **local** (`unix_local`) backend. - -The agent is a "local sandbox assistant": it answers questions by actually running -real shell commands (e.g. `python3 --version`, `ls /tmp`, `python3 -c "..."`) -instead of guessing. - -This mirrors the Pydantic AI async tutorial (`110_pydantic_ai`): same async ACP -model (`acp_type: async`, `temporal.enabled: false`), same per-task `adk.state` -multi-turn memory pattern. The difference is the runtime — here we use the OpenAI -Agents SDK `SandboxAgent` with the local sandbox backend. - -## Key Concepts - -### Async ACP (base) -The async ACP model is event-driven: `on_task_create` initializes per-task state, -and `on_task_event_send` handles each user message. Conversation history is -persisted across turns via `adk.state`. - -### OpenAI Agents SDK Sandbox -The OpenAI Agents SDK ships `agents.sandbox`, which lets you give an agent -**capabilities** (instead of hand-written tools) that the runtime turns into real -tools backed by a sandbox: - -- **`SandboxAgent`**: an `Agent` that is granted sandbox capabilities. -- **Capabilities** (`from agents.sandbox.capabilities import Shell, Filesystem, Memory`): - each capability expands into a set of real tools. This tutorial uses `Shell`, which - lets the model run real shell commands. -- **`SandboxRunConfig`** + a sandbox **client**: tells the runtime *where* the tools - actually execute. - -### The LOCAL sandbox (`UnixLocalSandboxClient`) -This tutorial uses the local backend -(`from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClient, UnixLocalSandboxClientOptions`), -`backend_id="unix_local"`. The local sandbox runs shell commands **ON THE HOST** — -the agent's own container/process. There is **no Docker, no Temporal, and no remote -sandbox infrastructure** involved. - -The sandbox is wired up through the SDK's `RunConfig`: - -```python -from agents import Runner, set_tracing_disabled -from agents.run_config import RunConfig -from agents.sandbox import SandboxAgent, SandboxRunConfig -from agents.sandbox.capabilities import Shell -from agents.sandbox.sandboxes.unix_local import ( - UnixLocalSandboxClient, - UnixLocalSandboxClientOptions, -) - -set_tracing_disabled(True) # avoid api.openai.com tracing 401 behind a gateway - -agent = SandboxAgent( - name="Local Sandbox Assistant", - instructions="...use the shell tools to actually run commands...", - capabilities=[Shell()], -) -run_config = RunConfig( - sandbox=SandboxRunConfig( - client=UnixLocalSandboxClient(), - options=UnixLocalSandboxClientOptions(), - ) -) -result = await Runner.run(agent, input=input_list, run_config=run_config) -print(result.final_output) -``` - -`Runner.run` drives the full tool-call loop internally: the model issues shell -commands, the local sandbox runs them on the host, the output is fed back, and the -loop continues until the model produces a final answer. Because the loop is -self-contained, the async handler runs the agent and persists a single final -`TextContent` rather than streaming tokens. - -## Files - -| File | Description | -|------|-------------| -| `project/acp.py` | Async ACP server + handlers (`adk.state` multi-turn, runs the sandbox agent) | -| `project/agent.py` | `SandboxAgent` + `RunConfig(sandbox=...)` wiring + `run_agent` | -| `project/tools.py` | Sandbox capability factory (`Shell`) | -| `tests/test_agent.py` | Integration tests (polling pattern) | -| `manifest.yaml` | Agent configuration | - -## Running Locally - -```bash -# From this directory -agentex agents run -``` - -Set `OPENAI_API_KEY` (or `LITELLM_API_KEY` if you're behind the Scale LiteLLM -gateway) in your environment or in a `.env` file in `project/` so the agent can call -the model. - -## Running Tests - -```bash -pytest tests/test_agent.py -v -``` - -## Notes - -- **No infra required.** Because this uses the `unix_local` backend, the shell tools - run directly in the agent's process — no Docker daemon, no Temporal, no remote - sandbox. Swap the client for a remote/containerized backend to isolate execution. -- **Tracing.** `set_tracing_disabled(True)` turns off the OpenAI Agents SDK's native - tracer (which would otherwise try to ship traces to `api.openai.com`). The manifest - also sets `OPENAI_AGENTS_DISABLE_TRACING=1`. AgentEx/SGP tracing still runs via the - tracing manager configured in `acp.py` when SGP credentials are present. -- **Capabilities are the tools.** To let the agent do more, add capabilities in - `project/tools.py` (e.g. `Filesystem()`, `Memory()`). - -## Further Reading - -- OpenAI Agents SDK guide: https://developers.openai.com/api/docs/guides/agents -- The Temporal variant of this tutorial: `10_async/10_temporal/120_openai_agents_local_sandbox` diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/acp.py b/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/acp.py deleted file mode 100644 index 6ff475873..000000000 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/acp.py +++ /dev/null @@ -1,149 +0,0 @@ -"""ACP handler for the async OpenAI Agents SDK local-sandbox agent. - -Uses the async ACP model (``acp_type: async``, ``temporal.enabled: false``), -mirroring the Pydantic AI tutorial (110). The difference is the runtime: here we -run an OpenAI Agents SDK ``SandboxAgent`` against the **local** sandbox backend -(``UnixLocalSandboxClient``), which executes real shell commands on the host. - -The OpenAI Agents SDK sandbox runtime drives the full tool-call loop internally -inside ``Runner.run`` (model -> shell command -> output -> model -> ... -> final -answer), so this handler runs the agent and persists a single final -``TextContent`` rather than streaming tokens itself. - -Multi-turn memory is persisted via ``adk.state``: on each turn we load the prior -OpenAI Agents SDK input list from state, run the agent with it, then save the -updated list (``result.to_input_list()``) back. Without this, every turn would be -a fresh stateless run and the agent would forget the prior conversation. -""" - -from __future__ import annotations - -import os -from typing import Any - -from dotenv import load_dotenv - -load_dotenv() - -import agentex.lib.adk as adk -from project.agent import run_agent -from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams -from agentex.lib.types.fastacp import AsyncACPConfig -from agentex.lib.types.tracing import SGPTracingProcessorConfig -from agentex.lib.utils.logging import make_logger -from agentex.types.text_content import TextContent -from agentex.lib.utils.model_utils import BaseModel -from agentex.lib.sdk.fastacp.fastacp import FastACP -from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config - -logger = make_logger(__name__) - -# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client -# compatibility, so the same example works behind the Scale LiteLLM gateway. -_litellm_key = os.environ.get("LITELLM_API_KEY") -if _litellm_key and not os.environ.get("OPENAI_API_KEY"): - os.environ["OPENAI_API_KEY"] = _litellm_key - -add_tracing_processor_config( - SGPTracingProcessorConfig( - sgp_api_key=os.environ.get("SGP_API_KEY", ""), - sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), - sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), - ) -) - -acp = FastACP.create( - acp_type="async", - config=AsyncACPConfig(type="base"), -) - - -class ConversationState(BaseModel): - """Per-task conversation state persisted via ``adk.state``. - - ``input_list`` holds the OpenAI Agents SDK conversation history — the same - structure ``Runner.run`` accepts as input and ``result.to_input_list()`` - returns. Persisting it between turns gives the agent multi-turn memory. - """ - - input_list: list[dict[str, Any]] = [] - turn_number: int = 0 - - -@acp.on_task_create -async def handle_task_create(params: CreateTaskParams): - """Initialize per-task state on task creation. - - A fresh task starts with no message history; the conversation is built up by - ``handle_task_event_send`` on each subsequent user message. - """ - logger.info(f"Task created: {params.task.id}") - await adk.state.create( - task_id=params.task.id, - agent_id=params.agent.id, - state=ConversationState(), - ) - - -@acp.on_task_event_send -async def handle_task_event_send(params: SendEventParams): - """Handle each user message: load prior history, run the agent, save updated history.""" - task_id = params.task.id - agent_id = params.agent.id - user_message = params.event.content.content - - logger.info(f"Processing message for thread {task_id}") - - # Echo the user's message into the task history so it shows up in the UI. - await adk.messages.create(task_id=task_id, content=params.event.content) - - # Load the previous conversation history from state. If state is missing - # (e.g. task wasn't initialised via on_task_create), fall back to a fresh - # one so the agent still responds — just without memory of prior turns. - task_state = await adk.state.get_by_task_and_agent(task_id=task_id, agent_id=agent_id) - if task_state is None: - state = ConversationState() - task_state = await adk.state.create(task_id=task_id, agent_id=agent_id, state=state) - else: - state = ConversationState.model_validate(task_state.state) - - state.turn_number += 1 - state.input_list.append({"role": "user", "content": user_message}) - - async with adk.tracing.span( - trace_id=task_id, - task_id=task_id, - name=f"Turn {state.turn_number}", - input={"message": user_message}, - data={"__span_type__": "AGENT_WORKFLOW"}, - ) as turn_span: - # The OpenAI Agents SDK sandbox runtime runs the full tool-call loop - # internally (model -> shell command on the local host -> output -> - # model -> ... -> final answer), so we get a single final result. - result = await run_agent(state.input_list) - final_output = result.final_output - - # Persist the assistant's final answer as a TaskMessage so it shows up - # in the UI. (Unlike the streaming Pydantic AI tutorial, the sandbox run - # is non-streaming, so we post the final text ourselves.) - await adk.messages.create( - task_id=task_id, - content=TextContent(author="agent", content=final_output), - ) - - # Save the updated message history so the next turn picks up here. - state.input_list = result.to_input_list() - await adk.state.update( - state_id=task_state.id, - task_id=task_id, - agent_id=agent_id, - state=state, - ) - - if turn_span: - turn_span.output = {"final_output": final_output} - - -@acp.on_task_cancel -async def handle_task_canceled(params: CancelTaskParams): - logger.info(f"Task canceled: {params.task.id}") diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/agent.py b/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/agent.py deleted file mode 100644 index 177bb287d..000000000 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/agent.py +++ /dev/null @@ -1,95 +0,0 @@ -"""OpenAI Agents SDK local-sandbox agent definition (async, non-Temporal). - -This mirrors the Pydantic AI tutorial (110): the agent is the boundary between -this module and the API layer (acp.py). The difference is the runtime — here we -use the OpenAI Agents SDK ``SandboxAgent`` together with the **local** sandbox -backend (``UnixLocalSandboxClient``). - -The local sandbox runs shell commands ON THE HOST — the agent's own -container/process. There is no Docker, no Temporal, and no remote sandbox -infrastructure. The OpenAI Agents SDK runs its own tool-call loop internally: -when the model decides to run a shell command, the sandbox executes it locally -and feeds the output back to the model until it produces a final answer. -""" - -from __future__ import annotations - -from datetime import datetime - -from agents import Runner, set_tracing_disabled -from agents.sandbox import SandboxAgent, SandboxRunConfig -from agents.run_config import RunConfig -from agents.sandbox.sandboxes.unix_local import ( - UnixLocalSandboxClient, - UnixLocalSandboxClientOptions, -) - -from project.tools import get_capabilities - -# Disable the openai-agents SDK's native tracer so it doesn't ship traces to -# api.openai.com using OPENAI_API_KEY (which may be a gateway/proxy key and would -# 401). Agentex tracing still runs via the tracing manager configured in acp.py. -set_tracing_disabled(True) - -MODEL_NAME = "gpt-4o-mini" -INSTRUCTIONS = """You are a local sandbox assistant. - -Current date and time: {timestamp} - -You have access to shell tools that run real commands on the local machine. - -Guidelines: -- ALWAYS use the shell tools to actually run commands — never guess or make up - output. If the user asks for the Python version, run `python3 --version`. If - they ask to list files, run `ls`. If they ask you to compute something, use - `python3 -c "..."`. -- Run the minimal command(s) needed to answer the question. -- Report the real command output back to the user, concisely. -""" - - -def create_agent() -> SandboxAgent: - """Build and return the OpenAI Agents SDK sandbox agent. - - The agent is granted shell capabilities (see ``project.tools``). The actual - sandbox backend (where the shell commands run) is supplied at run time via - the ``RunConfig`` returned by ``create_run_config``. - """ - return SandboxAgent( - name="Local Sandbox Assistant", - model=MODEL_NAME, - instructions=INSTRUCTIONS.format( - timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S") - ), - capabilities=get_capabilities(), - ) - - -def create_run_config() -> RunConfig: - """Build the RunConfig that points the agent at the LOCAL sandbox backend. - - ``UnixLocalSandboxClient`` (backend_id="unix_local") runs shell commands on - the host — the agent's own process — so no Docker or remote infra is needed. - """ - return RunConfig( - sandbox=SandboxRunConfig( - client=UnixLocalSandboxClient(), - options=UnixLocalSandboxClientOptions(), - ) - ) - - -async def run_agent(input_list: list) -> "Runner": - """Run the sandbox agent over the conversation so far and return the result. - - The OpenAI Agents SDK handles the full tool-call loop internally: the model - issues shell commands, the local sandbox runs them on the host, and the - output is fed back until the model produces a final answer. - - We pass the full ``input_list`` (prior turns + the new user message) so the - agent has conversation memory across turns; the caller persists - ``result.to_input_list()`` back into ``adk.state`` for the next turn. - """ - agent = create_agent() - run_config = create_run_config() - return await Runner.run(agent, input=input_list, run_config=run_config, max_turns=10) diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/tools.py b/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/tools.py deleted file mode 100644 index a931fa273..000000000 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/project/tools.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Sandbox capabilities for the async OpenAI Agents SDK local-sandbox agent. - -Unlike the Pydantic AI tutorial (110), this agent does not register hand-written -Python functions as tools. Instead it is given *capabilities* — the OpenAI Agents -SDK sandbox runtime turns each capability into a real set of tools (run a shell -command, read a file, etc.) backed by an actual sandbox backend. - -Here we use the ``Shell`` capability, which lets the model run real shell commands. -With the local (``unix_local``) backend those commands execute ON THE HOST — the -agent's own process/container — so there is no Docker, Temporal, or remote infra -involved. This module hosts the capability factory so the agent wiring in -``project.agent`` stays readable and the capability set is easy to extend -(e.g. add ``Filesystem()`` or ``Memory()``). -""" - -from __future__ import annotations - -from agents.sandbox.capabilities import Shell - - -def get_capabilities() -> list: - """Return the sandbox capabilities the agent is allowed to use. - - Returns: - A list of OpenAI Agents SDK sandbox capabilities. We grant ``Shell`` so - the agent can run real shell commands on the local machine. Add - ``Filesystem()`` or ``Memory()`` here to expand what the agent can do. - """ - return [Shell()] diff --git a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/tests/test_agent.py b/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/tests/test_agent.py deleted file mode 100644 index 0c7904eac..000000000 --- a/examples/tutorials/10_async/00_base/120_openai_agents_local_sandbox/tests/test_agent.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Tests for the async OpenAI Agents SDK local-sandbox agent. - -This test suite validates that the agent actually runs shell commands in the -LOCAL sandbox (unix_local backend) by polling for the agent's response: -- Ask for the Python version -> response contains "Python 3" -- Ask it to compute 21 * 2 with python3 -> response contains "42" - -To run these tests: -1. Make sure the agent is running (via docker-compose or `agentex agents run`) -2. Set the AGENTEX_API_BASE_URL environment variable if not using default -3. Run: pytest test_agent.py -v - -Configuration: -- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) -- AGENT_NAME: Name of the agent to test (default: ab120-openai-agents-local-sandbox) -""" - -import os -import uuid - -import pytest -import pytest_asyncio -from test_utils.async_utils import send_event_and_poll_yielding - -from agentex import AsyncAgentex -from agentex.types.agent_rpc_params import ParamsCreateTaskRequest - -AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") -AGENT_NAME = os.environ.get("AGENT_NAME", "ab120-openai-agents-local-sandbox") - - -@pytest_asyncio.fixture -async def client(): - """Create an AsyncAgentex client instance for testing.""" - client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) - yield client - await client.close() - - -@pytest.fixture -def agent_name(): - """Return the agent name for testing.""" - return AGENT_NAME - - -@pytest_asyncio.fixture -async def agent_id(client, agent_name): - """Retrieve the agent ID based on the agent name.""" - agents = await client.agents.list() - for agent in agents: - if agent.name == agent_name: - return agent.id - raise ValueError(f"Agent with name {agent_name} not found.") - - -async def _send_and_collect_agent_text( - client: AsyncAgentex, agent_id: str, task_id: str, user_message: str -) -> str: - """Send a user message and accumulate all agent text responses into a string.""" - parts: list[str] = [] - async for message in send_event_and_poll_yielding( - client=client, - agent_id=agent_id, - task_id=task_id, - user_message=user_message, - timeout=60, - sleep_interval=1.0, - yield_updates=True, - ): - content = message.content - if content and content.type == "text" and content.author == "agent": - if content.content and content.content not in parts: - parts.append(content.content) - return "\n".join(parts) - - -class TestLocalSandboxEvents: - """Test the async local-sandbox OpenAI Agents SDK agent.""" - - @pytest.mark.asyncio - async def test_shell_python_version(self, client: AsyncAgentex, agent_id: str): - """The agent should run `python3 --version` in the local sandbox. - - The sandbox runs on Python 3.12, so the real output contains "Python 3". - """ - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) - task = task_response.result - assert task is not None - - text = await _send_and_collect_agent_text( - client, - agent_id, - task.id, - "Use your shell to print the Python version on this machine, then " - "tell me what it is.", - ) - assert text, "Expected a non-empty response from the sandbox agent." - assert "Python 3" in text - - @pytest.mark.asyncio - async def test_shell_compute(self, client: AsyncAgentex, agent_id: str): - """The agent should use python3 in the sandbox to compute 21 * 2 == 42.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) - task = task_response.result - assert task is not None - - text = await _send_and_collect_agent_text( - client, - agent_id, - task.id, - "Use python3 in your shell to compute 21 * 2 and tell me the result.", - ) - assert text, "Expected a non-empty response from the sandbox agent." - assert "42" in text - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore b/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile b/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile new file mode 100644 index 000000000..e36b9e56d --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +RUN npm install -g @anthropic-ai/claude-code || true + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/00_base/130_claude_code/pyproject.toml /app/130_claude_code/pyproject.toml +COPY 10_async/00_base/130_claude_code/README.md /app/130_claude_code/README.md + +WORKDIR /app/130_claude_code + +COPY 10_async/00_base/130_claude_code/project /app/130_claude_code/project +COPY 10_async/00_base/130_claude_code/tests /app/130_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=ab130-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/130_claude_code/README.md b/examples/tutorials/10_async/00_base/130_claude_code/README.md new file mode 100644 index 000000000..695207c57 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 130 (async/base): Async Claude Code Agent + +This tutorial demonstrates how to build an **async (non-Temporal)** agent that +spawns the Claude Code CLI as a local subprocess and delivers its output through +the Agentex unified harness surface via ``ClaudeCodeTurn`` and +``UnifiedEmitter.auto_send_turn``. + +## Key Concepts + +### Async delivery path + +Unlike the sync tutorial (060), this agent uses the async ACP model. The +``@acp.on_task_event_send`` handler does not return a generator -- instead, +``UnifiedEmitter.auto_send_turn(turn)`` pushes events to the task's Redis +stream in real time and returns a ``TurnResult`` when the turn is complete. +The UI polls or streams that Redis channel independently. + +### ClaudeCodeTurn + UnifiedEmitter + +Same tap as the sync tutorial: +- ``ClaudeCodeTurn`` wraps ``convert_claude_code_to_agentex_events``. +- ``UnifiedEmitter`` wires trace context + chosen delivery. +- ``auto_send_turn`` is the async push path. + +### Local subprocess spawn + +``_spawn_claude`` in ``project/acp.py`` uses ``asyncio.create_subprocess_exec`` +to run: + +``` +claude -p --output-format stream-json --verbose +``` + +The prompt is written to stdin. Stdout is read line by line. + +Production isolation (Scale sandbox, secret injection, MCP configuration) +is the golden agent's concern at +``teams/sgp/agents/golden_agent/project/harness/providers/claude.py``. + +### Injectable spawn seam + +``_spawn_claude`` is a top-level async generator. Tests monkeypatch it to +inject pre-recorded stream-json lines so offline unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | ACP server, ``_spawn_claude`` seam, and event handler | +| ``tests/test_agent.py`` | Live integration tests (needs CLI + API key) | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires the ``claude`` CLI installed and ``ANTHROPIC_API_KEY`` set: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. +- For multi-turn memory, persist the Claude Code session_id from the + ``result`` envelope and pass it to ``claude -r `` on the next turn. diff --git a/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml b/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml new file mode 100644 index 000000000..7d74de7c6 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/130_claude_code + - test_utils + dockerfile: 10_async/00_base/130_claude_code/Dockerfile + dockerignore: 10_async/00_base/130_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab130-claude-code + description: An async Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab130-claude-code" + description: "An async Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/130_claude_code/project/__init__.py b/examples/tutorials/10_async/00_base/130_claude_code/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py b/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py new file mode 100644 index 000000000..b6681f6a8 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py @@ -0,0 +1,149 @@ +"""ACP handler for the async Claude Code tutorial. + +Spawns ``claude -p --output-format stream-json --verbose`` as a LOCAL +asyncio subprocess (no Scale sandbox -- that is the golden agent's +production concern). Stdout lines are fed into ``ClaudeCodeTurn``. Events +are delivered via ``UnifiedEmitter.auto_send_turn``, the async Redis push +path. + +Live runs require the ``claude`` CLI to be installed and an +ANTHROPIC_API_KEY (or equivalent credential) in the environment. +For offline testing, see ``tests/test_agent_offline.py``. +""" + +from __future__ import annotations + +import os +import asyncio +from typing import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + + +async def _spawn_claude(prompt: str) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + Injectable seam: tests monkeypatch this with a fake async iterator of + pre-recorded lines so no real CLI invocation is needed offline. + """ + proc = await asyncio.create_subprocess_exec( + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. + stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + logger.info("Task created: %s", params.task.id) + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle a user message: spawn Claude Code locally and push events to the task stream.""" + task_id = params.task.id + prompt = params.event.content.content + logger.info("Processing message for task %s", task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": prompt}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + result = await emitter.auto_send_turn(turn) + if turn_span: + turn_span.output = {"final_text": result.final_text} + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info("Task canceled: %s", params.task.id) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml b/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml new file mode 100644 index 000000000..66c3cdaf3 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab130-claude-code" +version = "0.1.0" +description = "An async Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "python-dotenv>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py new file mode 100644 index 000000000..ee254da23 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py @@ -0,0 +1,250 @@ +"""Tests for the async Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require the ``claude`` CLI on PATH and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline drives + ``auto_send_turn``, populates usage, and satisfies the ``HarnessTurn`` + protocol. + - Always run -- no CLI or API key needed. +""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-offline-async-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from async Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 12, "output_tokens": 6}, + "cost_usd": 0.0001, + "duration_ms": 300, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI or network.""" + + @pytest.mark.asyncio + async def test_auto_send_text_only_opens_and_closes_context(self): + """auto_send_turn opens and closes exactly one streaming context.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + opened = [s for s in fake_streaming.sink if s[0] == "open"] + closed = [s for s in fake_streaming.sink if s[0] == "close"] + assert len(opened) == 1 + assert len(closed) == 1 + assert opened[0][1] == "text" + + @pytest.mark.asyncio + async def test_auto_send_populates_final_text(self): + """auto_send_turn result carries the agent's reply text.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + assert "Hello from async Claude Code" in result.final_text + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """Usage is populated after the events stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + usage = turn.usage() + assert usage.input_tokens == 12 + assert usage.output_tokens == 6 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear via yield_turn on a ClaudeCodeTurn.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "ab130-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live async tests -- needs the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + @pytest.fixture + def agent_id(self, client, agent_name): + agents = client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent {agent_name!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Create a task, send a message, and poll until a response appears.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + task_id = task.id + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task_id, + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 60 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task_id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + if agent_msgs: + assert len(agent_msgs) >= 1 + return + time.sleep(2) + + raise AssertionError("No agent response received within 60 s") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..ac48474ee --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py @@ -0,0 +1,243 @@ +"""Offline unit tests for the async Claude Code tutorial agent. + +These tests do NOT require the ``claude`` CLI or an ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in +place of the real subprocess spawn and a fake streaming backend, then +assert that the handler drives ``UnifiedEmitter.auto_send_turn`` correctly. + +The injection seam is the ``_spawn_claude`` function in ``project/acp.py``. +""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from async Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 12, "output_tokens": 6}, + "cost_usd": 0.0001, + "duration_ms": 300, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-2"}), + json.dumps( + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "tool_xyz", + "name": "Read", + "input": {"file_path": "/tmp/foo.txt"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_xyz", + "content": "file contents", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Read the file."}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 25, "output_tokens": 10}, + "cost_usd": 0.0003, + "duration_ms": 500, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _run_auto_send(lines: list[str]): + """Drive ClaudeCodeTurn through auto_send_turn with a fake streaming backend.""" + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming.sink + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_text_only_opens_and_closes_context(): + result, sink = await _run_auto_send(_TEXT_ONLY_LINES) + opened = [s for s in sink if s[0] == "open"] + closed = [s for s in sink if s[0] == "close"] + assert len(opened) == 1 + assert len(closed) == 1 + assert opened[0][1] == "text" + + +@pytest.mark.asyncio +async def test_auto_send_populates_final_text(): + result, _ = await _run_auto_send(_TEXT_ONLY_LINES) + assert "Hello from async Claude Code" in result.final_text + + +@pytest.mark.asyncio +async def test_auto_send_usage_is_populated(): + """Usage is populated after the events stream is exhausted. + + UnifiedEmitter.auto_send_turn evaluates turn.usage() eagerly (before + the events are consumed) so the TurnResult.usage reflects a pre-exhaust + snapshot. Test usage directly from the turn after auto_send_turn completes + instead -- the result envelope is populated by the generator being consumed + inside auto_send. + """ + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + fake_streaming = _FakeStreaming() + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + # After auto_send_turn, the events generator is exhausted and + # ClaudeCodeTurn._on_result has been called with the result envelope. + usage = turn.usage() + assert usage.input_tokens == 12 + assert usage.output_tokens == 6 + assert usage.num_llm_calls == 1 + + +@pytest.mark.asyncio +async def test_auto_send_tool_call_opens_two_contexts(): + result, sink = await _run_auto_send(_TOOL_CALL_LINES) + opened = [s for s in sink if s[0] == "open"] + content_types = [s[1] for s in opened] + assert "tool_request" in content_types + assert "text" in content_types + + +@pytest.mark.asyncio +async def test_spawn_seam_concept(): + """Demonstrate the injectable spawn seam pattern used in project/acp.py. + + The ``_spawn_claude`` function is a top-level async generator. A drop-in + replacement can be injected (e.g. via monkeypatch) to supply pre-recorded + lines without spawning the real CLI. This test proves the pattern works + end-to-end without importing the full ACP module. + """ + called: list[str] = [] + + async def _fake_spawn(prompt: str) -> AsyncIterator[str]: + called.append(prompt) + for line in _TEXT_ONLY_LINES: + yield line + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_spawn("ping")) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert called == ["ping"] + assert "Hello from async Claude Code" in result.final_text diff --git a/examples/tutorials/10_async/00_base/140_codex/.dockerignore b/examples/tutorials/10_async/00_base/140_codex/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/140_codex/Dockerfile b/examples/tutorials/10_async/00_base/140_codex/Dockerfile new file mode 100644 index 000000000..0dd839d8c --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/Dockerfile @@ -0,0 +1,45 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install the codex CLI: the agent spawns `codex exec --json`, so the binary +# must be present on PATH in the image. +RUN npm install -g @openai/codex + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/00_base/140_codex/pyproject.toml /app/140_codex/pyproject.toml +COPY 10_async/00_base/140_codex/README.md /app/140_codex/README.md + +WORKDIR /app/140_codex + +COPY 10_async/00_base/140_codex/project /app/140_codex/project +COPY 10_async/00_base/140_codex/tests /app/140_codex/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app +ENV AGENT_NAME=ab140-codex + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/140_codex/README.md b/examples/tutorials/10_async/00_base/140_codex/README.md new file mode 100644 index 000000000..a00ddb562 --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/README.md @@ -0,0 +1,40 @@ +# 140_codex (async base) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, +`CodexTurn`, and `UnifiedEmitter` for an **async** (Redis-streaming, no Temporal) +ACP agent. + +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox). +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to Redis via + `UnifiedEmitter.auto_send_turn`, so the UI receives tokens in real time. +- Persisting the codex thread ID in `adk.state` so subsequent turns resume the + same codex session via `codex exec resume `. + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. + +## Running offline unit tests + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/10_async/00_base/140_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/140_codex/conftest.py b/examples/tutorials/10_async/00_base/140_codex/conftest.py new file mode 100644 index 000000000..bdd78994b --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/conftest.py @@ -0,0 +1,12 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so the FastACP and tracing modules +can be imported without a running agent server. +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +os.environ.setdefault("ACP_URL", "http://localhost:8000") diff --git a/examples/tutorials/10_async/00_base/140_codex/manifest.yaml b/examples/tutorials/10_async/00_base/140_codex/manifest.yaml new file mode 100644 index 000000000..be020b141 --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/140_codex + - test_utils + dockerfile: 10_async/00_base/140_codex/Dockerfile + dockerignore: 10_async/00_base/140_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab140-codex + description: Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab140-codex" + description: "Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/140_codex/project/__init__.py b/examples/tutorials/10_async/00_base/140_codex/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/140_codex/project/acp.py b/examples/tutorials/10_async/00_base/140_codex/project/acp.py new file mode 100644 index 000000000..0233c49ab --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/project/acp.py @@ -0,0 +1,230 @@ +"""Async (base) ACP handler for the Codex CLI harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for an async (Redis-streaming) ACP agent without Temporal. + +The handler: +1. Spawns ``codex exec --json`` as a LOCAL asyncio subprocess (no sandbox). + This is correct for tutorials and local development; production isolation + is handled by the golden agent's Scale sandbox at + ``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. +2. Wraps the stdout line stream in a ``CodexTurn``. +3. Delivers every canonical ``StreamTaskMessage*`` event to Redis via + ``UnifiedEmitter.auto_send_turn``, so the UI receives tokens in real time. +4. Multi-turn memory is persisted via ``adk.state``. + +Live runs require: +- ``codex`` CLI on PATH (``npm install -g @openai/codex``) +- ``OPENAI_API_KEY`` set in the environment +""" + +from __future__ import annotations + +import os +import time +import codecs +import asyncio +from collections.abc import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import CodexTurn +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +class ConversationState(BaseModel): + """Per-task conversation state persisted via ``adk.state``. + + We store the codex session/thread ID so subsequent turns can resume the + same codex session via ``codex exec resume ``. + """ + + codex_thread_id: str | None = None + turn_number: int = 0 + + +async def _spawn_codex( + model: str, + thread_id: str | None = None, +) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + When ``thread_id`` is provided the subcommand becomes + ``codex exec ... resume -`` so codex continues the prior + conversation thread. + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. + """ + base_flags = [ + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + ] + + if thread_id: + cmd = ["codex", "exec", *base_flags, "resume", thread_id, "-"] + else: + cmd = ["codex", "exec", *base_flags, "-"] + + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. + """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + """Initialize per-task state on task creation.""" + logger.info("Task created: %s", params.task.id) + await adk.state.create( + task_id=params.task.id, + agent_id=params.agent.id, + state=ConversationState(), + ) + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle each user message: spawn codex, stream events, save thread ID.""" + task_id = params.task.id + agent_id = params.agent.id + user_message = params.event.content.content + + logger.info("Processing message for task %s", task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + task_state = await adk.state.get_by_task_and_agent(task_id=task_id, agent_id=agent_id) + if task_state is None: + state = ConversationState() + task_state = await adk.state.create(task_id=task_id, agent_id=agent_id, state=state) + else: + state = ConversationState.model_validate(task_state.state) + + state.turn_number += 1 + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name=f"Turn {state.turn_number}", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + start_ms = int(time.monotonic() * 1000) + + process = await _spawn_codex(MODEL, thread_id=state.codex_thread_id) + + assert process.stdin is not None + process.stdin.write(user_message.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn( + events=_process_stdout(process), + model=MODEL, + ) + + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + result = await emitter.auto_send_turn(turn) + + await process.wait() + + # Record the real wall-clock duration AFTER streaming completes; setting + # it before the stream ran would capture only subprocess spawn overhead. + turn.duration_ms = int(time.monotonic() * 1000) - start_ms + + # Persist the new thread ID so subsequent turns resume the same session. + usage = turn.usage() + if usage.model: + # usage() is valid now that the stream is exhausted + pass + # Persist the codex session id (public accessor; valid post-stream) so the + # next turn resumes the same session. + if turn.session_id: + state.codex_thread_id = turn.session_id + + await adk.state.update( + state_id=task_state.id, + task_id=task_id, + agent_id=agent_id, + state=state, + ) + + if turn_span: + turn_span.output = { + "final_text": result.final_text, + "model": usage.model, + } + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info("Task canceled: %s", params.task.id) diff --git a/examples/tutorials/10_async/00_base/140_codex/pyproject.toml b/examples/tutorials/10_async/00_base/140_codex/pyproject.toml new file mode 100644 index 000000000..bdf7c462f --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab140-codex" +version = "0.1.0" +description = "Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/10_async/00_base/140_codex/tests/test_agent.py b/examples/tutorials/10_async/00_base/140_codex/tests/test_agent.py new file mode 100644 index 000000000..68ca5aded --- /dev/null +++ b/examples/tutorials/10_async/00_base/140_codex/tests/test_agent.py @@ -0,0 +1,188 @@ +"""Tests for the async (base) Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH and ``OPENAI_API_KEY`` set. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexHandler``): + - Inject a fake async iterator of pre-recorded codex event lines. + - Assert ``CodexTurn`` + ``UnifiedEmitter.auto_send_turn`` is driven correctly. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-xyz"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hi"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hi there!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 8, "output_tokens": 4, "total_tokens": 12}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class TestOfflineCodexHandler: + """Unit tests that run without a real codex CLI or network.""" + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """CodexTurn.usage() returns non-None tokens after stream is exhausted.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + collected = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 8 + assert usage.output_tokens == 4 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_auto_send_turn_drives_unified_surface(self): + """auto_send_turn returns a TurnResult with the final text.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message import TaskMessage + from agentex.types.text_content import TextContent + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + real_task_msg = TaskMessage( + id="msg-fake", + task_id="t", + content=TextContent(type="text", author="agent", content=""), + ) + + fake_streaming = MagicMock() + fake_ctx = AsyncMock() + fake_ctx.__aenter__ = AsyncMock(return_value=fake_ctx) + fake_ctx.__aexit__ = AsyncMock(return_value=False) + fake_ctx.stream_update = AsyncMock(return_value=MagicMock()) + fake_ctx.close = AsyncMock() + fake_ctx.task_message = real_task_msg + fake_streaming.streaming_task_message_context = MagicMock(return_value=fake_ctx) + + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + + result = await emitter.auto_send_turn(turn) + assert result is not None + + @pytest.mark.asyncio + async def test_session_id_captured_after_stream(self): + """CodexTurn._result captures the session_id from thread.started.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + _ = [e async for e in turn.events] + + assert turn._result is not None + assert turn._result["session_id"] == "thread-xyz" + + @pytest.mark.asyncio + async def test_yield_turn_is_passthrough(self): + """yield_turn mode also works with CodexTurn (no streaming infra needed).""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0 + + +# --------------------------------------------------------------------------- +# Live tests +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "ab140-codex") + + +@pytest.mark.skipif( + not LIVE, + reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY are available", +) +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI and a running Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_id(self, client): + for agent in client.agents.list(): + if agent.name == AGENT_NAME: + return agent.id + raise ValueError(f"Agent {AGENT_NAME!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Async agents process events out of band, so create a task, send an + event, and poll the task's messages for the agent's response.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task.id, + content=TextContentParam( + author="user", + content="What is 3+3? Reply with just the number.", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 60 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task.id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + if agent_msgs: + assert len(agent_msgs) >= 1 + return + time.sleep(2) + + raise AssertionError("No agent response received within 60 s") diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/README.md b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/README.md index b221c1238..66466693b 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/README.md +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/README.md @@ -1,153 +1,59 @@ -# Tutorial 110 (temporal): Pydantic AI Agent +# Temporal Pydantic AI Agent -This tutorial demonstrates a **durable** Pydantic AI agent on AgentEx, backed by Temporal: -- Workflow state survives crashes mid-conversation (Temporal replay) -- Every LLM call and every tool call becomes its own Temporal activity (independent retries + observability) -- Streaming via Redis still works — token-by-token deltas appear in the UI in real time +A minimal **Temporal-backed** Pydantic AI agent that drives the **unified +harness surface** (`UnifiedEmitter.auto_send_turn` + `PydanticAITurn`) from +inside the model activity's `event_stream_handler`. -This is the Temporal counterpart to the async base tutorial at [`10_async/00_base/110_pydantic_ai/`](../../00_base/110_pydantic_ai/). +## Why this agent exists -## Why Temporal? Why not just async? +This agent calls `emitter.auto_send_turn(...)` **explicitly** inside +the `event_stream_handler`, making the unified-surface wiring visible and giving +the temporal channel direct coverage. -In async base 110, the agent state lives in memory inside the ACP process. If that process dies mid-LLM-call, the in-flight turn is lost. Temporal fixes this by: +## How it wires the unified surface -1. Recording every external interaction (LLM call, tool call) to a durable event log. -2. On worker restart, **replaying** the workflow code, using cached activity results to skip work that already finished. -3. Letting workflows live forever — multi-day conversations or human-in-the-loop flows just work. - -## Architecture at a glance - -Two long-running processes plus shared infrastructure: - -``` -┌──────────────────────────┐ ┌──────────────────────────┐ -│ uvicorn project.acp:acp │ │ python -m run_worker │ -│ (HTTP shim, forwards │ │ (executes workflows + │ -│ signals to Temporal) │ │ activities) │ -└──────────────────────────┘ └──────────────────────────┘ - │ │ - └────► Temporal server ◄───────────┘ - (event log + queue) - - Redis ◄─── activities push deltas - │ - └─── Agentex API tails ──► UI client -``` - -The HTTP server is a thin shim that translates `task/event/send` into Temporal signals. The worker is where your agent code actually runs. Temporal sits in between, recording everything. - -## Key code patterns - -### `project/agent.py` — wrap the base agent in `TemporalAgent` - -```python -base_agent = Agent(MODEL_NAME, deps_type=TaskDeps, system_prompt=...) -base_agent.tool_plain(get_weather) - -temporal_agent = TemporalAgent( - base_agent, - name="at110_pydantic_ai_agent", - event_stream_handler=event_handler, # streams to Redis from inside the model activity -) -``` - -`TemporalAgent` (from `pydantic_ai.durable_exec.temporal`) wraps a normal Pydantic AI Agent so that: -- Each LLM call runs in its own activity -- Each tool call runs in its own activity -- The wrapping is invisible to the workflow code that calls `temporal_agent.run(...)` - -### `project/workflow.py` — declare `__pydantic_ai_agents__` +In `project/agent.py`, the `event_stream_handler` runs inside the model activity +and constructs a `UnifiedEmitter` from `RunContext.deps`: ```python -@workflow.defn(name=environment_variables.WORKFLOW_NAME) -class At110PydanticAiWorkflow(BaseWorkflow): - __pydantic_ai_agents__ = [temporal_agent] # ← discovered by PydanticAIPlugin - - @workflow.signal(name=SignalName.RECEIVE_EVENT) - async def on_task_event_send(self, params): - await adk.messages.create(task_id=params.task.id, content=params.event.content) - result = await temporal_agent.run( - params.event.content.content, - deps=TaskDeps(task_id=params.task.id), - ) +async def event_handler(run_context, events): + emitter = UnifiedEmitter( + task_id=run_context.deps.task_id, + trace_id=run_context.deps.task_id, + parent_span_id=run_context.deps.parent_span_id, + ) + turn = PydanticAITurn(events, model=MODEL_NAME, coalesce_tool_requests=True) + await emitter.auto_send_turn(turn) ``` -The `__pydantic_ai_agents__` attribute is how `PydanticAIPlugin` discovers which activities to register on the worker — no manual activity list needed. - -### `project/acp.py` — no handlers, just plugin wiring - -```python -acp = FastACP.create( - acp_type="async", - config=TemporalACPConfig( - type="temporal", - temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), - plugins=[PydanticAIPlugin()], - ), -) -``` - -When `type="temporal"`, FastACP auto-wires HTTP → workflow signals. You don't define `@acp.on_task_event_send` anywhere — Temporal handles it. - -### `project/run_worker.py` — boot the worker with the plugin - -```python -worker = AgentexWorker( - task_queue=task_queue_name, - plugins=[PydanticAIPlugin()], -) -await worker.run( - activities=get_all_activities(), - workflow=At110PydanticAiWorkflow, -) -``` - -`get_all_activities()` returns the built-in Agentex activities (state, messages, streaming, tracing). Pydantic AI's per-agent activities are auto-added by the plugin. - -## Files - -| File | Purpose | -|------|---------| -| `project/acp.py` | Thin HTTP shim — `FastACP.create(type="temporal", ...)` | -| `project/workflow.py` | `@workflow.defn` class with the signal handler | -| `project/agent.py` | Base Pydantic AI Agent wrapped in `TemporalAgent` | -| `project/tools.py` | Tool functions (must be `async` for Temporal compatibility) | -| `project/run_worker.py` | Worker boot script (separate process) | -| `tests/test_agent.py` | End-to-end test verifying tool round-trips | -| `manifest.yaml` | Sets `temporal.enabled: true` and declares workflow + queue name | - -## Running Locally - -You'll need three terminals open (this is the price of Temporal): - -```bash -# Terminal 1 — backend services (separate repo) -cd ~/scale-agentex/agentex -make dev # brings up Temporal, Redis, Postgres, Agentex API - -# Terminal 2 — this tutorial (ACP server + Temporal worker) -cd ~/scale-agentex-python/examples/tutorials/10_async/10_temporal/110_pydantic_ai -agentex agents run # this also launches the worker process - -# Terminal 3 — tests -cd ~/scale-agentex-python/examples/tutorials/10_async/10_temporal/110_pydantic_ai -uv run pytest tests/test_agent.py -v -``` - -Watch the Temporal UI at http://localhost:8233 — you'll see workflow executions, signal events, and one activity per LLM call + one per tool call. - -## Sync vs Async vs Temporal — How the code differs - -| Concern | Sync (040) | Async base (110) | Temporal (this one) | -|---|---|---|---| -| `project/acp.py` | `@acp.on_message_send` yields events | `@acp.on_task_event_send` pushes to Redis | **No handlers** — `FastACP.create(type="temporal", ...)` | -| Where the agent runs | In the ACP HTTP process | In the ACP HTTP process | In a separate worker process | -| Durability | Ephemeral — request-scoped | Ephemeral — process-scoped | **Durable** — survives worker restarts via Temporal replay | -| Per-call retries | None | None | Each model + tool call automatically retried by Temporal | -| Code we add | — | `acp.py` handler | `workflow.py`, `run_worker.py`, wrap agent in `TemporalAgent` | - -## Notes - -- Multi-turn conversation memory is not wired here. Workflow state (`self._turn_number`) is durable, but message history isn't currently threaded into `temporal_agent.run(..., message_history=...)`. To add: load via `adk.messages.list(task_id=...)` inside the signal handler and pass through. -- Reasoning/thinking tokens are not exercised by `gpt-4o-mini`. Swap to a reasoning-capable model to exercise that branch end-to-end. -- Tools must be `async` (Pydantic AI's Temporal integration requires it — sync tools would run in threads, breaking Temporal's determinism guarantees). +- The handler runs inside a Temporal activity, so it can freely make + non-deterministic Redis + tracing writes. +- `coalesce_tool_requests=True` is required on the auto_send path until + AGX1-377 lands. +- `deps` (set by `project/workflow.py`) threads the `task_id` and the per-turn + `parent_span_id` into the handler so tool spans nest under the workflow's turn + span. + +## Structure + +- `project/acp.py` — thin ACP server; FastACP auto-wires HTTP routes to the + workflow when `TemporalACPConfig` is used. +- `project/agent.py` — base `Agent` + `TemporalAgent` + the unified-surface + `event_stream_handler`. +- `project/workflow.py` — durable workflow; each turn delegates to + `temporal_agent.run(...)`. +- `project/run_worker.py` — Temporal worker entry point. +- `project/tools.py` — async `get_weather(city)` returning a constant. +- `tests/test_agent.py` — live integration test (requires Temporal + Redis + + ACP server + worker). + +## Tools + +- `get_weather(city: str) -> str` (async): returns a fixed "sunny and 72°F" + string. Each tool call becomes its own Temporal activity. + +## Offline coverage + +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no Temporal server) live in the SDK repo under +`tests/lib/core/harness/` (the pydantic-ai temporal suite). diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/manifest.yaml b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/manifest.yaml index 15d00076f..7ca454b05 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/manifest.yaml +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/manifest.yaml @@ -18,7 +18,7 @@ local_development: agent: acp_type: async name: at110-pydantic-ai - description: A Temporal-backed Pydantic AI agent with tool calling and Redis streaming + description: A Temporal-backed Pydantic AI harness test agent using the unified emitter surface temporal: enabled: true @@ -42,8 +42,6 @@ agent: - env_var_name: SGP_CLIENT_BASE_URL secret_name: sgp-client-base-url secret_key: url - # env: - # OPENAI_BASE_URL: "https://your-litellm-proxy/v1" deployment: image: @@ -53,7 +51,7 @@ deployment: global: agent: name: "at110-pydantic-ai" - description: "A Temporal-backed Pydantic AI agent" + description: "A Temporal-backed Pydantic AI harness test agent using the unified emitter surface" replicaCount: 1 resources: requests: diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/acp.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/acp.py index dacb45ad6..c142dcf70 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/acp.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/acp.py @@ -1,7 +1,7 @@ -"""ACP server for the Temporal Pydantic AI tutorial. +"""ACP server for the Temporal harness Pydantic AI test agent. -This file is intentionally thin. When ``acp_type="async"`` is combined -with ``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: +This file is intentionally thin. When ``acp_type="async"`` is combined with +``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: HTTP task/create → @workflow.run on the workflow class HTTP task/event/send → @workflow.signal(SignalName.RECEIVE_EVENT) diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/agent.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/agent.py index a33a317cc..4e59688ce 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/agent.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/agent.py @@ -1,18 +1,20 @@ -"""Pydantic AI agent definition for the Temporal tutorial. +"""Pydantic AI agent definition for the Temporal harness test agent. This module constructs the base ``pydantic_ai.Agent`` once at import time, registers tools on it, and wraps it in ``TemporalAgent`` from ``pydantic_ai.durable_exec.temporal``. -The ``TemporalAgent`` wrapper makes every model call and every tool call -run as a Temporal activity automatically. The workflow code stays -deterministic; the non-deterministic work (LLM HTTP calls, tool execution) -moves into recorded activities. - -Streaming back to Agentex happens via ``event_stream_handler``, which -receives Pydantic AI ``AgentStreamEvent``s from inside the model activity -and forwards them to Redis using our existing ``stream_pydantic_ai_events`` -helper. The ``task_id`` is threaded into the handler via ``deps``. +The ``TemporalAgent`` wrapper makes every model call and every tool call run as +a Temporal activity automatically. The workflow stays deterministic; the +non-deterministic work (LLM HTTP calls, tool execution) moves into recorded +activities. + +Streaming back to Agentex happens via ``event_stream_handler``, which receives +Pydantic AI ``AgentStreamEvent``s from inside the model activity and forwards +them through the UNIFIED HARNESS SURFACE (``UnifiedEmitter.auto_send_turn`` + +``PydanticAITurn``) — called directly rather than via ``stream_pydantic_ai_events``. +The ``task_id`` and per-turn ``parent_span_id`` are threaded into the handler +via ``deps``. """ from __future__ import annotations @@ -26,10 +28,10 @@ from pydantic_ai.durable_exec.temporal import TemporalAgent from project.tools import get_weather -from agentex.lib.adk import ( - stream_pydantic_ai_events, - create_pydantic_ai_tracing_handler, -) +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +__all__ = ["TaskDeps", "temporal_agent", "base_agent", "MODEL_NAME"] MODEL_NAME = "openai:gpt-4o-mini" SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. @@ -48,13 +50,13 @@ class TaskDeps(BaseModel): """Per-run dependencies passed into the agent via ``deps=``. Pydantic AI's ``RunContext.deps`` is the canonical place to thread - request-scoped data (like the Agentex task_id) into tools and - event handlers — including code that runs inside Temporal activities. + request-scoped data (like the Agentex task_id) into tools and event + handlers — including code that runs inside Temporal activities. """ task_id: str - # When set, the event handler nests per-tool-call spans under this - # span. Typically the ID of the per-turn span opened by the workflow. + # When set, the event handler nests per-tool-call spans under this span. + # Typically the ID of the per-turn span opened by the workflow. parent_span_id: str | None = None @@ -77,32 +79,33 @@ async def event_handler( run_context: RunContext[TaskDeps], events: AsyncIterable[AgentStreamEvent], ) -> None: - """Stream Pydantic AI events to Agentex via Redis from inside the model activity. + """Stream Pydantic AI events to Agentex via the unified surface. Pydantic AI calls this with the live event stream as soon as the model - activity begins emitting parts. Because the handler runs inside the - activity (not the workflow), it can freely make non-deterministic - Redis writes — including the tracing HTTP calls that record per-tool-call - spans under the workflow's per-turn span (when ``parent_span_id`` is set). + activity begins emitting parts. Because the handler runs inside the activity + (not the workflow), it can freely make non-deterministic Redis + tracing + writes. + + The UnifiedEmitter is constructed from ``deps`` (task_id + parent_span_id), + so tool spans nest under the workflow's per-turn span and messages auto-send + to the task stream. The auto_send path delivers streamed tool requests + natively, so no coalescing workaround is needed. """ - tracing_handler = create_pydantic_ai_tracing_handler( + emitter = UnifiedEmitter( + task_id=run_context.deps.task_id, trace_id=run_context.deps.task_id, parent_span_id=run_context.deps.parent_span_id, - task_id=run_context.deps.task_id, - ) - await stream_pydantic_ai_events( - events, - run_context.deps.task_id, - tracing_handler=tracing_handler, ) + turn = PydanticAITurn(events, model=MODEL_NAME) + await emitter.auto_send_turn(turn) -# Construct the durable agent at module load time so that the -# PydanticAIPlugin can auto-discover its activities via the workflow's -# ``__pydantic_ai_agents__`` attribute. +# Construct the durable agent at module load time so that the PydanticAIPlugin +# can auto-discover its activities via the workflow's ``__pydantic_ai_agents__`` +# attribute. base_agent = _build_base_agent() temporal_agent: TemporalAgent[TaskDeps, str] = TemporalAgent( base_agent, - name="at110_pydantic_ai_agent", + name="pydantic_ai_agent", event_stream_handler=event_handler, ) diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/run_worker.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/run_worker.py index e54c9d1dc..4b4d43d19 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/run_worker.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/run_worker.py @@ -1,18 +1,18 @@ -"""Temporal worker for the Pydantic AI tutorial. +"""Temporal worker for the harness Pydantic AI test agent. -Run as a separate long-lived process alongside the ACP HTTP server. The -worker polls Temporal for workflow + activity tasks and executes them. +Run as a separate long-lived process alongside the ACP HTTP server. The worker +polls Temporal for workflow + activity tasks and executes them. -The ``PydanticAIPlugin`` reads ``__pydantic_ai_agents__`` off the workflow -class and registers every model/tool activity the TemporalAgent needs — -so we don't have to enumerate activities by hand here. +The ``PydanticAIPlugin`` reads ``__pydantic_ai_agents__`` off the workflow class +and registers every model/tool activity the TemporalAgent needs — so we don't +have to enumerate activities by hand here. """ import asyncio from pydantic_ai.durable_exec.temporal import PydanticAIPlugin -from project.workflow import At110PydanticAiWorkflow +from project.workflow import HarnessPydanticAiWorkflow from agentex.lib.utils.debug import setup_debug_if_enabled from agentex.lib.utils.logging import make_logger from agentex.lib.environment_variables import EnvironmentVariables @@ -31,8 +31,8 @@ async def main(): raise ValueError("WORKFLOW_TASK_QUEUE is not set") # get_all_activities() returns the built-in Agentex activities (state, - # messages, streaming, tracing). Pydantic AI's TemporalAgent activities - # are auto-registered by PydanticAIPlugin via __pydantic_ai_agents__. + # messages, streaming, tracing). Pydantic AI's TemporalAgent activities are + # auto-registered by PydanticAIPlugin via __pydantic_ai_agents__. worker = AgentexWorker( task_queue=task_queue_name, plugins=[PydanticAIPlugin()], @@ -40,7 +40,7 @@ async def main(): await worker.run( activities=get_all_activities(), - workflow=At110PydanticAiWorkflow, + workflow=HarnessPydanticAiWorkflow, ) diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/tools.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/tools.py index 75640fcb7..bbd6c5200 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/tools.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/tools.py @@ -1,9 +1,8 @@ -"""Tool definitions for the Temporal Pydantic AI agent. +"""Tool definitions for the Temporal harness Pydantic AI agent. These functions are registered on the base Pydantic AI agent. When the agent is wrapped in ``TemporalAgent``, each tool call becomes its own Temporal -activity automatically — independently retryable and observable in the -Temporal UI. +activity automatically — independently retryable and observable. Tools must be ``async`` because Pydantic AI's Temporal integration requires it: non-async tools would run in threads, which is non-deterministic and diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/workflow.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/workflow.py index bb07ac818..9a01be7de 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/workflow.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/project/workflow.py @@ -1,16 +1,16 @@ -"""Temporal workflow for the Pydantic AI tutorial. +"""Temporal workflow for the harness Pydantic AI test agent. The workflow holds task state durably across crashes. Its signal handler -delegates the actual agent run to ``temporal_agent.run(...)`` — which -internally schedules model and tool activities, each independently -durable. The ``event_stream_handler`` registered on ``temporal_agent`` -pushes streaming deltas to Redis while the model activity runs. +delegates the actual agent run to ``temporal_agent.run(...)`` — which internally +schedules model and tool activities, each independently durable. The +``event_stream_handler`` registered on ``temporal_agent`` (see project.agent) +pushes streaming deltas through the unified harness surface while the model +activity runs. Multi-turn memory is kept on the workflow instance itself -(``self._message_history``). Temporal's workflow state is already durable -and replay-safe, so unlike the async-base tutorial we don't need an -external ``adk.state`` round-trip — the message list survives crashes -because Temporal replays activity results that produced it. +(``self._message_history``). Temporal's workflow state is already durable and +replay-safe, so unlike the async-base agent we don't need an external +``adk.state`` round-trip. """ from __future__ import annotations @@ -56,14 +56,14 @@ @workflow.defn(name=environment_variables.WORKFLOW_NAME) -class At110PydanticAiWorkflow(BaseWorkflow): +class HarnessPydanticAiWorkflow(BaseWorkflow): """Long-running Temporal workflow that delegates each turn to a Pydantic AI TemporalAgent. The ``__pydantic_ai_agents__`` attribute is the marker the ``PydanticAIPlugin`` looks for at worker startup: it pulls - ``temporal_agent.temporal_activities`` off this list and registers them - on the worker automatically — so we don't have to list activities by - hand in ``run_worker.py``. + ``temporal_agent.temporal_activities`` off this list and registers them on + the worker automatically — so we don't have to list activities by hand in + ``run_worker.py``. """ __pydantic_ai_agents__ = [temporal_agent] @@ -74,8 +74,8 @@ def __init__(self): self._turn_number = 0 # Conversation history accumulated across turns. Each entry is a # pydantic-ai ``ModelMessage``. Temporal replays the activity that - # produced these messages, so the list is rebuilt deterministically - # if the workflow ever recovers from a crash. + # produced these messages, so the list is rebuilt deterministically if + # the workflow ever recovers from a crash. self._message_history: list["ModelMessage"] = [] @workflow.signal(name=SignalName.RECEIVE_EVENT) @@ -93,17 +93,10 @@ async def on_task_event_send(self, params: SendEventParams) -> None: name=f"Turn {self._turn_number}", input={"message": params.event.content.content}, ) as span: - # temporal_agent.run() is the magic line. From the outside it - # looks like a regular async call. Internally it schedules: - # 1. A model activity (LLM HTTP call recorded by Temporal) - # 2. For each tool the model invokes, a tool activity - # 3. Each activity is retried, observable, and durable - # While the model activity runs, the event_stream_handler on - # temporal_agent pushes deltas to Redis so the UI sees tokens. - # - # Passing ``message_history`` makes the run remember prior turns: - # without it the agent would respond to each user message as if - # it had never seen the conversation before. + # temporal_agent.run() schedules a model activity, per-tool + # activities, and the event_stream_handler activity (which pushes + # deltas through the unified surface). Passing ``message_history`` + # makes the run remember prior turns. result = await temporal_agent.run( params.event.content.content, message_history=self._message_history, @@ -112,8 +105,8 @@ async def on_task_event_send(self, params: SendEventParams) -> None: parent_span_id=span.id if span else None, ), ) - # Persist the new full history (user + assistant + any tool - # rounds) so the next turn picks up from here. + # Persist the new full history (user + assistant + any tool rounds) + # so the next turn picks up from here. self._message_history = list(result.all_messages()) if span: span.output = {"final_output": result.output} diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/pyproject.toml b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/pyproject.toml index 9f47733c0..2f308f2a1 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/pyproject.toml +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "at110-pydantic-ai" version = "0.1.0" -description = "A Temporal-backed Pydantic AI agent with tool calling and Redis streaming" +description = "A Temporal-backed Pydantic AI harness test agent using the unified emitter surface" readme = "README.md" requires-python = ">=3.12" dependencies = [ diff --git a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py index d01276ab8..974cddcc0 100644 --- a/examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py +++ b/examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py @@ -1,9 +1,10 @@ -"""Tests for the Temporal Pydantic AI agent. +"""Live tests for the Temporal Pydantic AI agent. -This test suite validates: -- The agent responds to a basic message -- Tool calls are visible in the message history (proving each tool call - ran as its own Temporal activity) +These tests require a running agent (Temporal + Redis + ACP server + worker) and +exercise the unified-surface event_stream_handler end-to-end over the wire. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in the SDK repo under ``tests/lib/core/harness/`` (the pydantic-ai temporal suite). To run these tests: 1. Make sure the agent is running (worker + ACP server) @@ -16,10 +17,7 @@ import pytest import pytest_asyncio -from test_utils.async_utils import ( - poll_messages, - send_event_and_poll_yielding, -) +from test_utils.async_utils import poll_messages, send_event_and_poll_yielding from agentex import AsyncAgentex from agentex.types.task_message import TaskMessage @@ -51,14 +49,12 @@ async def agent_id(client, agent_name): class TestNonStreamingEvents: - """Test that the Temporal-backed Pydantic AI agent responds and uses tools.""" + """Test that the Temporal-backed harness agent responds and uses tools.""" @pytest.mark.asyncio async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): """Drive a full turn: create task, send a weather question, verify tool round-trip.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None @@ -71,11 +67,7 @@ async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): sleep_interval=1.0, ): assert isinstance(message, TaskMessage) - if ( - message.content - and message.content.type == "text" - and message.content.author == "agent" - ): + if message.content and message.content.type == "text" and message.content.author == "agent": task_creation_found = True break assert task_creation_found, "Task creation welcome message not found" @@ -101,11 +93,7 @@ async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): if final_message and getattr(final_message, "streaming_status", None) == "DONE": break - if ( - message.content - and message.content.type == "text" - and message.content.author == "agent" - ): + if message.content and message.content.type == "text" and message.content.author == "agent": final_message = message content_length = len(getattr(message.content, "content", "") or "") if message.streaming_status == "DONE" and content_length > 0: @@ -115,9 +103,7 @@ async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): assert seen_tool_request, "Expected a tool_request (agent calling get_weather)" assert seen_tool_response, "Expected a tool_response (get_weather result)" assert final_message is not None, "Expected a final agent text message" - final_text = ( - getattr(final_message.content, "content", None) if final_message.content else None - ) + final_text = getattr(final_message.content, "content", None) if final_message.content else None assert isinstance(final_text, str) and len(final_text) > 0 # The get_weather tool always returns "72°F" — the response should mention it. assert "72" in final_text, "Expected weather response to mention 72°F" diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/.dockerignore b/examples/tutorials/10_async/10_temporal/120_openai_agents/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/Dockerfile b/examples/tutorials/10_async/10_temporal/120_openai_agents/Dockerfile new file mode 100644 index 000000000..700f56cea --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/120_openai_agents/pyproject.toml /app/120_openai_agents/pyproject.toml +COPY 10_async/10_temporal/120_openai_agents/README.md /app/120_openai_agents/README.md + +WORKDIR /app/120_openai_agents + +COPY 10_async/10_temporal/120_openai_agents/project /app/120_openai_agents/project +COPY 10_async/10_temporal/120_openai_agents/tests /app/120_openai_agents/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at120-openai-agents + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When we deploy the worker, we will replace the CMD with the following +# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/README.md b/examples/tutorials/10_async/10_temporal/120_openai_agents/README.md new file mode 100644 index 000000000..4db26d0a1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/README.md @@ -0,0 +1,41 @@ +# Temporal OpenAI Agents on the unified harness surface + +A Temporal-backed Agentex agent that runs the OpenAI Agents SDK and delivers its +output through the **unified harness surface**. + +## What this demonstrates + +LLM calls are non-deterministic, so they can't run directly in a Temporal +workflow. This tutorial keeps the workflow (`project/workflow.py`) +deterministic and delegates each turn to a custom activity +(`project/activities.py`). The activity uses the SAME `OpenAITurn` adapter as +the sync (`050_openai_agents`) and async (`120_openai_agents`) variants, and +delivers via `UnifiedEmitter.auto_send_turn` — which is designed to run inside +an activity (it writes streaming side effects to Redis and returns the final +text + usage). + +```python +# inside the activity: +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) +turn_result = await emitter.auto_send_turn(turn) +return turn_result.final_text +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +This starts both the ACP HTTP server and the Temporal worker. + +## Test it + +The offline test exercises the activity's delivery path with an injected fake +streaming backend (no server, Temporal, Redis, or API key required): + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/environments.yaml b/examples/tutorials/10_async/10_temporal/120_openai_agents/environments.yaml similarity index 100% rename from examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/environments.yaml rename to examples/tutorials/10_async/10_temporal/120_openai_agents/environments.yaml diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/manifest.yaml b/examples/tutorials/10_async/10_temporal/120_openai_agents/manifest.yaml new file mode 100644 index 000000000..4b59db442 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/120_openai_agents + - test_utils + dockerfile: 10_async/10_temporal/120_openai_agents/Dockerfile + dockerignore: 10_async/10_temporal/120_openai_agents/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at120-openai-agents + description: A Temporal-backed OpenAI Agents SDK agent on the unified harness surface + + temporal: + enabled: true + workflows: + - name: at120-openai-agents + queue_name: at120_openai_agents_queue + + credentials: + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at120-openai-agents" + description: "A Temporal-backed OpenAI Agents SDK agent on the unified harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/__init__.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/acp.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/acp.py new file mode 100644 index 000000000..6076835ba --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/acp.py @@ -0,0 +1,33 @@ +"""ACP server for the Temporal OpenAI Agents harness tutorial. + +Thin by design: with ``acp_type="async"`` + ``TemporalACPConfig``, FastACP +auto-wires task/create, task/event/send, and task/cancel onto the workflow. +The agent logic lives in ``project/workflow.py`` (deterministic) and +``project/activities.py`` (the harness-backed LLM run), executed by the worker +in ``project/run_worker.py``. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client +# compatibility, so the same example works behind the Scale LiteLLM gateway. +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/activities.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/activities.py new file mode 100644 index 000000000..72c92d617 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/activities.py @@ -0,0 +1,80 @@ +"""Custom Temporal activity that runs the OpenAI agent on the harness surface. + +LLM calls are non-deterministic, so they must run inside a Temporal activity +rather than directly in the workflow. This activity runs the OpenAI Agents SDK +via ``Runner.run_streamed``, wraps the result in an ``OpenAITurn``, and pushes +the canonical stream to the task stream via ``UnifiedEmitter.auto_send_turn``. + +``auto_send`` (which backs ``auto_send_turn``) is explicitly designed to be +called from inside an activity: it writes streaming side effects to Redis and +returns the accumulated final text + normalized usage. +""" + +from __future__ import annotations + +from typing import Any +from datetime import datetime + +from agents import Runner +from pydantic import BaseModel +from temporalio import activity + +from project.agent import MODEL_NAME, create_agent +from agentex.lib.utils.logging import make_logger +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + +logger = make_logger(__name__) + +RUN_AGENT_ACTIVITY = "run_openai_agent" + + +class RunHarnessAgentParams(BaseModel): + """Parameters for the harness agent activity.""" + + task_id: str + user_message: str + # Prior conversation as OpenAI Agents SDK input items, so the agent sees the + # full history (not just the latest message) on every turn. + input_list: list[Any] = [] + trace_id: str | None = None + parent_span_id: str | None = None + # Deterministic turn timestamp from workflow.now(); forwarded to + # auto_send_turn so retried activities re-emit messages with stable + # timestamps instead of new server-side ones (which could reorder turns). + created_at: datetime | None = None + + +class RunHarnessAgentResult(BaseModel): + """Result of one harness turn.""" + + final_text: str + # Updated conversation (prior history + this turn) to carry into the next turn. + input_list: list[Any] + + +class HarnessActivities: + """Hosts the harness-backed OpenAI agent activity.""" + + @activity.defn(name=RUN_AGENT_ACTIVITY) + async def run_openai_agent(self, params: RunHarnessAgentParams) -> RunHarnessAgentResult: + """Run the agent for one turn and auto-send its output. + + Threads the running conversation through ``input_list`` so multi-turn + chats retain memory: prior history + the new user message go in, and the + updated conversation comes back out via ``result.to_input_list()``. + """ + logger.info(f"Running harness OpenAI agent for task {params.task_id}") + + agent = create_agent() + input_list: list[Any] = [*params.input_list, {"role": "user", "content": params.user_message}] + result = Runner.run_streamed(starting_agent=agent, input=input_list) + turn = OpenAITurn(result=result, model=MODEL_NAME) + emitter = UnifiedEmitter( + task_id=params.task_id, + trace_id=params.trace_id, + parent_span_id=params.parent_span_id, + ) + turn_result = await emitter.auto_send_turn(turn, created_at=params.created_at) + # to_input_list() is valid now: auto_send_turn has exhausted the stream. + return RunHarnessAgentResult(final_text=turn_result.final_text, input_list=result.to_input_list()) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/agent.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/agent.py new file mode 100644 index 000000000..385a80b69 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/agent.py @@ -0,0 +1,44 @@ +"""OpenAI Agents SDK agent definition for the Temporal harness tutorial. + +Same agent shape as the sync (060) and async (130) variants. Here the agent is +built and run inside a Temporal activity (see ``project.activities``); the +workflow stays deterministic and delegates the non-deterministic LLM run to that +activity, which delivers the turn via the unified harness surface. +""" + +from __future__ import annotations + +from datetime import datetime + +from agents import Agent, function_tool, set_tracing_disabled + +from project.tools import get_weather + +set_tracing_disabled(True) + +MODEL_NAME = "gpt-4o" +INSTRUCTIONS = """You are a helpful AI assistant with access to tools. + +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use the weather tool when the user asks about the weather +- Always report the real tool output back to the user +""" + + +@function_tool +def weather(city: str) -> str: + """Get the current weather for a city.""" + return get_weather(city) + + +def create_agent() -> Agent: + """Build and return the OpenAI Agents SDK agent with the weather tool.""" + return Agent( + name="Harness OpenAI Assistant", + model=MODEL_NAME, + instructions=INSTRUCTIONS.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + tools=[weather], + ) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/run_worker.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/run_worker.py new file mode 100644 index 000000000..b82ee0f50 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/run_worker.py @@ -0,0 +1,44 @@ +"""Temporal worker for the OpenAI Agents harness tutorial. + +Runs as a separate long-lived process alongside the ACP HTTP server. Registers +the built-in Agentex activities plus the custom harness agent activity +(``HarnessActivities.run_openai_agent``), and the workflow. +""" + +import asyncio + +from project.workflow import At140HarnessOpenaiWorkflow +from project.activities import HarnessActivities +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + harness_activities = HarnessActivities() + all_activities = [ + harness_activities.run_openai_agent, + *get_all_activities(), + ] + + worker = AgentexWorker(task_queue=task_queue_name) + + await worker.run( + activities=all_activities, + workflow=At140HarnessOpenaiWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/tools.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/tools.py new file mode 100644 index 000000000..d26f9b097 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/tools.py @@ -0,0 +1,15 @@ +"""Tool definitions for the Temporal OpenAI Agents harness tutorial.""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/project/workflow.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/workflow.py new file mode 100644 index 000000000..5cb8fb38b --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/project/workflow.py @@ -0,0 +1,124 @@ +"""Temporal workflow for the OpenAI Agents harness tutorial. + +The workflow stays deterministic: it echoes the user message and delegates the +non-deterministic LLM run to ``run_openai_agent`` (see +``project.activities``). That activity runs the OpenAI Agents SDK and delivers +the turn through the unified harness surface (``OpenAITurn`` + +``UnifiedEmitter.auto_send_turn``). +""" + +from __future__ import annotations + +import os +import json +from datetime import timedelta + +from temporalio import workflow +from temporalio.common import RetryPolicy + +from agentex.lib import adk +from project.activities import ( + RUN_AGENT_ACTIVITY, + RunHarnessAgentParams, + RunHarnessAgentResult, +) +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class At140HarnessOpenaiWorkflow(BaseWorkflow): + """Long-running workflow that runs each turn through the harness activity.""" + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + # Running conversation (OpenAI Agents SDK input items) so each turn sees + # the full history, not just the latest user message. + self._messages: list = [] + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a user message: echo it, then run the harness activity durably.""" + logger.info(f"Received task event: {params.task.id}") + self._turn_number += 1 + + # Echo the user's message so it shows up in the UI as a chat bubble. + await adk.messages.create(task_id=params.task.id, content=params.event.content) + + async with adk.tracing.span( + trace_id=params.task.id, + task_id=params.task.id, + name=f"Turn {self._turn_number}", + input={"message": params.event.content.content}, + ) as span: + turn_result = await workflow.execute_activity( + RUN_AGENT_ACTIVITY, + RunHarnessAgentParams( + task_id=params.task.id, + user_message=params.event.content.content, + input_list=self._messages, + trace_id=params.task.id, + parent_span_id=span.id if span else None, + # Deterministic timestamp under replay so a retried activity + # re-emits this turn's messages with stable ordering. + created_at=workflow.now(), + ), + start_to_close_timeout=timedelta(minutes=5), + retry_policy=RetryPolicy(maximum_attempts=3), + result_type=RunHarnessAgentResult, + ) + # Carry the updated conversation into the next turn. + self._messages = turn_result.input_list + if span: + span.output = {"final_output": turn_result.final_text} + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + """Workflow entry point — keep the conversation alive for incoming signals.""" + logger.info(f"Task created: {params.task.id}") + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n" + f"Send me a message and I'll respond using an OpenAI Agents SDK agent " + f"delivered through the unified harness surface." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + """Graceful workflow shutdown signal.""" + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/pyproject.toml b/examples/tutorials/10_async/10_temporal/120_openai_agents/pyproject.toml similarity index 72% rename from examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/pyproject.toml rename to examples/tutorials/10_async/10_temporal/120_openai_agents/pyproject.toml index 696894e32..e6c77fae3 100644 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/pyproject.toml +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/pyproject.toml @@ -3,21 +3,23 @@ requires = ["hatchling"] build-backend = "hatchling.build" [project] -name = "at120_openai_agents_local_sandbox" +name = "at120-openai-agents" version = "0.1.0" -description = "A Temporal OpenAI Agents SDK agent using a local (unix_local) sandbox" +description = "A Temporal-backed OpenAI Agents SDK agent on the unified harness surface" +readme = "README.md" requires-python = ">=3.12" dependencies = [ - "agentex-sdk>=0.6.0", - "openai-agents>=0.14.3,<0.15", - "temporalio>=1.18.2", + "agentex-sdk", "scale-gp", + "temporalio>=1.18.2", + "openai-agents", ] [project.optional-dependencies] dev = [ "pytest", "pytest-asyncio", + "httpx", "black", "isort", "flake8", diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/120_openai_agents/tests/test_agent.py new file mode 100644 index 000000000..dd043c44c --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/120_openai_agents/tests/test_agent.py @@ -0,0 +1,77 @@ +"""Offline test for the Temporal OpenAI Agents harness tutorial. + +This test does NOT require a running Agentex server, Temporal, Redis, or an +OpenAI API key. It verifies the delivery path the harness activity uses: an +``OpenAITurn`` built from an injected canonical stream, pushed through +``UnifiedEmitter.auto_send_turn`` with an injected fake streaming backend, +returns the accumulated final text (which the activity returns to the workflow). + +To run: ``pytest tests/test_agent.py -v`` +""" + +from __future__ import annotations + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + +class _FakeCtx: + def __init__(self, initial_content): + self.task_message = TaskMessage(id="m-1", task_id="task-1", content=initial_content) + + async def __aenter__(self): + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + pass + + async def stream_update(self, update): + return update + + +class _FakeStreaming: + def streaming_task_message_context(self, task_id, initial_content, **_kwargs): # noqa: ARG002 + return _FakeCtx(initial_content) + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_activity_delivery_returns_final_text(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="72")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="F")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=_FakeStreaming(), + ) + + result = await emitter.auto_send_turn(turn) + assert result.final_text == "72F" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/Dockerfile b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/Dockerfile deleted file mode 100644 index d4927d0ce..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/Dockerfile +++ /dev/null @@ -1,62 +0,0 @@ -# syntax=docker/dockerfile:1.3 -FROM python:3.12-slim -COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - htop \ - vim \ - curl \ - tar \ - python3-dev \ - postgresql-client \ - build-essential \ - libpq-dev \ - gcc \ - cmake \ - netcat-openbsd \ - nodejs \ - npm \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/** - -# Install tctl (Temporal CLI) -RUN curl -L https://github.com/temporalio/tctl/releases/download/v1.18.1/tctl_1.18.1_linux_arm64.tar.gz -o /tmp/tctl.tar.gz && \ - tar -xzf /tmp/tctl.tar.gz -C /usr/local/bin && \ - chmod +x /usr/local/bin/tctl && \ - rm /tmp/tctl.tar.gz - -RUN uv pip install --system --upgrade pip setuptools wheel - -ENV UV_HTTP_TIMEOUT=1000 - -# Copy pyproject.toml and README.md to install dependencies -COPY 10_async/10_temporal/120_openai_agents_local_sandbox/pyproject.toml /app/120_openai_agents_local_sandbox/pyproject.toml -COPY 10_async/10_temporal/120_openai_agents_local_sandbox/README.md /app/120_openai_agents_local_sandbox/README.md - -WORKDIR /app/120_openai_agents_local_sandbox - -# Copy the project code -COPY 10_async/10_temporal/120_openai_agents_local_sandbox/project /app/120_openai_agents_local_sandbox/project - -# Copy the test files -COPY 10_async/10_temporal/120_openai_agents_local_sandbox/tests /app/120_openai_agents_local_sandbox/tests - -# Copy shared test utilities -COPY test_utils /app/test_utils - -# Install the required Python packages with dev dependencies -RUN uv pip install --system .[dev] - -WORKDIR /app/120_openai_agents_local_sandbox - -ENV PYTHONPATH=/app - -# Set test environment variables -ENV AGENT_NAME=at120-openai-agents-local-sandbox - -# Run the ACP server using uvicorn -CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] - -# When we deploy the worker, we will replace the CMD with the following -# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/README.md b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/README.md deleted file mode 100644 index 161bc43da..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/README.md +++ /dev/null @@ -1,130 +0,0 @@ -# Tutorial 120: Temporal OpenAI Agents SDK with a Local Sandbox - -This tutorial demonstrates running an [OpenAI Agents SDK](https://developers.openai.com/api/docs/guides/agents) -`SandboxAgent` inside a **Temporal** workflow, backed by the **local** -(`unix_local`) sandbox. - -The agent is a "local sandbox assistant": it answers questions by actually running -real shell commands (e.g. `python3 --version`, `ls`, `python3 -c "..."`) instead of -guessing. Because it runs inside Temporal, the sandbox tool calls become durable, -retried, and observable activities. - -This mirrors the canonical OpenAI Agents SDK Temporal example -(`060_open_ai_agents_sdk_hello_world`) and the tools example -(`070_open_ai_agents_sdk_tools`). The new piece is the **Temporal sandbox bridge**. - -## Key Concepts - -### Temporal ACP -The Temporal ACP model (`acp_type: async`, `temporal.enabled: true`) maps task -lifecycle to a Temporal workflow: -- `@workflow.run` (`on_task_create`) keeps the conversation alive. -- `@workflow.signal(name=SignalName.RECEIVE_EVENT)` (`on_task_event_send`) handles - each user message. - -No ACP handlers are registered by hand — the `TemporalACPConfig` wires them to the -workflow automatically. - -### Streaming (Interceptor + Model Provider + Hooks) -Real-time streaming uses STANDARD Temporal components — no forked plugin: -- **`ContextInterceptor`** threads `task_id` through activity headers. The workflow - sets `self._task_id` so the interceptor can read it. -- **`TemporalStreamingModelProvider`** returns a model that streams tokens to Redis - in real time while still returning the complete response to Temporal for - determinism / replay safety. -- **`TemporalStreamingHooks`** creates the lifecycle messages (tool request / - response, etc.) in the database. - -The `stream_lifecycle_content` activity must be registered on the worker alongside -`get_all_activities()`. - -### The Temporal sandbox bridge (`UnixLocalSandboxClient`) -The sandbox client is registered ON THE WORKER (and the ACP) via the standard -plugin: - -```python -from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClient -from temporalio.contrib.openai_agents import OpenAIAgentsPlugin, SandboxClientProvider - -OpenAIAgentsPlugin( - model_provider=TemporalStreamingModelProvider(), - sandbox_clients=[SandboxClientProvider("local", UnixLocalSandboxClient())], -) -``` - -Inside the workflow, the run is pointed at that backend by name: - -```python -from temporalio.contrib.openai_agents.workflow import temporal_sandbox_client -from agents.sandbox import SandboxAgent, SandboxRunConfig -from agents.run_config import RunConfig -from agents.sandbox.snapshot import NoopSnapshotSpec -from agents.sandbox.capabilities import Shell -from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClientOptions - -agent = SandboxAgent( - name="Local Sandbox Assistant", - model="gpt-4o-mini", - instructions="...use the shell tools to actually run commands...", - capabilities=[Shell()], -) -run_config = RunConfig( - sandbox=SandboxRunConfig( - client=temporal_sandbox_client("local"), - options=UnixLocalSandboxClientOptions(), - snapshot=NoopSnapshotSpec(), # skip the per-turn workspace snapshot - ) -) -result = await Runner.run( - agent, self._state.input_list, run_config=run_config, - hooks=TemporalStreamingHooks(task_id=params.task.id), -) -``` - -`temporal_sandbox_client("local")` resolves the worker-registered client, so the -sandbox shell tool calls run as Temporal activities (durable + observable in the -Temporal UI). - -## Two important lessons - -1. **Don't double-post the assistant message.** The `TemporalStreamingModelProvider` - already streams AND persists the assistant's response. If you also call - `adk.messages.create(...)` after `Runner.run`, the answer shows up twice. We only - persist conversation state for the next turn via `result.to_input_list()`. -2. **Use `NoopSnapshotSpec()`.** Without it, the sandbox tries to take a per-turn - workspace snapshot, and stopping the sandbox can raise - `WorkspaceArchiveReadError`. `NoopSnapshotSpec()` skips that snapshot. - -## Files - -| File | Description | -|------|-------------| -| `project/acp.py` | Temporal ACP server (plugin + sandbox client + interceptor) | -| `project/run_worker.py` | Temporal worker (registers workflow, activities, plugin, sandbox client) | -| `project/workflow.py` | `BaseWorkflow` that runs the `SandboxAgent` against the local sandbox | -| `tests/test_agent.py` | Integration tests (polling pattern) | -| `manifest.yaml` | Agent configuration (temporal enabled) | -| `environments.yaml` | Per-environment deployment overrides | - -## Running Locally - -```bash -# From this directory -agentex agents run -``` - -Set `OPENAI_API_KEY` (or `LITELLM_API_KEY` if you're behind the Scale LiteLLM -gateway) in your environment or in a `.env` file in `project/` so the agent can call -the model. - -## Running Tests - -```bash -pytest tests/test_agent.py -v -``` - -## Further Reading - -- OpenAI Agents SDK guide: https://developers.openai.com/api/docs/guides/agents -- The async (non-Temporal) variant: `10_async/00_base/120_openai_agents_local_sandbox` -- The canonical OpenAI Agents SDK Temporal example: `10_async/10_temporal/060_open_ai_agents_sdk_hello_world` diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/manifest.yaml b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/manifest.yaml deleted file mode 100644 index 86ac89288..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/manifest.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# Agent Manifest Configuration -# --------------------------- -# This file defines how your agent should be built and deployed. - -# Build Configuration -# ------------------ -build: - context: - # Root directory for the build context - root: ../../../ # Up to tutorials level to include test_utils - - # Paths to include in the Docker build context - include_paths: - - 10_async/10_temporal/120_openai_agents_local_sandbox - - test_utils - - # Path to your agent's Dockerfile (relative to the root directory) - dockerfile: 10_async/10_temporal/120_openai_agents_local_sandbox/Dockerfile - - # Path to your agent's .dockerignore - dockerignore: 10_async/10_temporal/120_openai_agents_local_sandbox/.dockerignore - - -# Local Development Configuration -# ----------------------------- -local_development: - agent: - port: 8000 # Port where your local ACP server is running - host_address: host.docker.internal # Host address for Docker networking - - # File paths for local development (relative to this manifest.yaml) - paths: - # Path to ACP server file - acp: project/acp.py - # Path to temporal worker file - worker: project/run_worker.py - - -# Agent Configuration -# ----------------- -agent: - # Type of agent - either sync or async - acp_type: async - - # Unique name for your agent - name: at120-openai-agents-local-sandbox - - # Description of what your agent does - description: A Temporal OpenAI Agents SDK agent using a local (unix_local) sandbox - - # Temporal workflow configuration - temporal: - enabled: true - workflows: - # Name of the workflow class (must match the @workflow.defn name in workflow.py) - - name: at120-openai-agents-local-sandbox - - # Queue name for task distribution - queue_name: at120_openai_agents_local_sandbox_queue - - # Credentials mapping (maps Kubernetes secrets to environment variables) - credentials: - - env_var_name: OPENAI_API_KEY - secret_name: openai-api-key - secret_key: api-key - - env_var_name: REDIS_URL - secret_name: redis-url-secret - secret_key: url - - env_var_name: SGP_API_KEY - secret_name: sgp-api-key - secret_key: api-key - - env_var_name: SGP_ACCOUNT_ID - secret_name: sgp-account-id - secret_key: account-id - - env_var_name: SGP_CLIENT_BASE_URL - secret_name: sgp-client-base-url - secret_key: url - - # Environment variables for running locally and for deployment - env: - OPENAI_AGENTS_DISABLE_TRACING: "1" - - -# Deployment Configuration -# ----------------------- -deployment: - # Container image configuration - image: - repository: "" # Update with your container registry - tag: "latest" # Default tag, should be versioned in production - - imagePullSecrets: - - name: my-registry-secret # Update with your image pull secret name - - # Global deployment settings that apply to all clusters - global: - agent: - name: "at120-openai-agents-local-sandbox" - description: "A Temporal OpenAI Agents SDK agent using a local (unix_local) sandbox" - - # Default replica count - replicaCount: 1 - - # Default resource requirements - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/acp.py b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/acp.py deleted file mode 100644 index 196e1e7cd..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/acp.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import sys - -from temporalio.contrib.openai_agents import ( - OpenAIAgentsPlugin, - SandboxClientProvider, -) -from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClient - -# === DEBUG SETUP (AgentEx CLI Debug Support) === -if os.getenv("AGENTEX_DEBUG_ENABLED") == "true": - try: - import debugpy - debug_port = int(os.getenv("AGENTEX_DEBUG_PORT", "5679")) - debug_type = os.getenv("AGENTEX_DEBUG_TYPE", "acp") - wait_for_attach = os.getenv("AGENTEX_DEBUG_WAIT_FOR_ATTACH", "false").lower() == "true" - - # Configure debugpy - debugpy.configure(subProcess=False) - debugpy.listen(debug_port) - - print(f"🐛 [{debug_type.upper()}] Debug server listening on port {debug_port}") - - if wait_for_attach: - print(f"⏳ [{debug_type.upper()}] Waiting for debugger to attach...") - debugpy.wait_for_client() - print(f"✅ [{debug_type.upper()}] Debugger attached!") - else: - print(f"📡 [{debug_type.upper()}] Ready for debugger attachment") - - except ImportError: - print("❌ debugpy not available. Install with: pip install debugpy") - sys.exit(1) - except Exception as e: - print(f"❌ Debug setup failed: {e}") - sys.exit(1) -# === END DEBUG SETUP === - -from agentex.lib.types.fastacp import TemporalACPConfig -from agentex.lib.sdk.fastacp.fastacp import FastACP -from agentex.lib.core.temporal.plugins.openai_agents.models.temporal_streaming_model import ( - TemporalStreamingModelProvider, -) -from agentex.lib.core.temporal.plugins.openai_agents.interceptors.context_interceptor import ( - ContextInterceptor, -) - -context_interceptor = ContextInterceptor() -temporal_streaming_model_provider = TemporalStreamingModelProvider() - -# Create the ACP server. We register the STANDARD OpenAIAgentsPlugin with: -# - the streaming model provider (real-time token streaming + persistence) -# - the LOCAL sandbox backend, registered under the name "local" so the -# workflow can resolve it via ``temporal_sandbox_client("local")`` -# plus the ContextInterceptor that threads task_id through activity headers. -acp = FastACP.create( - acp_type="async", - config=TemporalACPConfig( - # When deployed to the cluster, the Temporal address is set automatically. - # For local development, we set the address manually to talk to the local - # Temporal service set up via docker compose. - type="temporal", - temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), - plugins=[ - OpenAIAgentsPlugin( - model_provider=temporal_streaming_model_provider, - sandbox_clients=[ - SandboxClientProvider("local", UnixLocalSandboxClient()), - ], - ) - ], - interceptors=[context_interceptor], - ), -) - - -# Notice that we don't need to register any handlers when we use type="temporal". -# These handlers are automatically registered when the ACP is created: -# -# @acp.on_task_create -> the workflow method decorated with @workflow.run -# @acp.on_task_event_send -> the workflow method decorated with -# @workflow.signal(name=SignalName.RECEIVE_EVENT) -# @acp.on_task_cancel -> handled by the temporal client (cancels the workflow) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/run_worker.py b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/run_worker.py deleted file mode 100644 index a2b7bdf6b..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/run_worker.py +++ /dev/null @@ -1,80 +0,0 @@ -import asyncio - -from temporalio.contrib.openai_agents import ( - OpenAIAgentsPlugin, - SandboxClientProvider, -) -from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClient - -from project.workflow import At120OpenaiAgentsLocalSandboxWorkflow -from agentex.lib.utils.debug import setup_debug_if_enabled -from agentex.lib.utils.logging import make_logger -from agentex.lib.environment_variables import EnvironmentVariables -from agentex.lib.core.temporal.activities import get_all_activities -from agentex.lib.core.temporal.workers.worker import AgentexWorker -from agentex.lib.core.temporal.plugins.openai_agents.hooks.activities import ( - stream_lifecycle_content, -) -from agentex.lib.core.temporal.plugins.openai_agents.models.temporal_streaming_model import ( - TemporalStreamingModelProvider, -) -from agentex.lib.core.temporal.plugins.openai_agents.interceptors.context_interceptor import ( - ContextInterceptor, -) - -environment_variables = EnvironmentVariables.refresh() - -logger = make_logger(__name__) - - -async def main(): - # Setup debug mode if enabled - setup_debug_if_enabled() - - task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE - if task_queue_name is None: - raise ValueError("WORKFLOW_TASK_QUEUE is not set") - - # Register activities. ``stream_lifecycle_content`` powers the streaming - # lifecycle hooks; the rest are the standard AgentEx activities. - all_activities = get_all_activities() + [stream_lifecycle_content] - - # ============================================================================ - # STREAMING + SANDBOX SETUP - # ============================================================================ - # 1. ContextInterceptor threads task_id through activity headers so the - # streaming model + hooks know which task to stream/persist to. - # 2. TemporalStreamingModelProvider returns a model that streams tokens to - # Redis in real time while still returning the complete response to - # Temporal for determinism / replay safety. - # 3. SandboxClientProvider registers the LOCAL sandbox backend - # (UnixLocalSandboxClient) under the name "local". The workflow resolves - # it at run time via ``temporal_sandbox_client("local")``, so the sandbox - # tool calls run as durable Temporal activities. - # - # We use the STANDARD temporalio.contrib.openai_agents.OpenAIAgentsPlugin — - # no forked plugin needed. - context_interceptor = ContextInterceptor() - temporal_streaming_model_provider = TemporalStreamingModelProvider() - - worker = AgentexWorker( - task_queue=task_queue_name, - plugins=[ - OpenAIAgentsPlugin( - model_provider=temporal_streaming_model_provider, - sandbox_clients=[ - SandboxClientProvider("local", UnixLocalSandboxClient()), - ], - ) - ], - interceptors=[context_interceptor], - ) - - await worker.run( - activities=all_activities, - workflow=At120OpenaiAgentsLocalSandboxWorkflow, - ) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/workflow.py b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/workflow.py deleted file mode 100644 index 45b61b04e..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/project/workflow.py +++ /dev/null @@ -1,213 +0,0 @@ -"""OpenAI Agents SDK + Temporal: Local Sandbox Tutorial - -This tutorial demonstrates running an OpenAI Agents SDK ``SandboxAgent`` inside a -Temporal workflow, backed by the **local** (``unix_local``) sandbox. The agent is -a "local sandbox assistant": it answers questions by actually running real shell -commands (e.g. ``python3 --version``, ``ls``, ``python3 -c "..."``) instead of -guessing. - -KEY CONCEPTS DEMONSTRATED: -- A ``SandboxAgent`` granted the ``Shell`` capability inside a durable Temporal - workflow. -- The Temporal sandbox bridge: ``temporal_sandbox_client("local")`` resolves to - the ``UnixLocalSandboxClient`` registered on the worker via - ``SandboxClientProvider`` (see ``run_worker.py`` / ``acp.py``). The sandbox tool - calls run as Temporal activities, so they are durable, retried, and observable. -- Real-time streaming + persistence via ``TemporalStreamingModelProvider`` + - ``ContextInterceptor`` (configured on the worker) and ``TemporalStreamingHooks``. - -IMPORTANT LESSONS (applied below): - (a) Do NOT post the assistant message yourself with ``adk.messages.create`` - after ``Runner.run``. The ``TemporalStreamingModelProvider`` already streams - and persists the assistant's response — posting it again would duplicate the - answer in the UI. We only persist conversation state for the next turn via - ``result.to_input_list()``. - (b) Use ``NoopSnapshotSpec()`` so the per-turn workspace snapshot is skipped. - Without it, stopping the sandbox can raise ``WorkspaceArchiveReadError``. -""" - -from __future__ import annotations - -import os -import json - -from agents import Runner -from temporalio import workflow - -from agentex.lib import adk -from agentex.lib.types.acp import SendEventParams, CreateTaskParams -from agentex.lib.types.tracing import SGPTracingProcessorConfig -from agentex.lib.utils.logging import make_logger -from agentex.types.text_content import TextContent -from agentex.lib.utils.model_utils import BaseModel -from agentex.lib.environment_variables import EnvironmentVariables -from agentex.lib.core.temporal.types.workflow import SignalName -from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow -from agentex.lib.core.tracing.tracing_processor_manager import ( - add_tracing_processor_config, -) -from agentex.lib.core.temporal.plugins.openai_agents.hooks.hooks import ( - TemporalStreamingHooks, -) - -# OpenAI Agents SDK sandbox imports. These are safe to import at workflow module -# load time; the actual sandbox client is resolved at run time via -# ``temporal_sandbox_client`` (which maps to the worker-registered backend). -with workflow.unsafe.imports_passed_through(): - from agents.sandbox import SandboxAgent, SandboxRunConfig - from agents.run_config import RunConfig - from agents.sandbox.snapshot import NoopSnapshotSpec - from agents.sandbox.capabilities import Shell - from agents.sandbox.sandboxes.unix_local import UnixLocalSandboxClientOptions - from temporalio.contrib.openai_agents.workflow import temporal_sandbox_client - -# Configure tracing processor (optional - only if you have SGP credentials) -add_tracing_processor_config( - SGPTracingProcessorConfig( - sgp_api_key=os.environ.get("SGP_API_KEY", ""), - sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), - ) -) - -environment_variables = EnvironmentVariables.refresh() - -if environment_variables.WORKFLOW_NAME is None: - raise ValueError("Environment variable WORKFLOW_NAME is not set") - -if environment_variables.AGENT_NAME is None: - raise ValueError("Environment variable AGENT_NAME is not set") - -logger = make_logger(__name__) - -MODEL_NAME = "gpt-4o-mini" -INSTRUCTIONS = """You are a local sandbox assistant. - -You have access to shell tools that run real commands on the local machine. - -Guidelines: -- ALWAYS use the shell tools to actually run commands — never guess or make up - output. If the user asks for the Python version, run `python3 --version`. If - they ask to list files, run `ls`. If they ask you to compute something, use - `python3 -c "..."`. -- Run the minimal command(s) needed to answer the question. -- Report the real command output back to the user, concisely. -""" - - -class StateModel(BaseModel): - """State model for preserving conversation history across turns.""" - - input_list: list = [] - turn_number: int = 0 - - -@workflow.defn(name=environment_variables.WORKFLOW_NAME) -class At120OpenaiAgentsLocalSandboxWorkflow(BaseWorkflow): - """Long-running Temporal workflow that runs a SandboxAgent against the local sandbox.""" - - def __init__(self): - super().__init__(display_name=environment_variables.AGENT_NAME) - self._complete_task = False - self._state: StateModel | None = None - self._task_id = None - self._trace_id = None - self._parent_span_id = None - - @workflow.signal(name=SignalName.RECEIVE_EVENT) - async def on_task_event_send(self, params: SendEventParams) -> None: - logger.info(f"Received task event: {params.task.id}") - - if self._state is None: - raise ValueError("State is not initialized") - - self._state.turn_number += 1 - - # The ContextInterceptor reads ``self._task_id`` off the workflow - # instance and threads it through activity headers so the streaming - # model + hooks know which task to stream/persist to. - self._task_id = params.task.id - self._trace_id = params.task.id - - # Add the user message to conversation history. - self._state.input_list.append({"role": "user", "content": params.event.content.content}) - - # Echo back the client's message so it shows up in the UI. - await adk.messages.create(task_id=params.task.id, content=params.event.content) - - async with adk.tracing.span( - trace_id=params.task.id, - name=f"Turn {self._state.turn_number}", - input=self._state.model_dump(), - ) as span: - self._parent_span_id = span.id if span else None - - # Build the sandbox agent. The Shell capability becomes real shell - # tools backed by the sandbox client resolved at run time. - agent = SandboxAgent( - name="Local Sandbox Assistant", - model=MODEL_NAME, - instructions=INSTRUCTIONS, - capabilities=[Shell()], - ) - - # Point the run at the LOCAL sandbox backend registered on the worker - # under the name "local". ``temporal_sandbox_client`` resolves that - # registration so the sandbox tool calls execute as Temporal - # activities (durable + observable). - # - # IMPORTANT: ``NoopSnapshotSpec()`` skips the per-turn workspace - # snapshot — otherwise stopping the sandbox can raise - # ``WorkspaceArchiveReadError``. - run_config = RunConfig( - sandbox=SandboxRunConfig( - client=temporal_sandbox_client("local"), - options=UnixLocalSandboxClientOptions(), - snapshot=NoopSnapshotSpec(), - ) - ) - - # TemporalStreamingHooks creates the lifecycle messages (tool - # request/response, etc.) and works with the streaming model - # provider to stream tokens to the UI in real time. - result = await Runner.run( - agent, - self._state.input_list, - run_config=run_config, - hooks=TemporalStreamingHooks(task_id=params.task.id), - max_turns=10, - ) - - # IMPORTANT: We do NOT post the assistant message ourselves here. - # The TemporalStreamingModelProvider already streamed and persisted - # the assistant's response. We only persist conversation state for - # the next turn. - self._state.input_list = result.to_input_list() - - if span: - span.output = self._state.model_dump() - - @workflow.run - async def on_task_create(self, params: CreateTaskParams) -> str: - logger.info(f"Task created: {params.task.id}") - - self._state = StateModel(input_list=[], turn_number=0) - - await adk.messages.create( - task_id=params.task.id, - content=TextContent( - author="agent", - content=( - f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n" - f"Send me a message and I'll run real shell commands in a local " - f"sandbox (backed by Temporal) to answer." - ), - ), - ) - - await workflow.wait_condition(lambda: self._complete_task, timeout=None) - return "Task completed" - - @workflow.signal - async def complete_task_signal(self) -> None: - logger.info("Received complete_task signal") - self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/tests/test_agent.py deleted file mode 100644 index 5e161c061..000000000 --- a/examples/tutorials/10_async/10_temporal/120_openai_agents_local_sandbox/tests/test_agent.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for the Temporal OpenAI Agents SDK local-sandbox agent. - -This test suite validates that the agent actually runs shell commands in the -LOCAL sandbox (unix_local backend) via the Temporal sandbox bridge, by polling -for the agent's response: -- Ask for the Python version -> response contains "Python 3" -- Ask it to compute 21 * 2 with python3 -> response contains "42" - -To run these tests: -1. Make sure the agent is running (via docker-compose or `agentex agents run`) -2. Set the AGENTEX_API_BASE_URL environment variable if not using default -3. Run: pytest test_agent.py -v - -Configuration: -- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) -- AGENT_NAME: Name of the agent to test (default: at120-openai-agents-local-sandbox) -""" - -import os -import uuid - -import pytest -import pytest_asyncio -from test_utils.async_utils import ( - poll_messages, - send_event_and_poll_yielding, -) - -from agentex import AsyncAgentex -from agentex.types.task_message import TaskMessage -from agentex.types.agent_rpc_params import ParamsCreateTaskRequest - -# Configuration from environment variables -AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") -AGENT_NAME = os.environ.get("AGENT_NAME", "at120-openai-agents-local-sandbox") - - -@pytest_asyncio.fixture -async def client(): - """Create an AsyncAgentex client instance for testing.""" - client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) - yield client - await client.close() - - -@pytest.fixture -def agent_name(): - """Return the agent name for testing.""" - return AGENT_NAME - - -@pytest_asyncio.fixture -async def agent_id(client, agent_name): - """Retrieve the agent ID based on the agent name.""" - agents = await client.agents.list() - for agent in agents: - if agent.name == agent_name: - return agent.id - raise ValueError(f"Agent with name {agent_name} not found.") - - -async def _create_task_and_await_welcome(client: AsyncAgentex, agent_id: str) -> str: - """Create a task and wait for the workflow's welcome message; return the task id.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) - task = task_response.result - assert task is not None - - welcome_found = False - async for message in poll_messages( - client=client, - task_id=task.id, - timeout=30, - sleep_interval=1.0, - ): - assert isinstance(message, TaskMessage) - if message.content and message.content.type == "text" and message.content.author == "agent": - welcome_found = True - break - assert welcome_found, "Task creation (welcome) message not found" - return task.id - - -async def _send_and_collect_agent_text( - client: AsyncAgentex, agent_id: str, task_id: str, user_message: str -) -> str: - """Send a user message and accumulate the streamed agent text into a string.""" - final_message = None - async for message in send_event_and_poll_yielding( - client=client, - agent_id=agent_id, - task_id=task_id, - user_message=user_message, - timeout=60, - sleep_interval=1.0, - yield_updates=True, # Get updates as streaming writes chunks - ): - if message.content and message.content.type == "text" and message.content.author == "agent": - final_message = message - if message.streaming_status == "DONE": - break - - assert final_message is not None, "Should have received an agent text message" - assert final_message.content is not None, "Final message should have content" - return final_message.content.content or "" - - -class TestLocalSandboxEvents: - """Test the Temporal local-sandbox OpenAI Agents SDK agent.""" - - @pytest.mark.asyncio - async def test_shell_python_version(self, client: AsyncAgentex, agent_id: str): - """The agent should run `python3 --version` in the local sandbox. - - The sandbox runs on Python 3.12, so the real output contains "Python 3". - """ - task_id = await _create_task_and_await_welcome(client, agent_id) - text = await _send_and_collect_agent_text( - client, - agent_id, - task_id, - "Use your shell to print the Python version on this machine, then " - "tell me what it is.", - ) - assert text, "Expected a non-empty response from the sandbox agent." - assert "Python 3" in text - - @pytest.mark.asyncio - async def test_shell_compute(self, client: AsyncAgentex, agent_id: str): - """The agent should use python3 in the sandbox to compute 21 * 2 == 42.""" - task_id = await _create_task_and_await_welcome(client, agent_id) - text = await _send_and_collect_agent_text( - client, - agent_id, - task_id, - "Use python3 in your shell to compute 21 * 2 and tell me the result.", - ) - assert text, "Expected a non-empty response from the sandbox agent." - assert "42" in text - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/.dockerignore b/examples/tutorials/10_async/10_temporal/130_langgraph/.dockerignore index c4f7a8b4b..c49489471 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/.dockerignore +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/.dockerignore @@ -40,4 +40,4 @@ venv.bak/ .gitignore # Misc -.DS_Store \ No newline at end of file +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/.env.example b/examples/tutorials/10_async/10_temporal/130_langgraph/.env.example deleted file mode 100644 index ab1a5790f..000000000 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/.env.example +++ /dev/null @@ -1,13 +0,0 @@ -# at130-langgraph - Environment Variables -# Copy this file to .env and fill in the values - -# API key for your LLM provider -LITELLM_API_KEY= - -# LLM base URL (optional - override to use a different provider) -# OPENAI_BASE_URL= - -# SGP Configuration (optional - for tracing) -# SGP_API_KEY= -# SGP_ACCOUNT_ID= -# SGP_CLIENT_BASE_URL= \ No newline at end of file diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/README.md b/examples/tutorials/10_async/10_temporal/130_langgraph/README.md index 61ccaf66a..0820f56ab 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/README.md +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/README.md @@ -1,58 +1,49 @@ -# at130-langgraph — AgentEx Temporal + LangGraph +# Tutorial: Temporal LangGraph Agent -A minimal Temporal-backed [LangGraph](https://langchain-ai.github.io/langgraph/) -agent. It uses the official [`temporalio.contrib.langgraph`](https://docs.temporal.io/develop/python/integrations/langgraph) -plugin so each LangGraph node runs as a durable **Temporal activity** (the LLM -`agent` node) or inline in the **workflow** (the `tools` node) — set per node -with `execute_in`. *Temporal is the runtime; LangGraph is the agent framework.* +This tutorial demonstrates how to build a **Temporal-backed** LangGraph agent on +AgentEx using the **unified harness surface**. The agent's LLM node runs as a +durable Temporal activity; the tools node runs inline in the workflow. -> The Temporal LangGraph plugin is currently **experimental**. +## Key Concepts -## The graph +### Temporal + LangGraph -``` -START → agent → (tool calls?) → tools → agent - → (no tool calls?) → END -``` - -- `agent` (`execute_in="activity"`): the LLM call — a retried, observable Temporal activity. -- `tools` (`execute_in="workflow"`): runs the tool calls inline in the workflow. +The ``LangGraphPlugin`` from ``temporalio.contrib.langgraph`` turns annotated graph +nodes into Temporal activities or inline workflow callables: -The router and tools are `async` so LangGraph awaits them directly (a sync -callable is offloaded via `run_in_executor`, which Temporal workflows forbid). +- `agent` node: `execute_in="activity"` (durable, retryable LLM call) +- `tools` node: `execute_in="workflow"` (inline, fast tool execution) -## Project structure - -``` -130_langgraph/ -├── project/ -│ ├── acp.py # Thin async ACP server; registers the LangGraphPlugin -│ ├── workflow.py # Runs the graph each turn; keeps multi-turn memory -│ ├── graph.py # LangGraph graph; nodes tagged execute_in activity/workflow -│ └── tools.py # Async tool(s) -└── run_worker.py is project/run_worker.py -``` +### Message surfacing -## Running +After each turn, ``emit_langgraph_messages`` converts the new LangGraph messages +(tool requests, tool responses, final text) into AgentEx ``TaskMessage`` objects +and posts them to the task's message stream. -```bash -agentex agents run --manifest manifest.yaml -``` +This is the Temporal-specific path. The non-Temporal async/sync channels use +``UnifiedEmitter.auto_send_turn`` / ``UnifiedEmitter.yield_turn`` with +``LangGraphTurn`` instead. -Open the Temporal UI at http://localhost:8080 to watch the workflow and the -`agent` activity execute. Use `dev.ipynb` to create a task and send messages. +## Files -## Adding tools +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server (Temporal config, LangGraphPlugin) | +| `project/graph.py` | LangGraph graph (agent + tools nodes) | +| `project/workflow.py` | Temporal workflow (signal handlers, emit_langgraph_messages) | +| `project/run_worker.py` | Temporal worker runner | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: at130-langgraph) | -Define an **async** `@tool` in `project/tools.py` and add it to `TOOLS`. The -model is bound with `TOOLS` and the tool node runs them by name. +## Running Locally -For a fuller version with human-in-the-loop approval and graph-introspection -queries, scaffold the `temporal-langgraph` template via `agentex init`. +```bash +agentex agents run +``` -## Tests +## Running Tests -- `tests/test_graph_temporal.py` — hermetic ReAct-loop test with a stub model, - plus a live end-to-end run through the real Temporal plugin (skipped unless - `LITELLM_API_KEY` is set). -- `tests/test_agent.py` — live integration against a running agent. +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/dev.ipynb b/examples/tutorials/10_async/10_temporal/130_langgraph/dev.ipynb deleted file mode 100644 index 5320daac7..000000000 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/dev.ipynb +++ /dev/null @@ -1,126 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "36834357", - "metadata": {}, - "outputs": [], - "source": [ - "from agentex import Agentex\n", - "\n", - "client = Agentex(base_url=\"http://localhost:5003\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1c309d6", - "metadata": {}, - "outputs": [], - "source": [ - "AGENT_NAME = \"at130-langgraph\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f6e6ef0", - "metadata": {}, - "outputs": [], - "source": [ - "# (REQUIRED) Create a new task. For Async agents, you must create a task for messages to be associated with.\n", - "import uuid\n", - "\n", - "rpc_response = client.agents.create_task(\n", - " agent_name=AGENT_NAME,\n", - " params={\n", - " \"name\": f\"{str(uuid.uuid4())[:8]}-task\",\n", - " \"params\": {}\n", - " }\n", - ")\n", - "\n", - "task = rpc_response.result\n", - "print(task)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b03b0d37", - "metadata": {}, - "outputs": [], - "source": [ - "# Send an event to the agent\n", - "\n", - "# The response is expected to be a list of TaskMessage objects, which is a union of the following types:\n", - "# - TextContent: A message with just text content \n", - "# - DataContent: A message with JSON-serializable data content\n", - "# - ToolRequestContent: A message with a tool request, which contains a JSON-serializable request to call a tool\n", - "# - ToolResponseContent: A message with a tool response, which contains response object from a tool call in its content\n", - "\n", - "# When processing the message/send response, if you are expecting more than TextContent, such as DataContent, ToolRequestContent, or ToolResponseContent, you can process them as well\n", - "\n", - "rpc_response = client.agents.send_event(\n", - " agent_name=AGENT_NAME,\n", - " params={\n", - " \"content\": {\"type\": \"text\", \"author\": \"user\", \"content\": \"Hello what can you do?\"},\n", - " \"task_id\": task.id,\n", - " }\n", - ")\n", - "\n", - "event = rpc_response.result\n", - "print(event)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6927cc0", - "metadata": {}, - "outputs": [], - "source": [ - "# Subscribe to the async task messages produced by the agent\n", - "from agentex.lib.utils.dev_tools import subscribe_to_async_task_messages\n", - "\n", - "task_messages = subscribe_to_async_task_messages(\n", - " client=client,\n", - " task=task, \n", - " only_after_timestamp=event.created_at, \n", - " print_messages=True,\n", - " rich_print=True,\n", - " timeout=5,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4864e354", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/environments.yaml b/examples/tutorials/10_async/10_temporal/130_langgraph/environments.yaml deleted file mode 100644 index d54d8e5ff..000000000 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/environments.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Agent Environment Configuration -# ------------------------------ -# This file defines environment-specific settings for your agent. -# This DIFFERS from the manifest.yaml file in that it is used to program things that are ONLY per environment. - -# ********** EXAMPLE ********** -# schema_version: "v1" # This is used to validate the file structure and is not used by the agentex CLI -# environments: -# dev: -# auth: -# principal: -# user_id: "1234567890" -# user_name: "John Doe" -# user_email: "john.doe@example.com" -# user_role: "admin" -# user_permissions: "read, write, delete" -# helm_overrides: # This is used to override the global helm values.yaml file in the agentex-agent helm charts -# replicas: 3 -# resources: -# requests: -# cpu: "1000m" -# memory: "2Gi" -# limits: -# cpu: "2000m" -# memory: "4Gi" -# env: -# - name: LOG_LEVEL -# value: "DEBUG" -# - name: ENVIRONMENT -# value: "staging" -# -# kubernetes: -# # OPTIONAL - Otherwise it will be derived from separately. However, this can be used to override the derived -# # namespace and deploy it with in the same namespace that already exists for a separate agent. -# namespace: "team-at130-langgraph" -# ********** END EXAMPLE ********** - -schema_version: "v1" # This is used to validate the file structure and is not used by the agentex CLI -environments: - dev: - auth: - principal: - user_id: # TODO: Fill in - account_id: # TODO: Fill in - helm_overrides: - # This is used to override the global helm values.yaml file in the agentex-agent helm charts - replicaCount: 2 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - temporal-worker: - enabled: true - replicaCount: 2 - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" \ No newline at end of file diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/manifest.yaml b/examples/tutorials/10_async/10_temporal/130_langgraph/manifest.yaml index d1f5960b1..534c8dd58 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/manifest.yaml +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/manifest.yaml @@ -1,20 +1,5 @@ -# Agent Manifest Configuration -# --------------------------- -# This file defines how your agent should be built and deployed. - -# Build Configuration -# ------------------ -# The build config defines what gets packaged into your agent's Docker image. -# This same configuration is used whether building locally or remotely. -# -# When building: -# 1. All files from include_paths are collected into a build context -# 2. The context is filtered by dockerignore rules -# 3. The Dockerfile uses this context to build your agent's image -# 4. The image is pushed to a registry and used to run your agent build: context: - # Build from the tutorials root so shared test_utils are available. root: ../../../ include_paths: - 10_async/10_temporal/130_langgraph @@ -22,107 +7,53 @@ build: dockerfile: 10_async/10_temporal/130_langgraph/Dockerfile dockerignore: 10_async/10_temporal/130_langgraph/.dockerignore - -# Local Development Configuration -# ----------------------------- -# Only used when running the agent locally local_development: agent: - port: 8000 # Port where your local ACP server is running - host_address: host.docker.internal # Host address for Docker networking (host.docker.internal for Docker, localhost for direct) - - # File paths for local development (relative to this manifest.yaml) + port: 8000 + host_address: host.docker.internal paths: - # Path to ACP server file - # Examples: - # project/acp.py (standard) - # src/server.py (custom structure) - # ../shared/acp.py (shared across projects) - # /absolute/path/acp.py (absolute path) acp: project/acp.py - - # Path to temporal worker file - # Examples: - # project/run_worker.py (standard) - # workers/temporal.py (custom structure) - # ../shared/worker.py (shared across projects) worker: project/run_worker.py - -# Agent Configuration -# ----------------- agent: - # Type of agent - either sync or async acp_type: async - - # Unique name for your agent - # Used for task routing and monitoring name: at130-langgraph + description: "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" - # Description of what your agent does - # Helps with documentation and discovery - description: "A Temporal-backed LangGraph agent whose nodes run as Temporal activities" - - # Temporal workflow configuration - # This enables your agent to run as a Temporal workflow for long-running tasks temporal: enabled: true workflows: - # Name of the workflow class - # Must match the @workflow.defn name in your workflow.py - name: at130-langgraph - - # Queue name for task distribution - # Used by Temporal to route tasks to your agent - # Convention: _task_queue queue_name: at130_langgraph_queue - # Optional: Health check port for temporal worker - # Defaults to 80 if not specified - # health_check_port: 80 - - # Optional: Credentials mapping - # Maps Kubernetes secrets to environment variables - # Common credentials include: credentials: - env_var_name: REDIS_URL secret_name: redis-url-secret secret_key: url - # - env_var_name: LITELLM_API_KEY - # secret_name: litellm-api-key - # secret_key: api-key - - # Optional: Set Environment variables for running your agent locally as well - # as for deployment later on - env: {} - # LITELLM_API_KEY: "" - # OPENAI_BASE_URL: "" - # OPENAI_ORG_ID: "" + # graph.py builds ChatOpenAI(model=MODEL_NAME); a deployed worker needs the + # model credential or the first activity call fails. + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + env: {} -# Deployment Configuration -# ----------------------- -# Configuration for deploying your agent to Kubernetes clusters deployment: - # Container image configuration image: - repository: "" # Update with your container registry - tag: "latest" # Default tag, should be versioned in production + repository: "" + tag: "latest" - imagePullSecrets: [] # Update with your image pull secret name - # - name: my-registry-secret + imagePullSecrets: [] - # Global deployment settings that apply to all clusters - # These can be overridden in cluster-specific environments (environments.yaml) global: - # Default replica count + agent: + name: "at130-langgraph" + description: "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" replicaCount: 1 - - # Default resource requirements resources: requests: cpu: "500m" memory: "1Gi" limits: cpu: "1000m" - memory: "2Gi" \ No newline at end of file + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/project/acp.py b/examples/tutorials/10_async/10_temporal/130_langgraph/project/acp.py index c01f8831c..7af9c5e68 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/project/acp.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/project/acp.py @@ -1,19 +1,13 @@ -"""ACP server for the Temporal LangGraph agent. +"""ACP server for the Temporal harness LangGraph agent. -This file is intentionally thin. When ``acp_type="async"`` is combined with -``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: +Follows the ``130_langgraph`` pattern: the Temporal ``LangGraphPlugin`` runs +graph nodes as Temporal activities. The agent logic lives in ``workflow.py`` +(the runtime) and ``graph.py`` (the LangGraph graph), executed by the Temporal +worker (``run_worker.py``), not by this HTTP process. - HTTP task/create → @workflow.run on the workflow class - HTTP task/event/send → @workflow.signal(SignalName.RECEIVE_EVENT) - HTTP task/cancel → workflow cancellation via the Temporal client - -so we don't define any handlers here. The agent logic lives in -``project/workflow.py`` (the runtime) and ``project/graph.py`` (the LangGraph -graph whose nodes run as Temporal activities), executed by the Temporal worker -(``project/run_worker.py``), not by this HTTP process. - -The ``LangGraphPlugin`` is registered here too so the Temporal client started -by FastACP shares the same graph registry as the worker. +The workflow uses ``emit_langgraph_messages`` to surface turn messages to +AgentEx. That helper is Temporal-specific and is not replaced by the unified +harness here (``UnifiedEmitter`` targets the non-Temporal async/sync channels). """ from __future__ import annotations @@ -33,10 +27,8 @@ acp = FastACP.create( acp_type="async", config=TemporalACPConfig( - # When deployed to the cluster, the Temporal address is set automatically. - # Locally we point at the Temporal service from docker compose. type="temporal", temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], ), -) \ No newline at end of file +) diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/project/graph.py b/examples/tutorials/10_async/10_temporal/130_langgraph/project/graph.py index 0589aa9ba..7adba3ae4 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/project/graph.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/project/graph.py @@ -1,24 +1,9 @@ """LangGraph graph for at130-langgraph — nodes run as Temporal activities. -The ``temporalio.contrib.langgraph`` plugin runs each node where its -``execute_in`` metadata says: the LLM ``agent`` node as a durable Temporal -**activity**, the ``tools`` node inline in the **workflow**. - - START → agent → (tool calls?) → tools → agent - → (no tool calls?) → END - -The router and tools are ``async`` so LangGraph awaits them directly — a sync -callable would be offloaded via ``run_in_executor``, which Temporal's workflow -event loop does not support. - -The in-workflow ``tools`` node is a plain ``async`` function rather than -LangGraph's ``ToolNode`` prebuilt on purpose. The plugin wraps an in-workflow -node in ``wrap_workflow``, whose closure captures the wrapped object. When that -object is itself a LangChain ``Runnable`` (as ``ToolNode`` is), LangGraph's -``compile()`` subgraph detection (``find_subgraph_pregel`` → -``get_function_nonlocals``) recurses through that wrapper without cycle -detection and never terminates, tripping Temporal's deadlock detector. A plain -function isn't a ``Runnable``, so compile stays trivial. +Identical in structure to ``130_langgraph/project/graph.py``. The graph +definition is not affected by the harness migration; only the agent naming +changes. The LLM ``agent`` node runs as a durable Temporal activity; +the ``tools`` node runs inline in the workflow. """ from __future__ import annotations @@ -40,10 +25,8 @@ from project.tools import TOOLS -# Look up tools by name for the in-workflow tools node. _TOOLS_BY_NAME = {tool.name: tool for tool in TOOLS} -# Name this graph is registered under in the LangGraphPlugin (acp.py / run_worker.py). GRAPH_NAME = "at130-langgraph" MODEL_NAME = "gpt-4o" SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. @@ -62,37 +45,27 @@ async def agent_node(state: AgentState) -> dict[str, Any]: llm = ChatOpenAI(model=MODEL_NAME).bind_tools(TOOLS) messages = state["messages"] if not messages or not isinstance(messages[0], SystemMessage): - system = SystemMessage( - content=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - ) + system = SystemMessage(content=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) messages = [system, *messages] return {"messages": [await llm.ainvoke(messages)]} async def tools_node(state: AgentState) -> dict[str, Any]: - """Run the tool calls the model requested. Runs inline in the workflow. - - A plain ``async`` function (not LangGraph's ``ToolNode``) — see the module - docstring for why a ``Runnable`` tools node can't be compiled here. - """ + """Run the tool calls the model requested. Runs inline in the workflow.""" last = state["messages"][-1] results: list[Any] = [] for call in getattr(last, "tool_calls", None) or []: tool = _TOOLS_BY_NAME.get(call["name"]) - # Mirror ToolNode: surface an unknown/hallucinated tool name as an error - # ToolMessage so the graph keeps running instead of crashing the node. if tool is None: output = f"Error: unknown tool {call['name']!r}. Available: {list(_TOOLS_BY_NAME)}" else: output = await tool.ainvoke(call["args"]) - results.append( - ToolMessage(content=str(output), tool_call_id=call["id"], name=call["name"]) - ) + results.append(ToolMessage(content=str(output), tool_call_id=call["id"], name=call["name"])) return {"messages": results} async def route_after_agent(state: AgentState) -> str: - """Go to the tools node if the model requested tools, else finish (async router).""" + """Go to the tools node if the model requested tools, else finish.""" last = state["messages"][-1] return "tools" if getattr(last, "tool_calls", None) else END diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/project/run_worker.py b/examples/tutorials/10_async/10_temporal/130_langgraph/project/run_worker.py index 7040f560b..4b31bf396 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/project/run_worker.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/project/run_worker.py @@ -5,8 +5,7 @@ The ``LangGraphPlugin`` is given the graph registry (``{ GRAPH_NAME: graph }``). At runtime it turns the graph's ``execute_in="activity"`` nodes into Temporal -activities and registers them on the worker automatically — so we don't have -to enumerate node activities by hand. +activities and registers them on the worker automatically. """ import asyncio @@ -14,7 +13,7 @@ from temporalio.contrib.langgraph import LangGraphPlugin from project.graph import GRAPH_NAME, build_graph -from project.workflow import At130LanggraphWorkflow +from project.workflow import AtHarnessLanggraphWorkflow from agentex.lib.utils.debug import setup_debug_if_enabled from agentex.lib.utils.logging import make_logger from agentex.lib.environment_variables import EnvironmentVariables @@ -32,9 +31,6 @@ async def main(): if task_queue_name is None: raise ValueError("WORKFLOW_TASK_QUEUE is not set") - # AgentexWorker runs workflows with an unsandboxed runner, so importing - # langchain/langgraph inside the workflow + nodes is fine. The LangGraph - # plugin registers the graph's activity-nodes for us. worker = AgentexWorker( task_queue=task_queue_name, plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], @@ -42,9 +38,9 @@ async def main(): await worker.run( activities=get_all_activities(), - workflow=At130LanggraphWorkflow, + workflow=AtHarnessLanggraphWorkflow, ) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/project/tools.py b/examples/tutorials/10_async/10_temporal/130_langgraph/project/tools.py index 20b7185ee..e7220016e 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/project/tools.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/project/tools.py @@ -1,20 +1,37 @@ -"""Tools for the LangGraph agent. +"""Tool definitions for the 130_langgraph temporal agent.""" -Tools are ``async`` so the in-workflow tool node can await them directly -(a sync tool would be offloaded via ``run_in_executor``, which Temporal's -workflow event loop does not allow). -""" +from langchain_core.tools import Tool -from __future__ import annotations -from langchain_core.tools import tool +def get_weather(city: str) -> str: + """Get the current weather for a city. + Args: + city: The name of the city to get weather for. -@tool -async def get_weather(city: str) -> str: - """Get the current weather for a city.""" - # TODO: replace with a real weather API call. + Returns: + A string describing the weather conditions. + """ return f"The weather in {city} is sunny and 72°F" -TOOLS = [get_weather] +async def aget_weather(city: str) -> str: + """Native async tool entrypoint. + + ``tools_node`` runs inline in the Temporal workflow and invokes tools via + ``tool.ainvoke``. A sync-only tool forces LangChain to bridge through + ``run_in_executor`` (a thread pool), which the deterministic Temporal + workflow event loop forbids (``NotImplementedError``). Providing a real + coroutine keeps tool execution on the workflow loop. + """ + return get_weather(city) + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + coroutine=aget_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/project/workflow.py b/examples/tutorials/10_async/10_temporal/130_langgraph/project/workflow.py index a50670251..b9224ca00 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/project/workflow.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/project/workflow.py @@ -1,4 +1,4 @@ -"""Temporal workflow for at130-langgraph — Temporal as the LangGraph runtime. +"""Temporal workflow for at130-langgraph. Each turn the workflow runs the LangGraph graph (``project/graph.py``) via the ``temporalio.contrib.langgraph`` plugin. The plugin runs the LLM ``agent`` node @@ -37,7 +37,7 @@ @workflow.defn(name=environment_variables.WORKFLOW_NAME) -class At130LanggraphWorkflow(BaseWorkflow): +class AtHarnessLanggraphWorkflow(BaseWorkflow): """Runs the LangGraph agent each turn; its nodes run as Temporal activities.""" def __init__(self) -> None: @@ -56,10 +56,7 @@ async def on_task_event_send(self, params: SendEventParams) -> None: result = await compiled.ainvoke({"messages": self._messages}) self._messages = result["messages"] - # Surface the messages this turn produced (tool calls, results, final - # text) to the AgentEx UI. The SDK helper does the LangGraph→AgentEx - # message conversion. - await emit_langgraph_messages(self._messages[self._emitted:], params.task.id) + await emit_langgraph_messages(self._messages[self._emitted :], params.task.id) self._emitted = len(self._messages) @workflow.signal diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/pyproject.toml b/examples/tutorials/10_async/10_temporal/130_langgraph/pyproject.toml index e22905de4..6d2262761 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/pyproject.toml +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/pyproject.toml @@ -5,13 +5,11 @@ build-backend = "hatchling.build" [project] name = "at130-langgraph" version = "0.1.0" -description = "A Temporal-backed LangGraph agent whose nodes run as Temporal activities" +description = "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" requires-python = ">=3.12" dependencies = [ "agentex-sdk", "scale-gp", - # Temporal with the LangGraph plugin (temporalio.contrib.langgraph), - # which runs LangGraph nodes as Temporal activities. Needs >=1.27.0. "temporalio[langgraph]>=1.27.0", "langchain-openai", "langchain-core", @@ -39,4 +37,4 @@ target-version = ['py312'] [tool.isort] profile = "black" -line_length = 88 \ No newline at end of file +line_length = 88 diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_agent.py index b798f568f..f2292389f 100644 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_agent.py +++ b/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_agent.py @@ -1,4 +1,4 @@ -"""Integration tests for the Temporal + LangGraph agent (live agent required). +"""Integration tests for the Temporal harness LangGraph agent (live agent required). These drive a *running* agent over the AgentEx API and verify that: - the agent sends a welcome message on task creation, @@ -6,9 +6,6 @@ (proving the LLM node ran as a Temporal activity and the tool node ran), - the final answer reflects the tool output. -For fast, network-free coverage of the graph + human-in-the-loop logic, see -``test_graph_temporal.py``. - To run: 1. Start the agent (worker + ACP server): ``agentex agents run --manifest manifest.yaml`` 2. Set AGENTEX_API_BASE_URL if not using the default @@ -60,29 +57,18 @@ class TestNonStreamingEvents: @pytest.mark.asyncio async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): """Create a task, ask about weather, verify the tool round-trip.""" - task_response = await client.agents.create_task( - agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex) - ) + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) task = task_response.result assert task is not None - # Wait for the welcome message from on_task_create task_creation_found = False - async for message in poll_messages( - client=client, task_id=task.id, timeout=30, sleep_interval=1.0 - ): + async for message in poll_messages(client=client, task_id=task.id, timeout=30, sleep_interval=1.0): assert isinstance(message, TaskMessage) - if ( - message.content - and message.content.type == "text" - and message.content.author == "agent" - ): + if message.content and message.content.type == "text" and message.content.author == "agent": task_creation_found = True break assert task_creation_found, "Task creation welcome message not found" - # Ask about weather — the agent (LangGraph node, as a Temporal activity) - # should call get_weather. seen_tool_request = False seen_tool_response = False final_message = None @@ -101,11 +87,7 @@ async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): if message.content and message.content.type == "tool_response": seen_tool_response = True - if ( - message.content - and message.content.type == "text" - and message.content.author == "agent" - ): + if message.content and message.content.type == "text" and message.content.author == "agent": final_message = message content_length = len(getattr(message.content, "content", "") or "") if getattr(message, "streaming_status", None) in (None, "DONE") and content_length > 0: @@ -115,11 +97,8 @@ async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): assert seen_tool_request, "Expected a tool_request (agent calling get_weather)" assert seen_tool_response, "Expected a tool_response (get_weather result)" assert final_message is not None, "Expected a final agent text message" - final_text = ( - getattr(final_message.content, "content", None) if final_message.content else None - ) + final_text = getattr(final_message.content, "content", None) if final_message.content else None assert isinstance(final_text, str) and len(final_text) > 0 - # get_weather always returns "72°F" — the response should mention it. assert "72" in final_text, "Expected weather response to mention 72°F" diff --git a/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_graph_temporal.py b/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_graph_temporal.py deleted file mode 100644 index 485b896f6..000000000 --- a/examples/tutorials/10_async/10_temporal/130_langgraph/tests/test_graph_temporal.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Tests for the Temporal + LangGraph agent's graph. - -Two layers: - -1. ``TestGraphLogic`` — hermetic, no network. Compiles the actual shipped - graph (``project/graph.py``) with a deterministic stub model and runs the - ReAct loop (agent → tools → agent) to completion. - -2. ``TestTemporalPlugin`` — end-to-end through the real Temporal LangGraph - plugin on a local Temporal server, proving the LLM node runs as an activity - and the tool node in the workflow. Needs a real model, so it is skipped - unless ``LITELLM_API_KEY`` (or ``OPENAI_API_KEY``) is set. - -Run from the agent's own (uv) environment: pytest tests/test_graph_temporal.py -v -""" - -from __future__ import annotations - -import os -import uuid - -import pytest - -pytest.importorskip("langgraph") -pytest.importorskip("temporalio.contrib.langgraph") - -import project.graph as graph_module -from temporalio import workflow -from project.graph import GRAPH_NAME, build_graph -from langchain_core.messages import AIMessage, ToolMessage -from temporalio.contrib.langgraph import graph as lg_graph - - -@workflow.defn -class _DriverWorkflow: - """Module-level driver workflow (Temporal forbids local workflow classes).""" - - @workflow.run - async def run(self, message: str) -> str: - compiled = lg_graph(GRAPH_NAME).compile() - result = await compiled.ainvoke({"messages": [{"role": "user", "content": message}]}) - return result["messages"][-1].content - - -class _StubModel: - """Deterministic stand-in for ``ChatOpenAI(...).bind_tools(...)``. - - First call → emit a tool call for ``get_weather``; once a ToolMessage is in - the history → emit a plain text answer. Drives the full ReAct loop offline. - """ - - def bind_tools(self, _tools): - return self - - async def ainvoke(self, messages): - if any(isinstance(m, ToolMessage) for m in messages): - return AIMessage(content="All done — the tool has run.") - return AIMessage( - content="", - tool_calls=[{"id": "call_1", "name": "get_weather", "args": {"city": "Denver"}}], - ) - - -class TestGraphLogic: - """Hermetic test of the ReAct loop, no network.""" - - @pytest.mark.asyncio - async def test_react_loop_runs_tool(self, monkeypatch): - monkeypatch.setattr(graph_module, "ChatOpenAI", lambda *_a, **_k: _StubModel()) - compiled = build_graph().compile() - result = await compiled.ainvoke({"messages": [{"role": "user", "content": "go"}]}) - - tool_outputs = [m.content for m in result["messages"] if isinstance(m, ToolMessage)] - assert any("sunny" in o for o in tool_outputs) - assert "done" in result["messages"][-1].content.lower() - - -@pytest.mark.skipif( - not (os.environ.get("LITELLM_API_KEY") or os.environ.get("OPENAI_API_KEY")), - reason="needs a real model (set LITELLM_API_KEY) for the live Temporal run", -) -class TestTemporalPlugin: - """End-to-end through the real Temporal LangGraph plugin on a local server.""" - - @pytest.mark.asyncio - async def test_nodes_run_as_activities_via_plugin(self): - from temporalio.worker import Worker, UnsandboxedWorkflowRunner - from temporalio.testing import WorkflowEnvironment - from temporalio.contrib.langgraph import LangGraphPlugin - - plugin = LangGraphPlugin(graphs={GRAPH_NAME: build_graph()}) - async with await WorkflowEnvironment.start_local(plugins=[plugin]) as env: - async with Worker( - env.client, - task_queue="tq", - workflows=[_DriverWorkflow], - workflow_runner=UnsandboxedWorkflowRunner(), - ): - out = await env.client.execute_workflow( - _DriverWorkflow.run, - "What's the weather in Denver? Use the get_weather tool.", - id=f"wf-{uuid.uuid4()}", - task_queue="tq", - ) - assert "denver" in out.lower() diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore b/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile b/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile new file mode 100644 index 000000000..c909ee6c7 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile @@ -0,0 +1,46 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +RUN npm install -g @anthropic-ai/claude-code || true + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/140_claude_code/pyproject.toml /app/140_claude_code/pyproject.toml +COPY 10_async/10_temporal/140_claude_code/README.md /app/140_claude_code/README.md + +WORKDIR /app/140_claude_code + +COPY 10_async/10_temporal/140_claude_code/project /app/140_claude_code/project +COPY 10_async/10_temporal/140_claude_code/tests /app/140_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at140-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When deploying the worker, replace the CMD with: +# CMD ["python", "project/run_worker.py"] diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/README.md b/examples/tutorials/10_async/10_temporal/140_claude_code/README.md new file mode 100644 index 000000000..61cc94183 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 140 (async/temporal): Temporal Claude Code Agent + +This tutorial demonstrates how to build a **Temporal-backed** agent that +spawns the Claude Code CLI as a local subprocess and delivers its output +through the Agentex unified harness surface via ``ClaudeCodeTurn`` and +``UnifiedEmitter.auto_send_turn``, with Temporal providing durable execution +and crash recovery. + +## Key Concepts + +### Temporal + ClaudeCodeTurn + +The Temporal workflow (``project/workflow.py``) holds state durably. Each user +message arrives as a signal (``on_task_event_send``), spawns the Claude Code +CLI locally, wraps the stdout line stream in ``ClaudeCodeTurn``, and pushes +events to the task's Redis stream via ``UnifiedEmitter.auto_send_turn``. + +``workflow.now()`` is passed as ``created_at`` so message timestamps are +deterministic under Temporal replay. + +### Multi-turn session resume + +The workflow persists the Claude Code ``session_id`` from the ``result`` +envelope. On the next turn, ``-r `` is passed to the CLI to +resume the conversation. Temporal's durable state ensures the session_id +survives worker crashes. + +### Note on subprocess in workflow code + +For simplicity, this tutorial spawns the subprocess directly inside the +workflow signal handler. For production use, move the spawn into a custom +Temporal activity so each subprocess invocation gets independent retry and +timeout guarantees. See +``examples/tutorials/10_async/10_temporal/030_custom_activities/`` for +that pattern. + +### Injectable spawn seam + +``_spawn_claude`` in ``project/workflow.py`` is a top-level async generator. +Tests monkeypatch it to inject pre-recorded stream-json lines so offline +unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | Thin ACP server; wires Temporal (no handlers) | +| ``project/workflow.py`` | Temporal workflow + ``_spawn_claude`` seam | +| ``project/run_worker.py`` | Temporal worker entry point | +| ``tests/test_agent.py`` | Live integration tests (needs CLI + Temporal + API key) | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires Temporal server, the ``claude`` CLI, and ``ANTHROPIC_API_KEY``: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI, Temporal, or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. +- The subprocess spawn should be moved to a custom activity in production. +- The ``--verbose`` flag is included to match the golden agent's invocation. diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml b/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml new file mode 100644 index 000000000..9328b1713 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/140_claude_code + - test_utils + dockerfile: 10_async/10_temporal/140_claude_code/Dockerfile + dockerignore: 10_async/10_temporal/140_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at140-claude-code + description: A Temporal-backed Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: true + workflows: + - name: at140-claude-code + queue_name: at140_claude_code_queue + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at140-claude-code" + description: "A Temporal-backed Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/__init__.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py new file mode 100644 index 000000000..07258f6d8 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py @@ -0,0 +1,31 @@ +"""ACP server for the Temporal Claude Code tutorial. + +This file is intentionally thin. When ``acp_type="async"`` is combined +with ``TemporalACPConfig``, FastACP auto-wires: + + HTTP task/create -> @workflow.run on the workflow class + HTTP task/event/send -> @workflow.signal(SignalName.RECEIVE_EVENT) + HTTP task/cancel -> workflow cancellation via the Temporal client + +The actual agent code lives in ``project/workflow.py`` and is executed by +the Temporal worker (``project/run_worker.py``), not by this HTTP process. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py new file mode 100644 index 000000000..dcba0f9a7 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py @@ -0,0 +1,139 @@ +"""Temporal activity for the Claude Code tutorial. + +Subprocess spawning (and any other I/O) must run inside a Temporal *activity*, +not in workflow code. Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(or threads / sockets), so spawning the CLI directly in the signal handler +raises ``NotImplementedError``. This activity runs the Claude Code CLI, drives +the ``ClaudeCodeTurn`` through ``UnifiedEmitter.auto_send_turn`` (the async +Redis push path), and returns the turn result to the workflow. + +The ``_spawn_claude`` async generator is an injectable seam: offline tests +provide a fake that yields pre-recorded stdout lines so no real CLI runs. +""" + +from __future__ import annotations + +import asyncio +from typing import Any, AsyncIterator +from datetime import datetime + +from temporalio import activity + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel + +logger = make_logger(__name__) + +RUN_CLAUDE_CODE_TURN_ACTIVITY = "run_claude_code_turn" + + +class RunClaudeCodeTurnParams(BaseModel): + """Arguments for one Claude Code turn run inside an activity.""" + + task_id: str + prompt: str + trace_id: str | None = None + parent_span_id: str | None = None + session_id: str | None = None + created_at: datetime | None = None + + +class RunClaudeCodeTurnResult(BaseModel): + """Result returned from the activity to the workflow.""" + + final_text: str + session_id: str | None = None + + +async def _spawn_claude(prompt: str, session_id: str | None = None) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + Pass ``session_id`` to resume a previous Claude Code session (multi-turn + memory via ``-r ``). + + Injectable seam: tests monkeypatch this with a fake async iterator so no + real CLI invocation is needed offline. + """ + cmd = [ + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + ] + if session_id: + cmd.extend(["-r", session_id]) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. + stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@activity.defn(name=RUN_CLAUDE_CODE_TURN_ACTIVITY) +async def run_claude_code_turn(params: RunClaudeCodeTurnParams) -> dict[str, Any]: + """Run one Claude Code turn end-to-end and stream events to the task. + + Runs in an activity (real asyncio loop) so subprocess I/O is permitted. + """ + emitter = UnifiedEmitter( + task_id=params.task_id, + trace_id=params.trace_id, + parent_span_id=params.parent_span_id, + ) + turn = ClaudeCodeTurn(_spawn_claude(params.prompt, session_id=params.session_id)) + result = await emitter.auto_send_turn(turn, created_at=params.created_at) + + return RunClaudeCodeTurnResult(final_text=result.final_text, session_id=turn.session_id).model_dump() diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py new file mode 100644 index 000000000..58802737e --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py @@ -0,0 +1,41 @@ +"""Temporal worker for the Claude Code tutorial. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The Claude Code CLI subprocess runs in the ``run_claude_code_turn`` activity +(registered below alongside the built-in Agentex activities), because +subprocess I/O is not permitted on the Temporal workflow event loop. +""" + +import asyncio + +from project.workflow import At140ClaudeCodeWorkflow +from project.activities import run_claude_code_turn +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker(task_queue=task_queue_name) + + await worker.run( + activities=[run_claude_code_turn, *get_all_activities()], + workflow=At140ClaudeCodeWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py new file mode 100644 index 000000000..7f50ba8d5 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py @@ -0,0 +1,137 @@ +"""Temporal workflow for the Claude Code tutorial. + +Holds conversation state (session_id for multi-turn resume) durably across +crashes. Each user message triggers ``on_task_event_send``, which delegates the +turn to the ``run_claude_code_turn`` activity. The activity spawns the Claude +Code CLI, wraps its stdout in ``ClaudeCodeTurn``, and delivers the turn via +``UnifiedEmitter.auto_send_turn`` (the async Redis push path). + +Note on subprocess inside Temporal +------------------------------------ +Subprocess (and all other) I/O must run in a Temporal *activity*, never in +workflow code. Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(spawning the CLI there raises ``NotImplementedError``). The activity also gets +Temporal's retry + timeout guarantees. See +``examples/tutorials/10_async/10_temporal/030_custom_activities/`` for the +activity pattern. +""" + +from __future__ import annotations + +import os +import json +from datetime import timedelta + +from temporalio import workflow + +from agentex.lib import adk +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +with workflow.unsafe.imports_passed_through(): + from project.activities import RunClaudeCodeTurnParams, run_claude_code_turn + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class At140ClaudeCodeWorkflow(BaseWorkflow): + """Temporal workflow that runs Claude Code locally for each user message. + + Persists the Claude Code session_id across turns so the CLI can resume + the conversation (``-r ``). Temporal's durable state ensures + the session_id survives worker crashes. + """ + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + # Claude Code session_id for multi-turn resume. + self._session_id: str | None = None + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a user message: spawn Claude Code and push events to the task stream.""" + self._turn_number += 1 + task_id = params.task.id + prompt = params.event.content.content + logger.info("Turn %d for task %s", self._turn_number, task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name=f"Turn {self._turn_number}", + input={"message": prompt}, + ) as span: + # Delegate the subprocess turn to an activity: subprocess I/O is not + # permitted on the Temporal workflow event loop. The activity streams + # events to the task and returns the final text + session_id. + # workflow.now() gives a deterministic timestamp under replay. + result = await workflow.execute_activity( + run_claude_code_turn, + RunClaudeCodeTurnParams( + task_id=task_id, + prompt=prompt, + trace_id=task_id, + parent_span_id=span.id if span else None, + session_id=self._session_id, + created_at=workflow.now(), + ), + start_to_close_timeout=timedelta(minutes=5), + ) + + # Capture session_id to enable Claude Code resume on the next turn. + sid = result.get("session_id") + if sid: + self._session_id = sid + + if span: + span.output = {"final_text": result.get("final_text")} + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + logger.info("Task created: %s", params.task.id) + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n" + "Send me a message and I'll run it through Claude Code locally." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml b/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml new file mode 100644 index 000000000..b9d517267 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml @@ -0,0 +1,27 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at140-claude-code" +version = "0.1.0" +description = "A Temporal-backed Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio>=1.18.2", + "python-dotenv>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py new file mode 100644 index 000000000..767c707b9 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py @@ -0,0 +1,249 @@ +"""Tests for the Temporal Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require Temporal server, the ACP server, the Temporal worker, the ``claude`` + CLI on PATH, and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline drives + ``auto_send_turn``, populates usage, and satisfies the ``HarnessTurn`` + protocol. + - Always run -- no CLI or API key needed. +""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-offline-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Temporal Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-offline-1", + "usage": {"input_tokens": 15, "output_tokens": 7}, + "cost_usd": 0.00015, + "duration_ms": 350, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-t1", task_id="task-temporal-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI, Temporal, or network.""" + + @pytest.mark.asyncio + async def test_auto_send_text_only_produces_output(self): + """auto_send_turn result carries the agent's reply text.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-temporal", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + assert "Hello from Temporal Claude Code" in result.final_text + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """Usage is populated after the events stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + usage = turn.usage() + assert usage.input_tokens == 15 + assert usage.output_tokens == 7 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear via yield_turn on a ClaudeCodeTurn.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_session_id_captured_in_result_envelope(self): + """The result envelope carries session_id (multi-turn resume support).""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + assert turn._result_envelope is not None + assert turn._result_envelope.get("session_id") == "sess-temporal-offline-1" + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at140-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live Temporal tests -- needs Temporal server + the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + @pytest.fixture + def agent_id(self, client, agent_name): + agents = client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent {agent_name!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Create a task, send a message, and poll until a response appears.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + task_id = task.id + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task_id, + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 90 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task_id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + response_msgs = [m for m in agent_msgs if "Task initialized" not in str(getattr(m.content, "content", ""))] + if response_msgs: + assert len(response_msgs) >= 1 + return + time.sleep(3) + + raise AssertionError("No agent response received within 90 s") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..1adc553f1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py @@ -0,0 +1,230 @@ +"""Offline unit tests for the Temporal Claude Code tutorial agent. + +These tests do NOT require the ``claude`` CLI, Temporal, or ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in place of +the real subprocess spawn and a fake streaming backend, then assert that the +workflow's turn logic correctly drives ``UnifiedEmitter.auto_send_turn``. + +The injection seam is the ``_spawn_claude`` function in ``project/workflow.py``. +Tests monkeypatch it with a coroutine returning a pre-recorded async iterator. +""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Temporal Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-1", + "usage": {"input_tokens": 15, "output_tokens": 7}, + "cost_usd": 0.00015, + "duration_ms": 350, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-2"}), + json.dumps( + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "tool_temporal", + "name": "Bash", + "input": {"command": "ls /tmp"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_temporal", + "content": "file1\nfile2\n", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Listed files."}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-2", + "usage": {"input_tokens": 30, "output_tokens": 12}, + "cost_usd": 0.0004, + "duration_ms": 600, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-t1", task_id="task-temporal-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _run_turn(lines: list[str]): + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter( + task_id="offline-temporal", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming.sink, turn + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_text_only_produces_agent_output(): + result, sink, _ = await _run_turn(_TEXT_ONLY_LINES) + assert "Hello from Temporal Claude Code" in result.final_text + + +@pytest.mark.asyncio +async def test_usage_from_result_envelope(): + """Usage is available from turn.usage() after the events are exhausted. + + UnifiedEmitter.auto_send_turn evaluates turn.usage() eagerly before the + async generator is consumed, so result.usage is a pre-exhaust snapshot. + Read usage directly from the turn after _run_turn completes instead. + """ + result, _, turn = await _run_turn(_TEXT_ONLY_LINES) + usage = turn.usage() + assert usage.input_tokens == 15 + assert usage.output_tokens == 7 + assert usage.num_llm_calls == 1 + + +@pytest.mark.asyncio +async def test_session_id_captured_in_result_envelope(): + """Verify the result envelope carries session_id (multi-turn resume support).""" + _, _, turn = await _run_turn(_TEXT_ONLY_LINES) + assert turn._result_envelope is not None + assert turn._result_envelope.get("session_id") == "sess-temporal-1" + + +@pytest.mark.asyncio +async def test_tool_call_context_types(): + result, sink, _ = await _run_turn(_TOOL_CALL_LINES) + opened = [s for s in sink if s[0] == "open"] + content_types = [s[1] for s in opened] + assert "tool_request" in content_types + assert "text" in content_types + + +@pytest.mark.asyncio +async def test_spawn_seam_concept(): + """Demonstrate the injectable spawn seam pattern used in project/workflow.py. + + ``_spawn_claude(prompt, session_id=None)`` is a top-level async generator. + A drop-in replacement (e.g. via monkeypatch) supplies pre-recorded lines + and captures call arguments. The session_id parameter enables multi-turn + resume (``claude -r ``). + """ + called: list[tuple] = [] + + async def _fake_spawn(prompt: str, session_id=None) -> AsyncIterator[str]: + called.append((prompt, session_id)) + for line in _TEXT_ONLY_LINES: + yield line + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_spawn("temporal prompt", session_id="old-sid")) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert called == [("temporal prompt", "old-sid")] + assert "Hello from Temporal Claude Code" in result.final_text diff --git a/examples/tutorials/10_async/10_temporal/150_codex/.dockerignore b/examples/tutorials/10_async/10_temporal/150_codex/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/150_codex/Dockerfile b/examples/tutorials/10_async/10_temporal/150_codex/Dockerfile new file mode 100644 index 000000000..e861c7f33 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/Dockerfile @@ -0,0 +1,48 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install the codex CLI: the worker spawns `codex exec --json`, so the binary +# must be present on PATH in the image. +RUN npm install -g @openai/codex + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/150_codex/pyproject.toml /app/150_codex/pyproject.toml +COPY 10_async/10_temporal/150_codex/README.md /app/150_codex/README.md + +WORKDIR /app/150_codex + +COPY 10_async/10_temporal/150_codex/project /app/150_codex/project +COPY 10_async/10_temporal/150_codex/tests /app/150_codex/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app +ENV AGENT_NAME=at150-codex + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When deploying the worker, replace CMD with: +# CMD ["python", "-m", "project.run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/150_codex/README.md b/examples/tutorials/10_async/10_temporal/150_codex/README.md new file mode 100644 index 000000000..498b81374 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/README.md @@ -0,0 +1,48 @@ +# 150_codex (Temporal) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, +`CodexTurn`, and `UnifiedEmitter` for a **Temporal-durable** async ACP agent. + +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox) + inside a Temporal workflow signal handler. +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to Redis via + `UnifiedEmitter.auto_send_turn`, passing `created_at=workflow.now()` for + deterministic Temporal replay timestamps. +- Keeping the codex thread ID on the workflow instance (durable across crashes + without an external `adk.state` round-trip). + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +> **Temporal determinism note:** Subprocess spawning happens inside +> `@workflow.signal` handler bodies. Temporal does NOT replay signal handler +> bodies (only `@workflow.run` is subject to replay constraints), so this is +> safe. A production agent would wrap the subprocess in a Temporal activity for +> full durability and retry semantics. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. +3. A running Temporal server. + +## Running offline unit tests + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/10_async/10_temporal/150_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/150_codex/conftest.py b/examples/tutorials/10_async/10_temporal/150_codex/conftest.py new file mode 100644 index 000000000..6370f278d --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/conftest.py @@ -0,0 +1,17 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so FastACP, tracing, and the +Temporal workflow module can be imported without a running server. +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +# AGENT_NAME must match the manifest's agent name: the live test queries the +# server by this name, and project.workflow reads it at import time. +os.environ.setdefault("AGENT_NAME", "at150-codex") +os.environ.setdefault("ACP_URL", "http://localhost:8000") +os.environ.setdefault("WORKFLOW_NAME", "at150-codex") +os.environ.setdefault("WORKFLOW_TASK_QUEUE", "at150_codex_queue") diff --git a/examples/tutorials/10_async/10_temporal/150_codex/manifest.yaml b/examples/tutorials/10_async/10_temporal/150_codex/manifest.yaml new file mode 100644 index 000000000..d64bdfad0 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/150_codex + - test_utils + dockerfile: 10_async/10_temporal/150_codex/Dockerfile + dockerignore: 10_async/10_temporal/150_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at150-codex + description: Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: true + workflows: + - name: at150-codex + queue_name: at150_codex_queue + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at150-codex" + description: "Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/150_codex/project/__init__.py b/examples/tutorials/10_async/10_temporal/150_codex/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/150_codex/project/acp.py b/examples/tutorials/10_async/10_temporal/150_codex/project/acp.py new file mode 100644 index 000000000..39a81dde9 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/project/acp.py @@ -0,0 +1,32 @@ +"""ACP server for the Temporal Codex harness tutorial. + +This file is intentionally thin. When ``acp_type="async"`` is combined with +``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: + + HTTP task/create -> @workflow.run on the workflow class + HTTP task/event/send -> @workflow.signal(SignalName.RECEIVE_EVENT) + HTTP task/cancel -> workflow cancellation via the Temporal client + +so we don't define any handlers here. The actual agent code lives in +``project/workflow.py`` and is executed by the Temporal worker +(``project/run_worker.py``), not by this HTTP process. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/150_codex/project/activities.py b/examples/tutorials/10_async/10_temporal/150_codex/project/activities.py new file mode 100644 index 000000000..363347635 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/project/activities.py @@ -0,0 +1,145 @@ +"""Temporal activity for the Codex harness tutorial. + +Subprocess spawning (and any other I/O) must run inside a Temporal *activity*, +not in workflow code. Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(or threads / sockets), so spawning ``codex exec`` directly in the signal +handler raises ``NotImplementedError``. This activity runs codex, drives the +``CodexTurn`` through ``UnifiedEmitter.auto_send_turn`` (the async Redis push +path), and returns the turn result to the workflow. + +The ``_spawn_codex`` / ``_process_stdout`` seams are injectable: offline tests +replace them with fakes that yield pre-recorded event lines so no real CLI +runs. +""" + +from __future__ import annotations + +import os +import codecs +import asyncio +from typing import Any +from datetime import datetime +from collections.abc import AsyncIterator + +from temporalio import activity + +from agentex.lib.adk import CodexTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel + +logger = make_logger(__name__) + +RUN_CODEX_TURN_ACTIVITY = "run_codex_turn" + + +class RunCodexTurnParams(BaseModel): + """Arguments for one codex turn run inside an activity.""" + + task_id: str + prompt: str + model: str + trace_id: str | None = None + parent_span_id: str | None = None + thread_id: str | None = None + created_at: datetime | None = None + + +class RunCodexTurnResult(BaseModel): + """Result returned from the activity to the workflow.""" + + final_text: str + session_id: str | None = None + model: str | None = None + + +async def _spawn_codex( + model: str, + thread_id: str | None = None, +) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. + """ + base_flags = [ + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + ] + + if thread_id: + cmd = ["codex", "exec", *base_flags, "resume", thread_id, "-"] + else: + cmd = ["codex", "exec", *base_flags, "-"] + + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. + """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@activity.defn(name=RUN_CODEX_TURN_ACTIVITY) +async def run_codex_turn(params: RunCodexTurnParams) -> dict[str, Any]: + """Run one codex turn end-to-end and stream events to the task. + + Runs in an activity (real asyncio loop) so subprocess I/O is permitted. + """ + process = await _spawn_codex(params.model, thread_id=params.thread_id) + + assert process.stdin is not None + process.stdin.write(params.prompt.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn(events=_process_stdout(process), model=params.model) + emitter = UnifiedEmitter( + task_id=params.task_id, + trace_id=params.trace_id, + parent_span_id=params.parent_span_id, + ) + result = await emitter.auto_send_turn(turn, created_at=params.created_at) + + await process.wait() + + return RunCodexTurnResult( + final_text=result.final_text, + session_id=turn.session_id, + model=turn.usage().model, + ).model_dump() diff --git a/examples/tutorials/10_async/10_temporal/150_codex/project/run_worker.py b/examples/tutorials/10_async/10_temporal/150_codex/project/run_worker.py new file mode 100644 index 000000000..b8972806b --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/project/run_worker.py @@ -0,0 +1,41 @@ +"""Temporal worker for the Codex harness tutorial. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The codex CLI subprocess runs in the ``run_codex_turn`` activity (registered +below alongside the built-in Agentex activities), because subprocess I/O is not +permitted on the Temporal workflow event loop. +""" + +import asyncio + +from project.workflow import AtHarnessCodexWorkflow +from project.activities import run_codex_turn +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker(task_queue=task_queue_name) + + await worker.run( + activities=[run_codex_turn, *get_all_activities()], + workflow=AtHarnessCodexWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/150_codex/project/workflow.py b/examples/tutorials/10_async/10_temporal/150_codex/project/workflow.py new file mode 100644 index 000000000..1970b478f --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/project/workflow.py @@ -0,0 +1,145 @@ +"""Temporal workflow for the Codex harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for a Temporal-durable ACP agent. + +KEY CONCEPTS DEMONSTRATED: +- Running ``codex exec --json`` in the ``run_codex_turn`` activity. Subprocess + I/O is not permitted on the Temporal workflow event loop (the deterministic + sandbox loop does not implement ``subprocess_exec``), so the signal handler + delegates the turn to an activity, which also gets Temporal's retry + timeout + guarantees. +- Wrapping the stdout line stream in a ``CodexTurn`` (inside the activity). +- Delivering events via ``UnifiedEmitter.auto_send_turn``, which pushes + ``StreamTaskMessage*`` events to Redis so the UI sees tokens in real time. +- Passing ``created_at=workflow.now()`` for deterministic timestamps under + Temporal replay (required for Temporal-safe delivery). +- Persisting the codex thread ID on the workflow instance itself — Temporal's + workflow state is durable, so no external ``adk.state`` round-trip is needed. +""" + +from __future__ import annotations + +import os +from datetime import timedelta + +from temporalio import workflow + +from agentex.lib import adk +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +with workflow.unsafe.imports_passed_through(): + from project.activities import RunCodexTurnParams, run_codex_turn + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class AtHarnessCodexWorkflow(BaseWorkflow): + """Long-running Temporal workflow that runs codex exec for each turn. + + Conversation state (codex thread ID + turn counter) is kept on the + workflow instance. Temporal's durable replay reconstructs this state if + the worker crashes, so no external ``adk.state`` round-trip is needed. + """ + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + self._codex_thread_id: str | None = None + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a new user message: spawn codex, stream events via UnifiedEmitter.""" + logger.info("Received task event: %s", params.task.id) + self._turn_number += 1 + + await adk.messages.create(task_id=params.task.id, content=params.event.content) + + user_message = params.event.content.content + + async with adk.tracing.span( + trace_id=params.task.id, + task_id=params.task.id, + name=f"Turn {self._turn_number}", + input={"message": user_message}, + ) as span: + # Delegate the subprocess turn to an activity: subprocess I/O is not + # permitted on the Temporal workflow event loop. The activity streams + # events to the task and returns the final text + codex thread id. + # workflow.now() gives a deterministic timestamp under replay. + result = await workflow.execute_activity( + run_codex_turn, + RunCodexTurnParams( + task_id=params.task.id, + prompt=user_message, + model=MODEL, + trace_id=params.task.id, + parent_span_id=span.id if span else None, + thread_id=self._codex_thread_id, + created_at=workflow.now(), + ), + start_to_close_timeout=timedelta(minutes=5), + ) + + # Persist the codex thread id so the next turn resumes the session. + session_id = result.get("session_id") + if session_id: + self._codex_thread_id = session_id + + if span: + span.output = { + "final_text": result.get("final_text"), + "model": result.get("model"), + } + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + """Workflow entry point — keep the conversation alive for incoming signals.""" + logger.info("Task created: %s", params.task.id) + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized.\n" + f"Send me a message and I'll run codex (local subprocess) " + f"to answer, streaming events via the unified harness surface." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + """Graceful workflow shutdown signal.""" + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/150_codex/pyproject.toml b/examples/tutorials/10_async/10_temporal/150_codex/pyproject.toml new file mode 100644 index 000000000..7e1d6250f --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at150-codex" +version = "0.1.0" +description = "Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio>=1.18.2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/10_async/10_temporal/150_codex/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/150_codex/tests/test_agent.py new file mode 100644 index 000000000..fa6c66083 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/150_codex/tests/test_agent.py @@ -0,0 +1,275 @@ +"""Tests for the Temporal Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH, ``OPENAI_API_KEY``, and a running + Temporal + Agentex server. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexWorkflow``): + - Inject a fake async iterator of pre-recorded codex event lines. + - Assert the signal handler drives ``UnifiedEmitter.auto_send_turn`` and + captures the codex thread ID on the workflow instance. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-temporal-1"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-t1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-t1", "type": "agent_message", "text": "Hello from Temporal!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 6, "output_tokens": 3, "total_tokens": 9}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class _FakeSpan: + id = "span-temporal-1" + output: Any = None + + async def __aenter__(self): + return self + + async def __aexit__(self, *a): + pass + + +class TestOfflineCodexWorkflow: + """Unit tests that run without a real codex CLI, Temporal, or network.""" + + @pytest.mark.asyncio + async def test_codex_turn_usage_with_temporal_events(self): + """CodexTurn.usage() is correct after exhausting the temporal sample events.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + _ = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 6 + assert usage.output_tokens == 3 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_unified_emitter_auto_send_with_created_at(self): + """UnifiedEmitter.auto_send_turn accepts created_at=None without error.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message import TaskMessage + from agentex.types.text_content import TextContent + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + real_task_msg = TaskMessage( + id="msg-fake", + task_id="t", + content=TextContent(type="text", author="agent", content=""), + ) + + fake_streaming = MagicMock() + fake_ctx = AsyncMock() + fake_ctx.__aenter__ = AsyncMock(return_value=fake_ctx) + fake_ctx.__aexit__ = AsyncMock(return_value=False) + fake_ctx.stream_update = AsyncMock(return_value=MagicMock()) + fake_ctx.close = AsyncMock() + fake_ctx.task_message = real_task_msg + fake_streaming.streaming_task_message_context = MagicMock(return_value=fake_ctx) + + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + + result = await emitter.auto_send_turn(turn, created_at=None) + assert result is not None + + @pytest.mark.asyncio + async def test_thread_id_captured_after_exhausted_stream(self): + """CodexTurn._result captures the thread_id from thread.started.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + _ = [e async for e in turn.events] + + assert turn._result is not None + assert turn._result["session_id"] == "thread-temporal-1" + + @pytest.mark.asyncio + async def test_signal_handler_delegates_to_activity_and_captures_thread_id(self): + """Signal handler runs the turn via execute_activity, increments the turn + counter, and captures the codex thread ID returned by the activity.""" + captured: dict[str, Any] = {} + + async def _fake_execute_activity(_activity, params, **_kw): + captured["params"] = params + return { + "session_id": "thread-temporal-1", + "final_text": "Hello from Temporal!", + "model": "o4-mini", + } + + with patch("project.workflow.adk.messages.create", new=AsyncMock()), patch( + "project.workflow.adk.tracing.span" + ) as mock_span, patch( + "project.workflow.workflow.execute_activity", new=_fake_execute_activity + ), patch("project.workflow.workflow.now", return_value=None): + mock_span.return_value = _FakeSpan() + + from project.workflow import AtHarnessCodexWorkflow + + wf = AtHarnessCodexWorkflow.__new__(AtHarnessCodexWorkflow) + wf._turn_number = 0 + wf._codex_thread_id = None + wf._complete_task = False + wf._display_name = "test" + + params = MagicMock() + params.task.id = "task-temporal-offline-1" + params.event.content.content = "say hello temporal" + + await wf.on_task_event_send(params) + + assert wf._turn_number == 1 + assert wf._codex_thread_id == "thread-temporal-1" + assert captured["params"].prompt == "say hello temporal" + assert captured["params"].thread_id is None + + @pytest.mark.asyncio + async def test_run_codex_turn_activity_streams_and_returns_thread_id(self): + """The run_codex_turn activity drives the turn and returns the thread id.""" + from agentex.lib.core.harness import UnifiedEmitter + + async def _fake_spawn(model, thread_id=None): # noqa: ARG001 + fake_stdin = MagicMock() + fake_stdin.write = MagicMock() + fake_stdin.drain = AsyncMock() + fake_stdin.close = MagicMock() + proc = MagicMock() + proc.stdin = fake_stdin + proc.wait = AsyncMock(return_value=0) + return proc + + async def _fake_process_stdout(_process): # noqa: ARG001 + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + class _FakeTurnResult: + final_text = "Hello from Temporal!" + + async def _auto_send(_self, turn, *_a, **_kw): + async for _ in turn.events: + pass + return _FakeTurnResult() + + with patch("project.activities._spawn_codex", new=_fake_spawn), patch( + "project.activities._process_stdout", new=_fake_process_stdout + ), patch.object(UnifiedEmitter, "auto_send_turn", new=_auto_send): + from project.activities import RunCodexTurnParams, run_codex_turn + + result = await run_codex_turn( + RunCodexTurnParams( + task_id="task-temporal-offline-1", + prompt="say hello temporal", + model="o4-mini", + ) + ) + + assert result["session_id"] == "thread-temporal-1" + assert result["final_text"] == "Hello from Temporal!" + + +# --------------------------------------------------------------------------- +# Live tests +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at150-codex") + + +@pytest.mark.skipif( + not LIVE, + reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY + Temporal are available", +) +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI, Temporal, and Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_id(self, client): + for agent in client.agents.list(): + if agent.name == AGENT_NAME: + return agent.id + raise ValueError(f"Agent {AGENT_NAME!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Temporal agents process events out of band, so create a task, send an + event, and poll the task's messages for the agent's response.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task.id, + content=TextContentParam( + author="user", + content="What is 5+5? Reply with just the number.", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 90 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task.id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + response_msgs = [ + m for m in agent_msgs if "Task initialized" not in str(getattr(m.content, "content", "")) + ] + if response_msgs: + assert len(response_msgs) >= 1 + return + time.sleep(3) + + raise AssertionError("No agent response received within 90 s") diff --git a/pyproject.toml b/pyproject.toml index 98134d993..7ee0cf56b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # overlay (formerly `src/agentex/lib/*`) now lives in `adk/` and ships # as the sibling `agentex-sdk` package — see `adk/pyproject.toml`. name = "agentex-client" -version = "0.14.0" +version = "0.15.0" description = "The official Python REST client for the Agentex API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/agentex/_client.py b/src/agentex/_client.py index 1be05b767..b52ae6b78 100644 --- a/src/agentex/_client.py +++ b/src/agentex/_client.py @@ -35,12 +35,24 @@ ) if TYPE_CHECKING: - from .resources import spans, tasks, agents, events, states, tracker, messages, checkpoints, deployment_history + from .resources import ( + spans, + tasks, + agents, + events, + states, + tracker, + messages, + webhooks, + checkpoints, + deployment_history, + ) from .resources.spans import SpansResource, AsyncSpansResource from .resources.tasks import TasksResource, AsyncTasksResource from .resources.events import EventsResource, AsyncEventsResource from .resources.states import StatesResource, AsyncStatesResource from .resources.tracker import TrackerResource, AsyncTrackerResource + from .resources.webhooks import WebhooksResource, AsyncWebhooksResource from .resources.checkpoints import CheckpointsResource, AsyncCheckpointsResource from .resources.agents.agents import AgentsResource, AsyncAgentsResource from .resources.messages.messages import MessagesResource, AsyncMessagesResource @@ -202,6 +214,12 @@ def checkpoints(self) -> CheckpointsResource: return CheckpointsResource(self) + @cached_property + def webhooks(self) -> WebhooksResource: + from .resources.webhooks import WebhooksResource + + return WebhooksResource(self) + @cached_property def with_raw_response(self) -> AgentexWithRawResponse: return AgentexWithRawResponse(self) @@ -457,6 +475,12 @@ def checkpoints(self) -> AsyncCheckpointsResource: return AsyncCheckpointsResource(self) + @cached_property + def webhooks(self) -> AsyncWebhooksResource: + from .resources.webhooks import AsyncWebhooksResource + + return AsyncWebhooksResource(self) + @cached_property def with_raw_response(self) -> AsyncAgentexWithRawResponse: return AsyncAgentexWithRawResponse(self) @@ -634,6 +658,12 @@ def checkpoints(self) -> checkpoints.CheckpointsResourceWithRawResponse: return CheckpointsResourceWithRawResponse(self._client.checkpoints) + @cached_property + def webhooks(self) -> webhooks.WebhooksResourceWithRawResponse: + from .resources.webhooks import WebhooksResourceWithRawResponse + + return WebhooksResourceWithRawResponse(self._client.webhooks) + class AsyncAgentexWithRawResponse: _client: AsyncAgentex @@ -695,6 +725,12 @@ def checkpoints(self) -> checkpoints.AsyncCheckpointsResourceWithRawResponse: return AsyncCheckpointsResourceWithRawResponse(self._client.checkpoints) + @cached_property + def webhooks(self) -> webhooks.AsyncWebhooksResourceWithRawResponse: + from .resources.webhooks import AsyncWebhooksResourceWithRawResponse + + return AsyncWebhooksResourceWithRawResponse(self._client.webhooks) + class AgentexWithStreamedResponse: _client: Agentex @@ -756,6 +792,12 @@ def checkpoints(self) -> checkpoints.CheckpointsResourceWithStreamingResponse: return CheckpointsResourceWithStreamingResponse(self._client.checkpoints) + @cached_property + def webhooks(self) -> webhooks.WebhooksResourceWithStreamingResponse: + from .resources.webhooks import WebhooksResourceWithStreamingResponse + + return WebhooksResourceWithStreamingResponse(self._client.webhooks) + class AsyncAgentexWithStreamedResponse: _client: AsyncAgentex @@ -817,6 +859,12 @@ def checkpoints(self) -> checkpoints.AsyncCheckpointsResourceWithStreamingRespon return AsyncCheckpointsResourceWithStreamingResponse(self._client.checkpoints) + @cached_property + def webhooks(self) -> webhooks.AsyncWebhooksResourceWithStreamingResponse: + from .resources.webhooks import AsyncWebhooksResourceWithStreamingResponse + + return AsyncWebhooksResourceWithStreamingResponse(self._client.webhooks) + Client = Agentex diff --git a/src/agentex/_version.py b/src/agentex/_version.py index 551c0dbac..c567e168b 100644 --- a/src/agentex/_version.py +++ b/src/agentex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "agentex" -__version__ = "0.14.0" # x-release-please-version +__version__ = "0.15.0" # x-release-please-version diff --git a/src/agentex/lib/adk/__init__.py b/src/agentex/lib/adk/__init__.py index a08131260..e618a20d3 100644 --- a/src/agentex/lib/adk/__init__.py +++ b/src/agentex/lib/adk/__init__.py @@ -10,9 +10,18 @@ from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events from agentex.lib.adk._modules._langgraph_messages import emit_langgraph_messages from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn from agentex.lib.adk._modules._pydantic_ai_async import stream_pydantic_ai_events from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events from agentex.lib.adk._modules._pydantic_ai_tracing import create_pydantic_ai_tracing_handler +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn +from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events +from agentex.lib.adk._modules._claude_code_turn import ( + ClaudeCodeTurn, + claude_code_usage_to_turn_usage, +) +from agentex.lib.adk._modules._codex_sync import convert_codex_to_agentex_events +from agentex.lib.adk._modules._codex_turn import CodexTurn, codex_usage_to_turn_usage from agentex.lib.adk._modules.events import EventsModule from agentex.lib.adk._modules.messages import MessagesModule from agentex.lib.adk._modules.state import StateModule @@ -20,6 +29,19 @@ from agentex.lib.adk._modules.tasks import TasksModule from agentex.lib.adk._modules.tracing import TracingModule +# Unified harness surface (AGX1-375) +from agentex.lib.core.harness import ( + UnifiedEmitter, + SpanTracer, + OpenSpan, + CloseSpan, + SpanSignal, + StreamTaskMessage, + TurnUsage, + TurnResult, + HarnessTurn, +) + from agentex.lib.adk import providers from agentex.lib.adk import utils @@ -50,10 +72,30 @@ "stream_langgraph_events", "emit_langgraph_messages", "convert_langgraph_to_agentex_events", + "LangGraphTurn", # Pydantic AI "stream_pydantic_ai_events", "convert_pydantic_ai_to_agentex_events", "create_pydantic_ai_tracing_handler", + "PydanticAITurn", + # Claude Code + "convert_claude_code_to_agentex_events", + "ClaudeCodeTurn", + "claude_code_usage_to_turn_usage", + # Codex + "convert_codex_to_agentex_events", + "CodexTurn", + "codex_usage_to_turn_usage", + # Unified harness surface (AGX1-375) + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", # Providers "providers", # Utils diff --git a/src/agentex/lib/adk/_modules/_claude_code_sync.py b/src/agentex/lib/adk/_modules/_claude_code_sync.py new file mode 100644 index 000000000..4e25503cf --- /dev/null +++ b/src/agentex/lib/adk/_modules/_claude_code_sync.py @@ -0,0 +1,378 @@ +"""Claude Code stream-json parser tap for the unified harness surface. + +Converts the newline-delimited JSON envelopes emitted by +``claude -p --output-format stream-json`` into the canonical +``StreamTaskMessage*`` stream consumed by the Agentex harness. + +Envelope → canonical mapping +----------------------------- +system/init + Ignored at this layer (session_id tracking is a provider concern). + +assistant / user (content blocks) + text block → Start(TextContent) + Delta(TextDelta)* + Done + thinking block → Start(ReasoningContent) + Delta(ReasoningContentDelta)* + Done + tool_use block → Start(ToolRequestContent) + Done (Full args in Start content) + tool_result block → Full(ToolResponseContent) + +stream_event / content_block_start + type=text → Start(TextContent, empty) + type=thinking → Start(ReasoningContent, empty) + +stream_event / content_block_delta + type=text_delta → Delta(TextDelta) + type=thinking_delta → Delta(ReasoningContentDelta) + +stream_event / content_block_stop + (text open) → Done + (thinking open) → Done (full text known here; update Full via Full event first) + +result + Fires ``on_result`` with the raw envelope so the caller can capture + usage and cost. No StreamTaskMessage is emitted for the result itself. + +Out of scope +------------ +No deployable test agent is provided. claude-code requires the golden +agent's sandbox/subprocess/secret/MCP orchestration to produce the stream. +Live coverage is the golden agent, which will adopt this tap. Do NOT add an +examples/ agent or CI live-matrix row for claude-code. +""" + +from __future__ import annotations + +import json +from typing import Any, Callable, Awaitable, AsyncIterator + +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +logger = make_logger(__name__) + +_MAX_RESULT_LENGTH = 4000 + + +def _truncate(text: str) -> str: + return str(text)[:_MAX_RESULT_LENGTH] + + +def _extract_summary(text: str, max_len: int = 300) -> str: + return text.strip().split("\n", 1)[0][:max_len] + + +async def convert_claude_code_to_agentex_events( + lines: AsyncIterator[str | dict[str, Any]], + on_result: Callable[[dict[str, Any]], Awaitable[None]] | None = None, +) -> AsyncIterator[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone]: + """Convert a claude-code ``stream-json`` line stream into Agentex ``StreamTaskMessage*`` events. + + Each item in ``lines`` is either a raw JSON string (as read from the CLI's + stdout) or an already-parsed dict. Empty strings are skipped; unparseable + JSON is logged and skipped. + + ``on_result`` is called with the ``result`` envelope when it arrives so the + caller can capture usage and cost. It is awaited before the generator + continues. When ``None``, the result envelope is silently dropped. + + Envelope → canonical mapping is documented in this module's docstring. + """ + next_index = 0 + tool_call_count = 0 + + # Streaming state for content_block_start / content_block_delta / + # content_block_stop triples. + _thinking_open = False + _thinking_buf = "" + _thinking_index: int | None = None + _text_open = False + _text_buf = "" + _text_index: int | None = None + # Track which assistant-message block indices were already streamed via + # stream_event triples. Those blocks must not be re-emitted when the full + # assistant message arrives. Reset at each message boundary (see below) so a + # later turn's block indices don't collide with an earlier turn's. + _streamed_block_indexes: set[int] = set() + # Once-guard so a thinking block's pending index is claimed on its first + # thinking_delta only. Reset per turn alongside _streamed_block_indexes. + _saw_thinking_stream = False + # For deferred ReasoningStarted: if a content_block_start(thinking) arrives + # but no thinking_delta ever follows, the final assistant block's thinking + # field fills the reasoning content instead. + _pending_thinking_block_index: int | None = None + + async for raw in lines: + if not raw: + continue + + if isinstance(raw, dict): + evt = raw + else: + line = raw.strip() + if not line: + continue + try: + evt = json.loads(line) + except json.JSONDecodeError: + logger.debug("claude-code: skipping non-JSON line: %r", line[:120]) + continue + + evt_type = evt.get("type", "") + + # ----------------------------------------------------------------------- + # assistant / user — materialised content blocks + # ----------------------------------------------------------------------- + if evt_type in ("assistant", "user"): + msg = evt.get("message", {}) + blocks = msg.get("content", []) + if not isinstance(blocks, list): + blocks = [blocks] + + for idx, block in enumerate(blocks): + if not isinstance(block, dict): + continue + block_type = block.get("type", "") + + if block_type == "text": + # Skip only the specific blocks already delivered via + # stream_event deltas (per-block, not a turn-wide latch). + if idx in _streamed_block_indexes: + continue + text = block.get("text", "") + if text: + msg_index = next_index + next_index += 1 + yield StreamTaskMessageStart( + type="start", + index=msg_index, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + yield StreamTaskMessageDelta( + type="delta", + index=msg_index, + delta=TextDelta(type="text", text_delta=text), + ) + yield StreamTaskMessageDone(type="done", index=msg_index) + + elif block_type == "thinking": + # Skip only the specific blocks already delivered via + # stream_event deltas (per-block, not a turn-wide latch). + if idx in _streamed_block_indexes: + continue + thinking_text = block.get("thinking", "") + if thinking_text: + summary = _extract_summary(thinking_text) + msg_index = next_index + next_index += 1 + yield StreamTaskMessageStart( + type="start", + index=msg_index, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[summary], + content=[], + style="active", + ), + ) + yield StreamTaskMessageDelta( + type="delta", + index=msg_index, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta=thinking_text, + ), + ) + yield StreamTaskMessageDone(type="done", index=msg_index) + + elif block_type == "tool_use": + tool_call_count += 1 + tool_id = block.get("id", f"tool_{tool_call_count}") + name = block.get("name", "unknown") + arguments = block.get("input", {}) + if not isinstance(arguments, dict): + arguments = {} + msg_index = next_index + next_index += 1 + yield StreamTaskMessageStart( + type="start", + index=msg_index, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id=tool_id, + name=name, + arguments=arguments, + ), + ) + yield StreamTaskMessageDone(type="done", index=msg_index) + + elif block_type == "tool_result": + tool_id = block.get("tool_use_id", "") + content = block.get("content", "") + is_error = block.get("is_error", False) + if isinstance(content, list): + content = "\n".join(b.get("text", str(b)) if isinstance(b, dict) else str(b) for b in content) + result_str = _truncate(str(content)) + msg_index = next_index + next_index += 1 + yield StreamTaskMessageFull( + type="full", + index=msg_index, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id=tool_id, + name="", + content={"result": result_str, **({"is_error": True} if is_error else {})}, + ), + ) + + # End of a materialised message: reset per-turn streaming dedup state + # so the next turn's stream_event indices start clean. Without this, + # a block index streamed in an earlier turn would linger in the set + # and silently drop a later turn's non-streamed block at that index. + _streamed_block_indexes = set() + _saw_thinking_stream = False + + # ----------------------------------------------------------------------- + # stream_event — incremental streaming deltas + # ----------------------------------------------------------------------- + elif evt_type == "stream_event": + se = evt.get("event") or {} + se_type = se.get("type", "") + block_index = se.get("index") + + if se_type == "content_block_start": + block = se.get("content_block") or {} + btype = block.get("type") + + if btype == "thinking": + _thinking_open = True + _thinking_buf = "" + # Defer marking the block as streamed until we actually + # receive a thinking_delta. Some configurations emit a + # thinking block_start but no deltas — in that case we want + # the final assistant-message handler to fill the text. + _pending_thinking_block_index = block_index if isinstance(block_index, int) else None + msg_index = next_index + next_index += 1 + _thinking_index = msg_index + yield StreamTaskMessageStart( + type="start", + index=msg_index, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ) + + elif btype == "text": + _text_open = True + _text_buf = "" + if isinstance(block_index, int): + _streamed_block_indexes.add(block_index) + msg_index = next_index + next_index += 1 + _text_index = msg_index + yield StreamTaskMessageStart( + type="start", + index=msg_index, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + + elif se_type == "content_block_delta": + delta = se.get("delta") or {} + dtype = delta.get("type") + + if dtype == "thinking_delta": + chunk = delta.get("thinking", "") + if chunk and _thinking_open: + if not _saw_thinking_stream: + _saw_thinking_stream = True + # Now mark the block as claimed so the assistant + # message handler won't re-emit it. + if _pending_thinking_block_index is not None: + _streamed_block_indexes.add(_pending_thinking_block_index) + _thinking_buf += chunk + if _thinking_index is not None: + yield StreamTaskMessageDelta( + type="delta", + index=_thinking_index, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta=chunk, + ), + ) + + elif dtype == "text_delta": + chunk = delta.get("text", "") + if chunk and _text_open: + _text_buf += chunk + if _text_index is not None: + yield StreamTaskMessageDelta( + type="delta", + index=_text_index, + delta=TextDelta(type="text", text_delta=chunk), + ) + + elif se_type == "content_block_stop": + if _thinking_open: + _thinking_open = False + _thinking_buf = "" + _pending_thinking_block_index = None + # Reset the once-guard per thinking block: a turn can stream a + # second thinking block, and without this the guard stays True, + # the second block's index is never claimed, and the final + # assistant envelope re-emits it (duplicate Start/Delta/Done). + _saw_thinking_stream = False + if _thinking_index is not None: + yield StreamTaskMessageDone(type="done", index=_thinking_index) + _thinking_index = None + elif _text_open: + _text_open = False + _text_buf = "" + if _text_index is not None: + yield StreamTaskMessageDone(type="done", index=_text_index) + _text_index = None + + # ----------------------------------------------------------------------- + # system / init — session metadata (ignored at this layer) + # ----------------------------------------------------------------------- + elif evt_type == "system": + # Session ID tracking and MCP status logging are provider concerns. + # This pure parser layer intentionally emits nothing for system events. + pass + + # ----------------------------------------------------------------------- + # result — carries usage + cost; fired to on_result, not emitted as msgs + # ----------------------------------------------------------------------- + elif evt_type == "result": + if on_result is not None: + await on_result(evt) + + else: + logger.debug("claude-code: unhandled envelope type %r", evt_type) diff --git a/src/agentex/lib/adk/_modules/_claude_code_turn.py b/src/agentex/lib/adk/_modules/_claude_code_turn.py new file mode 100644 index 000000000..6c052976a --- /dev/null +++ b/src/agentex/lib/adk/_modules/_claude_code_turn.py @@ -0,0 +1,161 @@ +"""ClaudeCodeTurn — HarnessTurn implementation for the claude-code tap. + +Wraps ``convert_claude_code_to_agentex_events`` to implement the +``HarnessTurn`` protocol: exposes ``events`` (the canonical +``StreamTaskMessage*`` stream) and ``usage()`` (the normalised +``TurnUsage``, populated after the stream is exhausted). + +Usage normalization +------------------- +Claude Code's ``result`` envelope carries usage under several key shapes +depending on the CLI version. We defensive-map all known shapes: + + result.usage.input_tokens -> input_tokens + result.usage.output_tokens -> output_tokens + result.usage.cache_read_input_tokens + result.usage.cache_creation_input_tokens -> cached_input_tokens (sum) + result.cost_usd / result.total_cost_usd -> cost_usd + result.duration_ms -> duration_ms + result.num_turns -> num_llm_calls + +Real zeros are preserved; missing keys default to ``None`` (not zero) so +downstream consumers can distinguish "not reported" from "zero". + +Out of scope: no deployable test agent is provided — see module docstring +in ``_claude_code_sync.py``. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.lib.core.harness.types import TurnUsage, HarnessTurn, StreamTaskMessage +from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events + + +def claude_code_usage_to_turn_usage(result_envelope: dict[str, Any]) -> TurnUsage: + """Map a claude-code ``result`` envelope to a canonical ``TurnUsage``. + + Defensively handles missing / None values. Real zeros are preserved. + ``cost_usd`` checks both ``cost_usd`` and ``total_cost_usd`` keys (the + CLI has used both across versions). + ``cached_input_tokens`` accumulates cache_read and cache_creation counts + since both represent tokens served from the prompt cache. + """ + usage_raw: dict[str, Any] = result_envelope.get("usage") or {} + + def _int(d: dict[str, Any], key: str) -> int | None: + v = d.get(key) + if v is None: + return None + try: + return int(v) + except (TypeError, ValueError): + return None + + def _float(d: dict[str, Any], *keys: str) -> float | None: + for key in keys: + v = d.get(key) + if v is not None: + try: + return float(v) + except (TypeError, ValueError): + continue + return None + + input_tokens = _int(usage_raw, "input_tokens") + output_tokens = _int(usage_raw, "output_tokens") + + # Aggregate both cache_read and cache_creation into cached_input_tokens + cache_read = _int(usage_raw, "cache_read_input_tokens") + cache_creation = _int(usage_raw, "cache_creation_input_tokens") + if cache_read is not None or cache_creation is not None: + cached_input_tokens = (cache_read or 0) + (cache_creation or 0) + else: + cached_input_tokens = None + + total_tokens: int | None = None + if input_tokens is not None and output_tokens is not None: + total_tokens = input_tokens + output_tokens + + cost_usd = _float(result_envelope, "cost_usd", "total_cost_usd") + duration_ms = _int(result_envelope, "duration_ms") + + # num_llm_calls is provider-reported (from num_turns): default None ("not + # reported") rather than 0 so callers can distinguish it from a real zero, + # matching the None convention used for the token fields above. + num_turns = result_envelope.get("num_turns") + num_llm_calls: int | None = None + if num_turns is not None: + try: + num_llm_calls = int(num_turns) + except (TypeError, ValueError): + pass + + return TurnUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + cached_input_tokens=cached_input_tokens, + total_tokens=total_tokens, + cost_usd=cost_usd, + duration_ms=duration_ms, + num_llm_calls=num_llm_calls, + ) + + +class ClaudeCodeTurn: + """HarnessTurn for a claude-code ``stream-json`` line stream. + + Satisfies the ``HarnessTurn`` protocol: + - ``events`` yields the canonical ``StreamTaskMessage*`` stream. + - ``usage()`` returns the normalised ``TurnUsage`` (only valid after + ``events`` is fully consumed). + + ``lines`` is an async iterator of raw JSON strings or pre-parsed dicts, as + produced by reading the claude-code CLI's stdout line by line. + """ + + def __init__(self, lines: AsyncIterator[str | dict[str, Any]]) -> None: + self._lines = lines + self._result_envelope: dict[str, Any] | None = None + self._events_stream: AsyncIterator[StreamTaskMessage] | None = None + + async def _on_result(self, envelope: dict[str, Any]) -> None: + self._result_envelope = envelope + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + if self._events_stream is None: + self._events_stream = convert_claude_code_to_agentex_events( + self._lines, + on_result=self._on_result, + ) + return self._events_stream + + @property + def session_id(self) -> str | None: + """The Claude Code session id, for resuming a multi-turn session. + + Valid only after ``events`` has been fully consumed (populated by the + ``result`` envelope). Returns ``None`` if the stream was truncated or + Claude Code reported no session id. + """ + if not self._result_envelope: + return None + return self._result_envelope.get("session_id") + + def usage(self) -> TurnUsage: + """Return normalised usage for this turn. + + Call only after ``events`` is exhausted. Returns an empty ``TurnUsage`` + if the ``result`` envelope was not received (e.g. stream was truncated). + """ + if self._result_envelope is None: + return TurnUsage() + return claude_code_usage_to_turn_usage(self._result_envelope) + + +# Runtime assert that ClaudeCodeTurn satisfies HarnessTurn protocol +assert isinstance(ClaudeCodeTurn.__new__(ClaudeCodeTurn), HarnessTurn), ( + "ClaudeCodeTurn must satisfy the HarnessTurn protocol" +) diff --git a/src/agentex/lib/adk/_modules/_codex_sync.py b/src/agentex/lib/adk/_modules/_codex_sync.py new file mode 100644 index 000000000..5a951d57e --- /dev/null +++ b/src/agentex/lib/adk/_modules/_codex_sync.py @@ -0,0 +1,601 @@ +"""Codex event-stream parser tap for the unified harness surface. + +Converts a ``codex exec --json`` newline-delimited event stream (already +produced by the golden agent's sandbox/subprocess orchestration) into the +Agentex canonical ``StreamTaskMessage*`` events. + +SCOPE +----- +This module is a **pure parser**. It receives pre-produced codex events +(``str`` lines or already-decoded ``dict`` objects) and yields canonical +``StreamTaskMessage*`` events. All subprocess management, sandbox +provisioning, secret injection, and MCP orchestration remain in the golden +agent at +``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. + +No deployable test agent is included here: running codex requires the +golden agent's sandbox environment and is out of scope for this library tap. + +OUT OF SCOPE (document here so future callers are not surprised): +- Subprocess / sandbox management +- OPENAI_API_KEY / secret injection +- MCP server configuration (--config /tmp/codex_config.toml) +- ``codex exec resume`` session tracking +- ``scale_sandbox`` imports + +CANONICAL MAPPING +----------------- +The table below lists every ``type`` field the codex exec JSON stream can +emit (from ``codex-rs/exec/src/exec_events.rs``) and its mapping. + +Top-level event types +~~~~~~~~~~~~~~~~~~~~~ + thread.started -> (no StreamTaskMessage; session_id captured + internally; surfaced via ``on_result`` callback) + turn.started -> (no StreamTaskMessage; turn was started before + codex launched; nothing to emit here) + turn.completed -> on_result(usage_dict, tool_count, reasoning_count) + yields no StreamTaskMessage (turn lifecycle is + managed by the activity layer) + turn.failed -> StreamTaskMessageFull(TextContent, error text) + error -> StreamTaskMessageFull(TextContent, error text) + +Item sub-types (item.started / item.updated / item.completed) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + agent_message -> text deltas: + item.started / item.updated -> StreamTaskMessageDelta(TextDelta) + item.completed -> StreamTaskMessageDone + reasoning -> reasoning: + item.started -> StreamTaskMessageStart(ReasoningContent) + item.updated -> (no-op; final text arrives on completed) + item.completed -> StreamTaskMessageDelta(ReasoningSummaryDelta) + + StreamTaskMessageDelta(ReasoningContentDelta) + + StreamTaskMessageDone + command_execution -> tool request + response: + item.started -> StreamTaskMessageStart(ToolRequestContent) + + StreamTaskMessageDone + item.completed -> StreamTaskMessageFull(ToolResponseContent) + file_change -> same as command_execution + NOTE: file_change may only emit item.completed (no started); + a synthetic ToolRequestContent Full is emitted before the response. + mcp_tool_call -> same as command_execution + web_search -> same as command_execution + todo_list -> same as command_execution + collab_tool_call -> same as command_execution + error (item type) -> StreamTaskMessageFull(TextContent, error text) on completed only + +UNMAPPED / PARTIALLY MAPPED EVENTS +----------------------------------- + thread.started: session_id is extracted but not forwarded as a + StreamTaskMessage (no canonical content type for + session-lifecycle signals; captured in on_result). + turn.started: no-op; intentional (the caller owns turn lifecycle). + turn.completed: no StreamTaskMessage; usage is forwarded via + on_result so the caller can record it in a span + without this module needing to know about spans. + item.updated (reasoning): the intermediate cumulative text is discarded; + only item.completed carries the final text. + item.updated (tool): tool item types other than agent_message do not + emit updates; item.started opens the request and + item.completed closes it. +""" + +from __future__ import annotations + +import json +from typing import Any, Callable, AsyncIterator + +from agentex.lib.utils.logging import make_logger +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.task_message_content import TextContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta +from agentex.types.reasoning_summary_delta import ReasoningSummaryDelta + +logger = make_logger(__name__) + +# Canonical type alias matching the unified harness surface. +StreamTaskMessage = StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone + +_MAX_RESULT_LENGTH = 4000 + + +def _truncate(text: str, max_len: int = _MAX_RESULT_LENGTH) -> str: + return str(text)[:max_len] + + +def _tool_name_for(item_type: str, payload: dict[str, Any]) -> str: + """Derive a canonical tool name from a codex item type.""" + if item_type == "command_execution": + return "bash" + if item_type == "file_change": + return "file_change" + if item_type == "mcp_tool_call": + server = payload.get("server", "") + tool = payload.get("tool", "") + return f"{server}.{tool}" if (server or tool) else "mcp_tool_call" + if item_type == "web_search": + return "web_search" + if item_type == "todo_list": + return "todo_list" + if item_type == "collab_tool_call": + return "collab_tool_call" + return item_type or "unknown" + + +def _tool_args_for(item_type: str, payload: dict[str, Any]) -> dict[str, Any]: + """Extract canonical arguments dict from a codex item payload.""" + if item_type == "command_execution": + return {"command": payload.get("command", "")} + if item_type == "file_change": + return {"changes": payload.get("changes") or []} + if item_type == "mcp_tool_call": + args = payload.get("arguments") + return args if isinstance(args, dict) else {"value": args} + if item_type == "web_search": + return {"query": payload.get("query", "")} + if item_type == "todo_list": + return {"items": payload.get("items") or []} + if item_type == "collab_tool_call": + # Surface an arguments dict if the payload carries one (mirrors + # mcp_tool_call); otherwise no args rather than fabricating a shape. + args = payload.get("arguments") + return args if isinstance(args, dict) else {} + return {} + + +def _tool_output_for(item_type: str, payload: dict[str, Any]) -> tuple[str, bool]: + """Extract (result_text, is_error) from a completed codex tool item.""" + if item_type == "command_execution": + out = payload.get("aggregated_output") or "" + exit_code = payload.get("exit_code") + is_error = exit_code is not None and exit_code != 0 + return _truncate(out), is_error + if item_type in ("mcp_tool_call", "collab_tool_call"): + # collab_tool_call mirrors mcp_tool_call's error/result convention + # (see _tool_args_for); without this branch a failed collab call would + # fall through to the generic path and be reported as a success. + err = payload.get("error") + if err: + msg = err.get("message", "") if isinstance(err, dict) else str(err) + return _truncate(f"Error: {msg}"), True + result = payload.get("result") + if result is None: + return "", False + try: + return _truncate(json.dumps(result)), False + except (TypeError, ValueError): + return _truncate(str(result)), False + if item_type == "file_change": + changes = payload.get("changes") or [] + status = payload.get("status", "") + return f"status={status}, {len(changes)} changes", status == "failed" + try: + return _truncate(json.dumps(payload, default=str)), False + except (TypeError, ValueError): + return _truncate(str(payload)), False + + +def _error_full(message: str, next_index: int) -> StreamTaskMessageFull: + """Emit a one-shot TextContent full message for an error.""" + return StreamTaskMessageFull( + type="full", + index=next_index, + content=TextContent( + type="text", + author="agent", + content=f"Error: {message}", + format="plain", + ), + ) + + +class _CodexStreamProcessor: + """Stateful parser: consumes codex exec events, yields StreamTaskMessage*. + + Ported from the golden agent's ``_CodexEventProcessor`` in + ``project/harness/providers/codex.py``, adapted to yield + ``StreamTaskMessage*`` directly instead of ``HarnessEvent`` objects. + + State tracked: + - ``_next_index``: monotonically increasing message index. + - ``_text_index``: message index of the current open agent_message block. + - ``_text_accumulated``: cumulative text per agent_message item_id. + - ``_reasoning_index``: message index of the current open reasoning block. + - ``_reasoning_text``: latest cumulative reasoning text per item_id. + - ``_tool_open``: item_ids for which a ToolRequestContent Start was emitted + but no ToolResponseContent Full yet. + - ``_tool_item_types``: item_id -> item_type for open tool calls. + """ + + def __init__(self) -> None: + self._next_index: int = 0 + + # agent_message tracking + self._text_index: dict[str, int] = {} + self._text_accumulated: dict[str, str] = {} + + # reasoning tracking + self._reasoning_index: dict[str, int] = {} + self._reasoning_text: dict[str, str] = {} + + # tool tracking + self._tool_open: set[str] = set() + self._tool_item_types: dict[str, str] = {} + # Remember the tool_call_id assigned per item so the request and response + # halves agree even when item_id is empty (a recomputed fallback would + # drift as tool_call_count advances between started and completed). + self._tool_call_ids: dict[str, str] = {} + + # counters for on_result callback + self.tool_call_count: int = 0 + self.reasoning_count: int = 0 + self.session_id: str | None = None + + def _alloc(self) -> int: + idx = self._next_index + self._next_index += 1 + return idx + + def process(self, evt: dict[str, Any]) -> list[StreamTaskMessage]: + evt_type = evt.get("type", "") + + if evt_type == "thread.started": + sid = evt.get("thread_id") or "" + if sid: + self.session_id = sid + return [] + + if evt_type == "turn.started": + # The activity layer owns turn lifecycle; nothing to emit. + return [] + + if evt_type == "turn.completed": + # Usage forwarded via on_result callback (not a StreamTaskMessage). + return [] + + if evt_type == "turn.failed": + err = evt.get("error") or {} + msg = err.get("message", "codex turn failed") if isinstance(err, dict) else str(err) + return [_error_full(f"Codex turn failed: {msg}", self._alloc())] + + if evt_type == "error": + return [_error_full(evt.get("message", "codex error"), self._alloc())] + + if evt_type in ("item.started", "item.updated", "item.completed"): + item = evt.get("item") or {} + return self._handle_item(evt_type, item) + + logger.debug("[codex] unhandled event type=%s", evt_type) + return [] + + def _handle_item(self, evt_type: str, item: dict[str, Any]) -> list[StreamTaskMessage]: + item_id = item.get("id") or "" + item_type = item.get("type") or "" + out: list[StreamTaskMessage] = [] + + if item_type == "agent_message": + current = item.get("text") or "" + previous = self._text_accumulated.get(item_id, "") + + if evt_type in ("item.started", "item.updated"): + if item_id not in self._text_index: + idx = self._alloc() + self._text_index[item_id] = idx + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + ) + idx = self._text_index[item_id] + delta = "" + if current.startswith(previous) and len(current) > len(previous): + delta = current[len(previous) :] + elif current and current != previous: + delta = current + if delta: + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=TextDelta(type="text", text_delta=delta), + ) + ) + self._text_accumulated[item_id] = current + + elif evt_type == "item.completed": + if item_id not in self._text_index: + idx = self._alloc() + self._text_index[item_id] = idx + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + ) + idx = self._text_index[item_id] + delta = "" + if current.startswith(previous) and len(current) > len(previous): + delta = current[len(previous) :] + elif current and current != previous: + delta = current + if delta: + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=TextDelta(type="text", text_delta=delta), + ) + ) + out.append(StreamTaskMessageDone(type="done", index=idx)) + self._text_accumulated[item_id] = current + + elif item_type == "reasoning": + current = item.get("text") or "" + + if evt_type == "item.started": + idx = self._alloc() + self._reasoning_index[item_id] = idx + self._reasoning_text[item_id] = current + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ) + ) + + elif evt_type == "item.updated": + # Accumulate silently; final text arrives on item.completed. + self._reasoning_text[item_id] = current + + elif evt_type == "item.completed": + text = current or self._reasoning_text.get(item_id, "") + idx = self._reasoning_index.get(item_id) + if text: + self.reasoning_count += 1 + summary = text.strip().split("\n", 1)[0][:300] + if idx is None: + # No started event was seen; open the message now. + idx = self._alloc() + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ) + ) + # Deliver the reasoning as deltas, then close with a Done. + # Emitting a Full here instead would leave the open Start + # context dangling: auto_send routes Full into its own + # throwaway streaming context (ignoring the index), so the + # Start context survives until end-of-turn teardown and + # persists a second, near-empty reasoning message. Streaming + # the content as deltas lets the open context accumulate the + # final ReasoningContent and close cleanly as one message. + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=ReasoningSummaryDelta( + type="reasoning_summary", + summary_index=0, + summary_delta=summary, + ), + ) + ) + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta=text, + ), + ) + ) + out.append(StreamTaskMessageDone(type="done", index=idx)) + elif idx is not None: + # Empty reasoning block — still need to close with a Done. + out.append(StreamTaskMessageDone(type="done", index=idx)) + + elif item_type in ( + "command_execution", + "file_change", + "mcp_tool_call", + "web_search", + "todo_list", + "collab_tool_call", + ): + # Resolve a stable id once per item; reuse it for both halves. + tool_call_id = self._tool_call_ids.get(item_id) + if tool_call_id is None: + tool_call_id = item_id or f"codex_tool_{self.tool_call_count + 1}" + self._tool_call_ids[item_id] = tool_call_id + + if evt_type == "item.started": + self.tool_call_count += 1 + self._tool_open.add(item_id) + self._tool_item_types[item_id] = item_type + name = _tool_name_for(item_type, item) + args = _tool_args_for(item_type, item) + req_idx = self._alloc() + out.append( + StreamTaskMessageStart( + type="start", + index=req_idx, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id=tool_call_id, + name=name, + arguments=args, + ), + ) + ) + out.append(StreamTaskMessageDone(type="done", index=req_idx)) + + elif evt_type == "item.completed": + # file_change items may only emit item.completed (no started). + if item_id not in self._tool_open: + self.tool_call_count += 1 + self._tool_open.add(item_id) + self._tool_item_types[item_id] = item_type + name = _tool_name_for(item_type, item) + args = _tool_args_for(item_type, item) + req_idx = self._alloc() + out.append( + StreamTaskMessageFull( + type="full", + index=req_idx, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id=tool_call_id, + name=name, + arguments=args, + ), + ) + ) + + actual_type = self._tool_item_types.get(item_id, item_type) + result_text, is_error = _tool_output_for(actual_type, item) + name = _tool_name_for(actual_type, item) + resp_content: dict[str, Any] = {"result": result_text} + if is_error: + resp_content["is_error"] = True + out.append( + StreamTaskMessageFull( + type="full", + index=self._alloc(), + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id=tool_call_id, + name=name, + content=resp_content, + ), + ) + ) + self._tool_open.discard(item_id) + # Free the id mapping so a later item reusing an empty id gets a + # fresh fallback rather than colliding with this one. + self._tool_call_ids.pop(item_id, None) + + elif item_type == "error": + if evt_type == "item.completed": + out.append(_error_full(item.get("message", "codex item error"), self._alloc())) + + else: + logger.debug("[codex] unhandled item type=%s evt=%s", item_type, evt_type) + + return out + + +async def convert_codex_to_agentex_events( + events: AsyncIterator[str | dict[str, Any]], + on_result: Callable[[dict[str, Any]], None] | None = None, +) -> AsyncIterator[StreamTaskMessage]: + """Convert a ``codex exec --json`` event stream into Agentex stream events. + + This is a pure parser tap. The caller must supply ``events`` as an async + iterator of either raw newline-delimited JSON strings or pre-decoded dicts. + No subprocess or sandbox management is done here. + + Args: + events: Async iterator of ``str`` (newline-delimited JSON lines) or + ``dict`` (pre-decoded event objects) as produced by the codex CLI's + ``--json`` flag via sandbox stdout. + on_result: Optional callback invoked once when a ``turn.completed`` + event is seen. Receives a dict with keys: + ``usage`` — the raw codex usage dict (or None) + ``session_id`` — the codex thread_id (or None) + ``tool_call_count`` — int + ``reasoning_count`` — int + Use this to record turn-level metrics / usage in the caller's span + without coupling this module to span/tracing APIs. + + Yields: + Canonical ``StreamTaskMessage*`` events (Start/Delta/Full/Done) with + ``TextContent``, ``ReasoningContent``, ``ToolRequestContent``, or + ``ToolResponseContent`` payloads. + + MAPPING (abbreviated — see module docstring for the full table) + thread.started -> no event; session_id captured for on_result + turn.started -> no event + turn.completed -> no event; triggers on_result callback + turn.failed / error -> StreamTaskMessageFull(TextContent, error) + agent_message -> Start + Deltas + Done + reasoning -> Start + Full(ReasoningContent) + command_execution -> Start(ToolRequest)+Done + Full(ToolResponse) + file_change -> Full(ToolRequest) + Full(ToolResponse) + mcp_tool_call -> Start(ToolRequest)+Done + Full(ToolResponse) + web_search / todo_list -> Start(ToolRequest)+Done + Full(ToolResponse) + collab_tool_call -> Start(ToolRequest)+Done + Full(ToolResponse) + """ + processor = _CodexStreamProcessor() + _pending_usage: dict[str, Any] | None = None + + async for raw in events: + if isinstance(raw, dict): + evt = raw + else: + line = raw.strip() if isinstance(raw, str) else "" + if not line: + continue + try: + evt = json.loads(line) + except json.JSONDecodeError: + logger.debug("[codex] non-JSON line: %s", line[:100]) + continue + + # Capture usage before processing so on_result can fire after flush. + if evt.get("type") == "turn.completed": + usage = evt.get("usage") + _pending_usage = usage if isinstance(usage, dict) else None + + messages = processor.process(evt) + for msg in messages: + yield msg + + if on_result is not None: + on_result( + { + "usage": _pending_usage, + "session_id": processor.session_id, + "tool_call_count": processor.tool_call_count, + "reasoning_count": processor.reasoning_count, + } + ) diff --git a/src/agentex/lib/adk/_modules/_codex_turn.py b/src/agentex/lib/adk/_modules/_codex_turn.py new file mode 100644 index 000000000..e7fa1d929 --- /dev/null +++ b/src/agentex/lib/adk/_modules/_codex_turn.py @@ -0,0 +1,214 @@ +"""CodexTurn: HarnessTurn implementation for the codex event-stream tap. + +Wraps ``convert_codex_to_agentex_events`` so callers can pass a ``CodexTurn`` +directly to ``UnifiedEmitter.yield_turn`` or ``UnifiedEmitter.auto_send_turn``. + +Usage:: + + from agentex.lib.adk import convert_codex_to_agentex_events + from agentex.lib.adk._modules._codex_turn import CodexTurn, codex_usage_to_turn_usage + + turn = CodexTurn(events=codex_event_stream, model="o4-mini") + async for msg in emitter.yield_turn(turn): + yield msg + turn_usage = turn.usage() + +OUT OF SCOPE +------------ +Like ``_codex_sync``, this module is a pure library tap. Subprocess +provisioning, sandbox setup, secret injection, and MCP configuration remain +in the golden agent (``teams/sgp/agents/golden_agent/project/harness/``). +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.adk._modules._codex_sync import ( + StreamTaskMessage, + convert_codex_to_agentex_events, +) + + +def codex_usage_to_turn_usage( + raw: dict[str, Any] | None, + *, + model: str | None = None, + tool_call_count: int = 0, + reasoning_count: int = 0, + duration_ms: int | None = None, + cost_usd: float | None = None, +) -> TurnUsage: + """Map a raw codex ``turn.completed`` usage dict to a canonical ``TurnUsage``. + + Codex reports token usage under the ``usage`` key of the + ``turn.completed`` event. The shape follows the OpenAI completion_tokens + convention because codex is built on OpenAI models: + + .. code-block:: json + + { + "input_tokens": 1234, + "output_tokens": 456, + "total_tokens": 1690 + } + + Additionally, codex may report ``reasoning_tokens`` for o-series models: + + .. code-block:: json + + { + "input_tokens": 1234, + "output_tokens": 456, + "reasoning_tokens": 200, + "total_tokens": 1690 + } + + Defensive rules: + - Missing ``raw`` or missing sub-keys default to ``None`` (not zero) so + downstream callers can distinguish "not reported" from "reported as 0". + - Real zeros (``0`` explicitly present in ``raw``) are preserved as ``0``. + - ``total_tokens`` is accepted from the payload or left as ``None``; + callers should not recompute it because codex may use cached tokens. + - ``cost_usd`` is passed through when codex reports it (not yet common); + defaults to ``None`` if absent. + + Args: + raw: The raw codex usage dict from ``turn.completed``, or ``None``. + model: Model string (e.g. "o4-mini") to attach to the usage record. + tool_call_count: Number of tool calls in the turn (from processor). + reasoning_count: Number of reasoning blocks (from processor). + duration_ms: Wall-clock duration of the turn in milliseconds. + cost_usd: Cost in USD if the caller can derive it; ``None`` otherwise. + + Returns: + A populated ``TurnUsage`` instance. + """ + if not isinstance(raw, dict): + raw = {} + + def _int_or_none(key: str) -> int | None: + val = raw.get(key) + if val is None: + return None + try: + return int(val) + except (TypeError, ValueError): + return None + + def _float_or_none(key: str) -> float | None: + val = raw.get(key) + if val is None: + return None + try: + return float(val) + except (TypeError, ValueError): + return None + + # cost_usd: prefer explicitly passed value, then fall back to raw payload. + effective_cost = cost_usd if cost_usd is not None else _float_or_none("cost_usd") + + return TurnUsage( + model=model or None, + input_tokens=_int_or_none("input_tokens"), + output_tokens=_int_or_none("output_tokens"), + cached_input_tokens=_int_or_none("cached_input_tokens"), + reasoning_tokens=_int_or_none("reasoning_tokens"), + total_tokens=_int_or_none("total_tokens"), + cost_usd=effective_cost, + duration_ms=duration_ms, + num_llm_calls=1, + num_tool_calls=tool_call_count, + num_reasoning_blocks=reasoning_count, + ) + + +class CodexTurn: + """A single codex turn as a ``HarnessTurn``. + + Implements the ``HarnessTurn`` protocol so it can be passed to + ``UnifiedEmitter.yield_turn`` and ``UnifiedEmitter.auto_send_turn``. + + ``usage()`` is valid only after ``events`` has been fully consumed (i.e. + the async generator has been exhausted). Calling ``usage()`` before + exhaustion returns a zero-value ``TurnUsage`` with only ``model`` set. + + Args: + events: An async iterator of ``str | dict`` codex events, as + produced by reading ``codex exec --json`` stdout line by line. + model: Model string to attach to the ``TurnUsage``. + duration_ms: Optional turn wall-clock duration in milliseconds. + cost_usd: Optional cost in USD; ``None`` if not known. + """ + + def __init__( + self, + events: AsyncIterator[str | dict[str, Any]], + *, + model: str | None = None, + duration_ms: int | None = None, + cost_usd: float | None = None, + ) -> None: + self._raw_events = events + self._model = model + # Public + mutable: the true wall-clock duration (and cost) is usually + # only known after the stream is consumed, so callers may set these + # after construction and before calling usage(). + self.duration_ms = duration_ms + self.cost_usd = cost_usd + + # Populated by the on_result callback once the stream is exhausted. + self._result: dict[str, Any] | None = None + # The events generator is created at most once: ``_raw_events`` is a + # single-consumption AsyncIterator, so re-wrapping it would yield an + # already-exhausted stream that fires on_result with zeros and clobbers + # ``_result``. Cache the generator and hand back the same instance. + self._events_gen: AsyncIterator[StreamTaskMessage] | None = None + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + """Async iterator of canonical ``StreamTaskMessage*`` events. + + The ``on_result`` callback populates ``_result`` when the underlying + codex stream ends, so ``usage()`` returns meaningful data after + exhaustion. Returns the same generator on every access so the underlying + stream is consumed (and ``on_result`` fires) exactly once. + """ + if self._events_gen is None: + self._events_gen = convert_codex_to_agentex_events( + self._raw_events, + on_result=self._on_result, + ) + return self._events_gen + + def _on_result(self, result: dict[str, Any]) -> None: + self._result = result + + @property + def session_id(self) -> str | None: + """The codex session id, for resuming a multi-turn session. + + Valid only after ``events`` has been fully consumed (populated by the + ``on_result`` callback). Returns ``None`` if the stream is not yet + exhausted or codex reported no session id. + """ + return self._result.get("session_id") if self._result else None + + def usage(self) -> TurnUsage: + """Return normalized ``TurnUsage`` for this turn. + + Valid only after ``events`` has been fully consumed. Returns a + zero-value ``TurnUsage`` (model set, counts zero, tokens None) if + called before the stream ends. + """ + if self._result is None: + return TurnUsage(model=self._model) + return codex_usage_to_turn_usage( + self._result.get("usage"), + model=self._model, + tool_call_count=self._result.get("tool_call_count", 0), + reasoning_count=self._result.get("reasoning_count", 0), + duration_ms=self.duration_ms, + cost_usd=self.cost_usd, + ) diff --git a/src/agentex/lib/adk/_modules/_langgraph_async.py b/src/agentex/lib/adk/_modules/_langgraph_async.py index 3e61c42f9..02ef059eb 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_async.py +++ b/src/agentex/lib/adk/_modules/_langgraph_async.py @@ -3,8 +3,21 @@ Converts LangGraph graph.astream() events into Agentex streaming updates and pushes them to Redis via adk.streaming contexts. For use with async ACP agents that stream via Redis rather than HTTP yields. + +Unified surface +--------------- +This module is now implemented on top of ``LangGraphTurn`` and +``UnifiedEmitter.auto_send_turn``, the same surface used by every other +harness adapter (pydantic-ai, openai-agents, etc.). The public signature +and return type are preserved identically. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. """ +from agentex.lib.utils.temporal import workflow_now_if_in_workflow + async def stream_langgraph_events(stream, task_id: str) -> str: """Stream LangGraph events to Agentex via Redis. @@ -18,6 +31,19 @@ async def stream_langgraph_events(stream, task_id: str) -> str: models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks in the Responses API responses/v1 format). + Reimplemented on ``UnifiedEmitter.auto_send_turn(LangGraphTurn(...))`` for + cross-harness consistency. Behavior is identical to the previous bespoke + implementation (verified by characterization tests in test_langgraph_async.py). + + AGX1-377 note: LangGraph emits tool requests as ``Full`` events (from "updates"), + NOT Start+Delta+Done like pydantic-ai. ``auto_send`` handles Full events + correctly; no coalescing wrapper is needed. + + AGX1-378 note: ``created_at`` is set from ``workflow.now()`` when called inside a + Temporal workflow, matching the pattern used by the openai/litellm providers. + Outside a workflow (plain async activities, sync agents) it is ``None`` and the + server's wall clock is used. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) task_id: The Agentex task ID to stream messages to. @@ -25,178 +51,15 @@ async def stream_langgraph_events(stream, task_id: str) -> str: Returns: The accumulated final text output from the agent. """ - # Lazy imports so langgraph/langchain aren't required at module load time - from langchain_core.messages import ToolMessage, AIMessageChunk - - from agentex.lib import adk - from agentex.types.text_content import TextContent - from agentex.types.reasoning_content import ReasoningContent - from agentex.types.task_message_delta import TextDelta - from agentex.types.task_message_update import StreamTaskMessageDelta - from agentex.types.tool_request_content import ToolRequestContent - from agentex.types.tool_response_content import ToolResponseContent - from agentex.types.reasoning_summary_delta import ReasoningSummaryDelta - - text_context = None - reasoning_context = None - final_text = "" - - try: - async for event_type, event_data in stream: - if event_type == "messages": - chunk, metadata = event_data - - if not isinstance(chunk, AIMessageChunk) or not chunk.content: - continue - - # ---------------------------------------------------------- - # Case 1: content is a plain string (regular models) - # ---------------------------------------------------------- - if isinstance(chunk.content, str): - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += chunk.content - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=chunk.content), - type="delta", - ) - ) - - # ---------------------------------------------------------- - # Case 2: content is a list of typed blocks (reasoning models) - # Responses API (responses/v1) format: - # {"type": "reasoning", "summary": [{"type": "summary_text", "text": "..."}]} - # {"type": "text", "text": "..."} - # ---------------------------------------------------------- - elif isinstance(chunk.content, list): - for block in chunk.content: - if not isinstance(block, dict): - continue - - block_type = block.get("type") - - if block_type == "reasoning": - reasoning_text = "" - for s in block.get("summary", []): - if isinstance(s, dict) and s.get("type") == "summary_text": - reasoning_text += s.get("text", "") - if not reasoning_text: - continue - - if text_context: - await text_context.close() - text_context = None - - if not reasoning_context: - reasoning_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - ).__aenter__() - - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningSummaryDelta( - type="reasoning_summary", - summary_index=0, - summary_delta=reasoning_text, - ), - type="delta", - ) - ) - - elif block_type == "text": - text_delta = block.get("text", "") - if not text_delta: - continue - - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += text_delta - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=text_delta), - type="delta", - ) - ) - - elif event_type == "updates": - for node_name, state_update in event_data.items(): - if node_name == "agent": - messages = state_update.get("messages", []) - for msg in messages: - if text_context: - await text_context.close() - text_context = None - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if hasattr(msg, "tool_calls") and msg.tool_calls: - for tc in msg.tool_calls: - await adk.messages.create( - task_id=task_id, - content=ToolRequestContent( - tool_call_id=tc["id"], - name=tc["name"], - arguments=tc["args"], - author="agent", - ), - ) - - elif node_name == "tools": - messages = state_update.get("messages", []) - for msg in messages: - if isinstance(msg, ToolMessage): - await adk.messages.create( - task_id=task_id, - content=ToolResponseContent( - tool_call_id=msg.tool_call_id, - name=msg.name or "unknown", - content=msg.content if isinstance(msg.content, str) else str(msg.content), - author="agent", - ), - ) - finally: - # Always close open contexts - if text_context: - await text_context.close() - if reasoning_context: - await reasoning_context.close() - - return final_text + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + # AGX1-377 note: LangGraph emits tool requests as Full events (from "updates"), + # NOT Start+Delta+Done like pydantic-ai. auto_send handles Full events correctly; + # no coalescing wrapper is needed. + # AGX1-378: stamp messages with workflow.now() inside Temporal for deterministic + # created_at ordering; falls back to None (server wall clock) outside a workflow. + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter(task_id=task_id, trace_id=None, parent_span_id=None) + result = await emitter.auto_send_turn(turn, created_at=workflow_now_if_in_workflow()) + return result.final_text diff --git a/src/agentex/lib/adk/_modules/_langgraph_sync.py b/src/agentex/lib/adk/_modules/_langgraph_sync.py index 6d4ce715f..48231a87d 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_sync.py +++ b/src/agentex/lib/adk/_modules/_langgraph_sync.py @@ -3,10 +3,36 @@ Converts LangGraph graph.astream() events into Agentex TaskMessageUpdate events that are yielded back over the HTTP response. For use with sync ACP agents that stream via HTTP yields rather than Redis. + +Unified sync path +----------------- +Prefer using ``LangGraphTurn`` with ``UnifiedEmitter.yield_turn`` for new +agents, which adds usage capture and optional tracing via the shared harness +surface:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + async for event in emitter.yield_turn(turn): + yield event + +``convert_langgraph_to_agentex_events`` remains available as a lower-level +primitive (e.g. for callers that need the raw event stream without the +harness envelope). """ +from __future__ import annotations + +from typing import Any, Callable, Optional +from collections.abc import AsyncGenerator + -async def convert_langgraph_to_agentex_events(stream): +async def convert_langgraph_to_agentex_events( + stream: Any, + on_final_ai_message: Optional[Callable[..., None]] = None, +) -> AsyncGenerator[Any, None]: """Convert LangGraph streaming events to Agentex TaskMessageUpdate events. Expects the stream from graph.astream() called with @@ -22,8 +48,17 @@ async def convert_langgraph_to_agentex_events(stream): Supports both regular models (chunk.content is a str) and reasoning models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks). + AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` (from + "updates" events), NOT Start+Delta+Done like pydantic-ai. No coalesce_tool_requests + option is needed for LangGraph. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) + on_final_ai_message: Optional callback ``(msg: AIMessage) -> None`` called for + each ``AIMessage`` in an "agent" node update. Use this to capture + ``usage_metadata`` for token accounting without re-traversing the stream. + The callback fires *after* all events for that message are yielded. + No-op when ``None`` (default). Yields: TaskMessageUpdate events (Start, Delta, Done, Full) @@ -32,6 +67,7 @@ async def convert_langgraph_to_agentex_events(stream): from langchain_core.messages import ToolMessage, AIMessageChunk from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import ( StreamTaskMessageDone, @@ -113,7 +149,9 @@ async def convert_langgraph_to_agentex_events(stream): yield StreamTaskMessageStart( type="start", index=message_index, - content=TextContent(type="text", author="agent", content=""), + content=ReasoningContent( + type="reasoning", author="agent", summary=[], content=[], style="active" + ), ) reasoning_streaming = True reasoning_content_index = 0 @@ -205,6 +243,13 @@ async def convert_langgraph_to_agentex_events(stream): ) message_index += 1 + # Notify caller of the final AIMessage (e.g. for usage capture) + if on_final_ai_message is not None: + from langchain_core.messages import AIMessage as _AIMessage + + if isinstance(msg, _AIMessage): + on_final_ai_message(msg) + elif node_name == "tools": messages = state_update.get("messages", []) for msg in messages: diff --git a/src/agentex/lib/adk/_modules/_langgraph_tracing.py b/src/agentex/lib/adk/_modules/_langgraph_tracing.py index 74b8dcb57..2162201e1 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_tracing.py +++ b/src/agentex/lib/adk/_modules/_langgraph_tracing.py @@ -1,4 +1,14 @@ -"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions.""" +"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions. + +.. deprecated:: + ``AgentexLangGraphTracingHandler`` and ``create_langgraph_tracing_handler`` are + superseded by the unified harness surface (``LangGraphTurn`` + + ``UnifiedEmitter``), which derives spans automatically from the canonical + event stream without requiring a LangChain callback handler. + + They remain importable and functional for backward compatibility, but new + agents should use the unified path instead. +""" # ruff: noqa: ARG002 # Callback methods must accept all arguments defined by LangChain's AsyncCallbackHandler interface. @@ -31,6 +41,11 @@ class AgentexLangGraphTracingHandler(AsyncCallbackHandler): ├── llm: (LLM call) ├── tool: (tool execution) └── llm: (LLM call) + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. The unified + harness derives equivalent spans from the canonical event stream, + removing the need for a LangChain callback handler entirely. """ def __init__( @@ -237,6 +252,20 @@ def create_langgraph_tracing_handler( Returns: An ``AgentexLangGraphTracingHandler`` instance ready to use as a LangChain callback. + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. The unified harness + derives equivalent spans from the canonical event stream automatically, with + no LangChain callback required:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + result = await emitter.auto_send_turn(turn) + + This function remains available for backward compatibility. """ return AgentexLangGraphTracingHandler( trace_id=trace_id, diff --git a/src/agentex/lib/adk/_modules/_langgraph_turn.py b/src/agentex/lib/adk/_modules/_langgraph_turn.py new file mode 100644 index 000000000..da8ff0e7c --- /dev/null +++ b/src/agentex/lib/adk/_modules/_langgraph_turn.py @@ -0,0 +1,152 @@ +"""HarnessTurn adapter for LangGraph astream() event streams. + +Provides ``LangGraphTurn`` (a ``HarnessTurn`` implementation) and the +``langgraph_usage_to_turn_usage`` helper that maps LangGraph's +``AIMessage.usage_metadata`` onto the framework-agnostic ``TurnUsage`` model. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator +from collections.abc import AsyncGenerator + +from agentex.lib.core.harness.types import TurnUsage, StreamTaskMessage +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + + +def langgraph_usage_to_turn_usage(usage_metadata: Any, model: str | None) -> TurnUsage: + """Map LangGraph ``AIMessage.usage_metadata`` onto ``TurnUsage``. + + ``usage_metadata`` may be ``None`` (model doesn't report usage). + Real zero token counts (e.g. 0 output tokens) are preserved as 0, NOT + coerced to ``None``. + + Mapping:: + + input_tokens -> input_tokens + output_tokens -> output_tokens + total_tokens -> total_tokens + input_token_details.cache_read -> cached_input_tokens + output_token_details.reasoning -> reasoning_tokens + + Args: + usage_metadata: The ``usage_metadata`` dict from an ``AIMessage``, + or ``None`` if the model did not report usage. + model: The model name string to attach to the ``TurnUsage``, or ``None``. + + Returns: + A populated ``TurnUsage`` instance. + """ + if usage_metadata is None: + return TurnUsage(model=model) + + raw_input = (usage_metadata or {}).get("input_tokens") + raw_output = (usage_metadata or {}).get("output_tokens") + raw_total = (usage_metadata or {}).get("total_tokens") + input_details = (usage_metadata or {}).get("input_token_details") or {} + output_details = (usage_metadata or {}).get("output_token_details") or {} + raw_cache_read = input_details.get("cache_read") + raw_reasoning = output_details.get("reasoning") + + return TurnUsage( + model=model, + input_tokens=raw_input, + output_tokens=raw_output, + total_tokens=raw_total, + cached_input_tokens=raw_cache_read, + reasoning_tokens=raw_reasoning, + ) + + +def _add_optional(a: int | None, b: int | None) -> int | None: + """Sum two optional token counts; ``None`` means 'not reported' on that side. + + ``None + None`` stays ``None`` (model never reported usage), while a real 0 + contributes 0 (preserving zero counts rather than coercing them away). + """ + if a is None and b is None: + return None + return (a or 0) + (b or 0) + + +def _accumulate_turn_usage(acc: TurnUsage, call: TurnUsage, model: str | None) -> TurnUsage: + """Add a single LLM call's usage into the running per-turn total. + + A LangGraph turn can make multiple LLM calls (e.g. text -> tool decision -> + final text); summing them avoids silently dropping all but the last call. + """ + return TurnUsage( + model=model, + input_tokens=_add_optional(acc.input_tokens, call.input_tokens), + output_tokens=_add_optional(acc.output_tokens, call.output_tokens), + total_tokens=_add_optional(acc.total_tokens, call.total_tokens), + cached_input_tokens=_add_optional(acc.cached_input_tokens, call.cached_input_tokens), + reasoning_tokens=_add_optional(acc.reasoning_tokens, call.reasoning_tokens), + ) + + +class LangGraphTurn: + """HarnessTurn wrapping a LangGraph ``astream()`` event stream. + + Implements the ``HarnessTurn`` Protocol so it can be passed to either + ``UnifiedEmitter.yield_turn`` (sync HTTP ACP) or + ``UnifiedEmitter.auto_send_turn`` (async / temporal). + + Usage:: + + stream = graph.astream( + {"messages": [{"role": "user", "content": user_message}]}, + stream_mode=["messages", "updates"], + ) + turn = LangGraphTurn(stream, model=model_name) + + # Sync HTTP ACP + async for event in emitter.yield_turn(turn): + yield event + + # Async / temporal + result = await emitter.auto_send_turn(turn) + + AGX1-377 note: LangGraph tool requests are ``StreamTaskMessageFull`` (from + "updates"), NOT Start+Delta+Done like pydantic-ai. No ``coalesce_tool_requests`` + option is needed. + + Usage data is captured lazily via the ``on_final_ai_message`` callback and + is only valid after ``events`` has been fully consumed. Multi-step turns + (more than one LLM call) accumulate usage additively across calls. + """ + + def __init__(self, stream: Any, model: str | None = None) -> None: + self._stream = stream + self._model = model + self._usage: TurnUsage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._generate_events() + + async def _generate_events(self) -> AsyncGenerator[StreamTaskMessage, None]: + def _capture(ai_msg: Any) -> None: + usage_metadata = getattr(ai_msg, "usage_metadata", None) + if usage_metadata is not None: + call_usage = langgraph_usage_to_turn_usage(usage_metadata, self._model) + # Accumulate across LLM calls — the callback fires once per agent + # node invocation, so a multi-step turn reports usage more than + # once; overwriting would drop all but the last call. + self._usage = _accumulate_turn_usage(self._usage, call_usage, self._model) + + async for ev in convert_langgraph_to_agentex_events(self._stream, on_final_ai_message=_capture): + yield ev + + def usage(self) -> TurnUsage: + """Return the usage accumulated across all AIMessages in the stream. + + Multi-step turns sum each LLM call's usage. Valid only after ``events`` + has been fully consumed. Returns a zero-usage ``TurnUsage`` if the model + did not report usage. + """ + return self._usage diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_async.py b/src/agentex/lib/adk/_modules/_pydantic_ai_async.py index 0bbb5b19d..85abfb845 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_async.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_async.py @@ -6,11 +6,10 @@ HTTP yields. Text and thinking tokens stream as deltas inside coalesced streaming -contexts. Tool requests and tool results are emitted as full -``adk.messages.create(...)`` calls (Option A — matches the async LangGraph -helper's convention). To stream tool-call argument tokens, see the sync -converter at ``agentex.lib.adk._modules._pydantic_ai_sync`` which yields -``ToolRequestDelta`` events. +contexts. Tool requests and tool results are posted as open+close pairs +on a streaming context (the unified surface persists ``initial_content`` +when a context is closed without deltas). This matches the ``auto_send`` +convention used by all other async/Temporal harnesses. Tracing is opt-in via a ``tracing_handler`` parameter — see ``create_pydantic_ai_tracing_handler`` in @@ -19,7 +18,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING if TYPE_CHECKING: from agentex.lib.adk._modules._pydantic_ai_tracing import ( @@ -49,230 +48,18 @@ async def stream_pydantic_ai_events( more text) return only the final text segment, matching the ``stream_langgraph_events`` convention. """ - # Lazy imports so pydantic-ai isn't required at module load time. - import json + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn - from pydantic_ai.messages import ( - TextPart, - PartEndEvent, - ThinkingPart, - ToolCallPart, - TextPartDelta, - PartDeltaEvent, - PartStartEvent, - ThinkingPartDelta, - FunctionToolResultEvent, + turn = PydanticAITurn( + stream, + model=None, + tracing_handler=tracing_handler, ) - - from agentex.lib import adk - from agentex.types.text_content import TextContent - from agentex.types.reasoning_content import ReasoningContent - from agentex.types.task_message_delta import TextDelta - from agentex.types.task_message_update import StreamTaskMessageDelta - from agentex.types.tool_request_content import ToolRequestContent - from agentex.types.tool_response_content import ToolResponseContent - from agentex.types.reasoning_content_delta import ReasoningContentDelta - - text_context = None - reasoning_context = None - final_text = "" - - # Per Pydantic-AI part-index bookkeeping. Part indices restart at 0 on - # each new model response, so we overwrite on PartStartEvent. - part_kind: dict[int, str] = {} - tool_call_info: dict[int, tuple[str, str]] = {} - - async def _close_text(): - nonlocal text_context - if text_context: - await text_context.close() - text_context = None - - async def _close_reasoning(): - nonlocal reasoning_context - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - try: - async for event in stream: - if isinstance(event, PartStartEvent): - if isinstance(event.part, TextPart): - await _close_reasoning() - await _close_text() - - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - part_kind[event.index] = "text" - - # Pydantic AI puts the first streaming chunk in - # PartStartEvent.part.content; surface it as a Delta so it - # actually renders (Start.content is initialization, not body). - if event.part.content: - final_text += event.part.content - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=event.part.content), - type="delta", - ) - ) - - elif isinstance(event.part, ThinkingPart): - await _close_text() - await _close_reasoning() - - reasoning_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - ).__aenter__() - part_kind[event.index] = "reasoning" - - if event.part.content: - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningContentDelta( - type="reasoning_content", - content_index=0, - content_delta=event.part.content, - ), - type="delta", - ) - ) - - elif isinstance(event.part, ToolCallPart): - await _close_text() - await _close_reasoning() - tool_call_info[event.index] = ( - event.part.tool_call_id, - event.part.tool_name, - ) - part_kind[event.index] = "tool_call" - - elif isinstance(event, PartDeltaEvent): - kind = part_kind.get(event.index) - if kind == "text" and isinstance(event.delta, TextPartDelta) and text_context: - final_text += event.delta.content_delta - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=event.delta.content_delta), - type="delta", - ) - ) - elif ( - kind == "reasoning" - and isinstance(event.delta, ThinkingPartDelta) - and reasoning_context - and event.delta.content_delta - ): - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningContentDelta( - type="reasoning_content", - content_index=0, - content_delta=event.delta.content_delta, - ), - type="delta", - ) - ) - # Tool-call arg deltas: Pydantic AI accumulates them; we - # surface the final args on PartEndEvent below (Option A). - - elif isinstance(event, PartEndEvent): - kind = part_kind.get(event.index) - if kind == "text": - await _close_text() - elif kind == "reasoning": - await _close_reasoning() - elif kind == "tool_call" and isinstance(event.part, ToolCallPart): - tool_call_id, tool_name = tool_call_info.get(event.index, ("", "")) - args = event.part.args - if isinstance(args, str): - try: - args = json.loads(args) if args else {} - except json.JSONDecodeError: - args = {"_raw": args} - elif args is None: - args = {} - await adk.messages.create( - task_id=task_id, - content=ToolRequestContent( - tool_call_id=tool_call_id, - name=tool_name, - arguments=args, - author="agent", - ), - ) - if tracing_handler is not None and tool_call_id: - await tracing_handler.on_tool_start( - tool_call_id=tool_call_id, - tool_name=tool_name, - arguments=args, - ) - - elif isinstance(event, FunctionToolResultEvent): - await _close_text() - await _close_reasoning() - - result = event.part - tool_call_id = result.tool_call_id - tool_name = getattr(result, "tool_name", "") or "" - # Preserve structure for dicts / lists / Pydantic models so the - # UI can render them as JSON, not as Python repr. Matches the - # sync converter's ``_tool_return_content`` helper exactly — - # ``str(content)`` on a dict produces ``"{'k': 'v'}"`` which is - # invalid JSON and unreadable in the UI. - content = getattr(result, "content", None) - content_payload: Any - if content is None: - content_payload = str(result) - elif isinstance(content, (str, int, float, bool, list, dict)): - content_payload = content - elif hasattr(content, "model_dump"): - try: - content_payload = content.model_dump() - except Exception: - content_payload = str(content) - else: - content_payload = str(content) - await adk.messages.create( - task_id=task_id, - content=ToolResponseContent( - tool_call_id=tool_call_id, - name=tool_name, - content=content_payload, - author="agent", - ), - ) - if tracing_handler is not None and tool_call_id: - await tracing_handler.on_tool_end( - tool_call_id=tool_call_id, - result=content_payload, - ) - - # FunctionToolCallEvent / FinalResultEvent / AgentRunResultEvent - # are intentionally ignored — same as the sync converter. - - finally: - if text_context: - await text_context.close() - if reasoning_context: - await reasoning_context.close() - - return final_text + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=None, + parent_span_id=None, + ) + result = await emitter.auto_send_turn(turn) + return result.final_text diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py b/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py index d94c0ae12..e4ac31e7e 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py @@ -16,12 +16,32 @@ async def handle_message_send(params): async with agent.run_stream_events(params.content.content) as stream: async for event in convert_pydantic_ai_to_agentex_events(stream): yield event + +Recommended: unified surface +----------------------------- +For new handlers, prefer ``UnifiedEmitter`` + ``PydanticAITurn`` over the +bare converter. The unified surface wires tracing automatically when a +``trace_id`` is provided, so tool and reasoning spans are derived from the +same event stream with no extra setup: + + from agentex.lib.core.harness import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) + turn = PydanticAITurn(agent.run_stream_events(prompt), model="openai:gpt-4o") + async for event in emitter.yield_turn(turn): + yield event # forwarded over the ACP streaming response; spans derived automatically + +``convert_pydantic_ai_to_agentex_events`` remains the low-level tap for +callers that manage their own tracing or need direct access to the raw +converted stream. """ from __future__ import annotations import json -from typing import TYPE_CHECKING, Any, AsyncIterator +import inspect +from typing import TYPE_CHECKING, Any, Callable, AsyncIterator from pydantic_ai.run import AgentRunResultEvent @@ -105,6 +125,7 @@ def _tool_return_content(result: ToolReturnPart | Any) -> Any: async def convert_pydantic_ai_to_agentex_events( stream_response: AsyncIterator[Any], tracing_handler: "AgentexPydanticAITracingHandler | None" = None, + on_result: Callable[[AgentRunResultEvent], Any] | None = None, ) -> AsyncIterator[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone]: """Convert a Pydantic AI agent event stream into Agentex stream events. @@ -132,6 +153,12 @@ async def convert_pydantic_ai_to_agentex_events( tool call in the run is also recorded as an Agentex child span beneath the handler's configured ``parent_span_id``. Streaming behavior is unchanged when omitted. + on_result: Optional callback invoked with the terminal + ``AgentRunResultEvent`` when the run completes. Both sync and + async callables are accepted. No ``StreamTaskMessage*`` events are + yielded for this terminal event; the callback is the only side + effect. Useful for capturing run-level usage without altering the + streaming output. Yields: Agentex ``StreamTaskMessage*`` events suitable for forwarding back over @@ -328,6 +355,10 @@ async def convert_pydantic_ai_to_agentex_events( # Already covered by PartStart/PartDelta/PartEnd events above, or # informational only (FinalResultEvent / AgentRunResultEvent signal # run-level state, not new content to surface). + if isinstance(event, AgentRunResultEvent) and on_result is not None: + ret = on_result(event) + if inspect.iscoroutine(ret): + await ret continue else: diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py b/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py index aa9d906eb..e199d0a8c 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py @@ -1,5 +1,29 @@ """Tracing handler that records Agentex spans for tool calls in a pydantic-ai agent run. +.. deprecated:: + ``AgentexPydanticAITracingHandler`` and ``create_pydantic_ai_tracing_handler`` + are superseded by the unified harness surface (``UnifiedEmitter`` in + ``agentex.lib.core.harness``). The unified surface derives tool and + reasoning spans directly from the canonical ``StreamTaskMessage*`` stream, + so no separate handler is required. Both symbols remain fully importable + and functional; they will be removed in a future release. New code should + construct a ``UnifiedEmitter`` with a ``trace_id`` instead: + + from agentex.lib.core.harness import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) + turn = PydanticAITurn(agent.run_stream_events(prompt), model="openai:gpt-4o") + async for event in emitter.yield_turn(turn): + yield event + +# NOTE: A runtime ``warnings.warn(..., DeprecationWarning)`` is intentionally +# omitted here. The repo's pyproject ``filterwarnings = ["error"]`` would turn +# it into a test/caller failure, and the async helper (``stream_pydantic_ai_events``) +# still threads this handler through for existing callers that lack a ``trace_id`` +# on the async path. The runtime warning and caller migration are deferred until +# ``trace_id`` threading lands on the async helper in a future API-versioning change. + Mirrors the LangGraph tracing handler pattern: the caller creates a handler bound to a ``trace_id`` and a ``parent_span_id``, then hands it to ``stream_pydantic_ai_events(..., tracing_handler=handler)``. The streamer @@ -63,6 +87,14 @@ def _tool_span_id(trace_id: str, tool_call_id: str) -> str: class AgentexPydanticAITracingHandler: """Records Agentex tracing spans for tool calls observed in a pydantic-ai event stream. + .. deprecated:: + Superseded by ``UnifiedEmitter`` (``agentex.lib.core.harness``), which + derives tool and reasoning spans from the canonical ``StreamTaskMessage*`` + stream automatically when ``trace_id`` is provided. This class remains + fully functional but will be removed in a future release. New code should + use ``UnifiedEmitter`` with a trace context instead of constructing this + handler directly. + Pass an instance to ``stream_pydantic_ai_events(..., tracing_handler=...)`` or call ``on_tool_start`` / ``on_tool_end`` yourself if you're consuming the event stream by hand. @@ -165,6 +197,13 @@ def create_pydantic_ai_tracing_handler( ) -> AgentexPydanticAITracingHandler: """Create a tracing handler that records Agentex spans for pydantic-ai tool calls. + .. deprecated:: + Superseded by ``UnifiedEmitter`` (``agentex.lib.core.harness``), which + derives tool and reasoning spans from the canonical ``StreamTaskMessage*`` + stream automatically when ``trace_id`` is provided. This function remains + fully functional but will be removed in a future release. New code should + construct a ``UnifiedEmitter`` with a trace context instead. + Args: trace_id: The trace ID. Typically the Agentex task ID. parent_span_id: Optional parent span ID to nest tool spans under. If diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py b/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py new file mode 100644 index 000000000..b06172e7f --- /dev/null +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py @@ -0,0 +1,134 @@ +"""PydanticAITurn: a HarnessTurn wrapping a pydantic-ai event stream. + +Adapts a pydantic-ai ``AgentStreamEvent`` stream into the canonical +``StreamTaskMessage*`` stream while capturing run-level usage from the +terminal ``AgentRunResultEvent``. + +Typical usage:: + + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn(stream, model="openai:gpt-4o") + async for event in turn.events: + yield event + span.set_attributes(turn.usage().model_dump()) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, AsyncIterator + +from pydantic_ai.run import AgentRunResultEvent + +from agentex.lib.core.harness.types import TurnUsage +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + +if TYPE_CHECKING: + from agentex.lib.adk._modules._pydantic_ai_tracing import AgentexPydanticAITracingHandler + +StreamTaskMessage = StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone + + +def pydantic_ai_usage_to_turn_usage(usage: Any, model: str | None) -> TurnUsage: + """Map a pydantic-ai ``RunUsage`` onto ``TurnUsage``. + + Uses defensive ``getattr(..., None)`` so a future field rename in + pydantic-ai degrades to ``None`` rather than raising ``AttributeError``. + + RunUsage fields (verified against pydantic-ai in this repo): + input_tokens, cache_write_tokens, cache_read_tokens, output_tokens, + input_audio_tokens, cache_audio_read_tokens, output_audio_tokens, + details, requests, tool_calls. + ``total_tokens`` is a computed property. + + Mapping: + requests -> num_llm_calls + input_tokens -> input_tokens + output_tokens -> output_tokens + cache_read_tokens -> cached_input_tokens + total_tokens -> total_tokens + + getattr results pass straight through: a MISSING attribute degrades to + None (defensive), while a real 0 stays 0 (a cache-hit with 0 output + tokens is a genuine zero, not "unknown") and a real N stays N. + """ + raw_input = getattr(usage, "input_tokens", None) + raw_output = getattr(usage, "output_tokens", None) + raw_cache_read = getattr(usage, "cache_read_tokens", None) + raw_total = getattr(usage, "total_tokens", None) + raw_requests = getattr(usage, "requests", None) + + return TurnUsage( + model=model, + input_tokens=raw_input, + output_tokens=raw_output, + cached_input_tokens=raw_cache_read, + total_tokens=raw_total, + num_llm_calls=raw_requests if raw_requests is not None else 0, + ) + + +class PydanticAITurn: + """A single harness turn backed by a pydantic-ai event stream. + + Satisfies the ``HarnessTurn`` protocol: ``events`` async-generates the + canonical ``StreamTaskMessage*`` stream; ``usage()`` returns a normalized + ``TurnUsage`` (valid only after ``events`` is exhausted). + + ``events`` is identical to the bare ``convert_pydantic_ai_to_agentex_events`` + output (tool calls stream as ``Start + ToolRequestDelta + Done``, preserving + argument-token streaming on the sync/yield channel). The foundation + ``auto_send`` delivers the streamed tool-request shape natively (AGX1-377), + so no coalescing is needed on either channel. + """ + + def __init__( + self, + stream: AsyncIterator[Any], + model: str | None = None, + tracing_handler: "AgentexPydanticAITracingHandler | None" = None, + ) -> None: + self._stream = stream + self._model = model + self._tracing_handler = tracing_handler + self._usage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._generate_events() + + async def _generate_events(self) -> AsyncIterator[StreamTaskMessage]: + def _capture(result_event: AgentRunResultEvent) -> None: + run_result = getattr(result_event, "result", None) + if run_result is None: + return + usage_attr = getattr(run_result, "usage", None) + if usage_attr is None: + return + # In newer pydantic-ai, .usage is a DeprecatedCallableRunUsage — + # it's both a property value and callable (emitting a deprecation + # warning when called). Access it as a plain attribute to avoid the + # warning; it already IS the RunUsage instance. + usage_obj = usage_attr + self._usage = pydantic_ai_usage_to_turn_usage(usage_obj, self._model) + + raw_stream = convert_pydantic_ai_to_agentex_events( + self._stream, + tracing_handler=self._tracing_handler, + on_result=_capture, + ) + async for ev in raw_stream: + yield ev + + def usage(self) -> TurnUsage: + """Return the normalized usage for this turn. + + Valid only after ``events`` is exhausted (single-pass contract). + Before exhaustion the model field is set but token fields are None. + """ + return self._usage diff --git a/src/agentex/lib/adk/_modules/tracing.py b/src/agentex/lib/adk/_modules/tracing.py index 8694c2078..94bf741e4 100644 --- a/src/agentex/lib/adk/_modules/tracing.py +++ b/src/agentex/lib/adk/_modules/tracing.py @@ -6,7 +6,9 @@ from datetime import timedelta from typing import Any +from temporalio import workflow from temporalio.common import RetryPolicy +from temporalio.exceptions import ActivityError, TimeoutError as TemporalTimeoutError, is_cancelled_exception from agentex import AsyncAgentex # noqa: F401 from agentex.lib.adk.utils._modules.client import create_async_agentex_client @@ -26,6 +28,18 @@ logger = make_logger(__name__) DEFAULT_RETRY_POLICY = RetryPolicy(maximum_attempts=1) +TEMPORAL_SPAN_ACTIVITY_DROPPED_METRIC = "agentex.tracing.temporal_span_activity.dropped" + + +def _record_temporal_span_activity_dropped(event_type: str) -> None: + try: + workflow.metric_meter().create_counter( + TEMPORAL_SPAN_ACTIVITY_DROPPED_METRIC, + description="Temporal tracing span activities dropped after fail-open", + unit="1", + ).add(1, {"event_type": event_type}) + except Exception: + pass class TracingModule: @@ -180,14 +194,26 @@ async def start_span( task_id=task_id, ) if in_temporal_workflow(): - return await ActivityHelpers.execute_activity( - activity_name=TracingActivityName.START_SPAN, - request=params, - response_type=Span, - start_to_close_timeout=start_to_close_timeout, - retry_policy=retry_policy, - heartbeat_timeout=heartbeat_timeout, - ) + try: + return await ActivityHelpers.execute_activity( + activity_name=TracingActivityName.START_SPAN, + request=params, + response_type=Span, + start_to_close_timeout=start_to_close_timeout, + retry_policy=retry_policy, + heartbeat_timeout=heartbeat_timeout, + ) + except (ActivityError, TemporalTimeoutError) as err: + if is_cancelled_exception(err): + raise + workflow.logger.warning( + "Failed to start tracing span %r for trace_id=%r; continuing without tracing", + name, + trace_id, + exc_info=True, + ) + _record_temporal_span_activity_dropped("start") + return None else: return await self._tracing_service.start_span( trace_id=trace_id, @@ -224,14 +250,26 @@ async def end_span( span=span, ) if in_temporal_workflow(): - return await ActivityHelpers.execute_activity( - activity_name=TracingActivityName.END_SPAN, - request=params, - response_type=Span, - start_to_close_timeout=start_to_close_timeout, - retry_policy=retry_policy, - heartbeat_timeout=heartbeat_timeout, - ) + try: + return await ActivityHelpers.execute_activity( + activity_name=TracingActivityName.END_SPAN, + request=params, + response_type=Span, + start_to_close_timeout=start_to_close_timeout, + retry_policy=retry_policy, + heartbeat_timeout=heartbeat_timeout, + ) + except (ActivityError, TemporalTimeoutError) as err: + if is_cancelled_exception(err): + raise + workflow.logger.warning( + "Failed to end tracing span %r for trace_id=%r; continuing without closing trace", + span.id, + trace_id, + exc_info=True, + ) + _record_temporal_span_activity_dropped("end") + return span else: return await self._tracing_service.end_span( trace_id=trace_id, diff --git a/src/agentex/lib/adk/providers/_modules/openai_turn.py b/src/agentex/lib/adk/providers/_modules/openai_turn.py new file mode 100644 index 000000000..17a6518ee --- /dev/null +++ b/src/agentex/lib/adk/providers/_modules/openai_turn.py @@ -0,0 +1,134 @@ +"""OpenAITurn: adapt an OpenAI Agents SDK streamed run onto the harness surface. + +A ``HarnessTurn`` exposes a single canonical ``StreamTaskMessage*`` stream plus +normalized usage. ``OpenAITurn`` wraps a ``RunResultStreaming`` (from +``Runner.run_streamed``), converts its native OpenAI events into the canonical +stream via ``convert_openai_to_agentex_events``, and after exhaustion reads the +run's ``raw_responses`` to aggregate usage into a provider-independent +``TurnUsage``. + +Delivery (yield vs auto-send) and tracing are owned by ``UnifiedEmitter``; this +module is purely the provider->canonical adapter. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, AsyncIterator + +from agents.usage import Usage + +from agentex.lib.utils.logging import make_logger +from agentex.lib.core.harness.types import TurnUsage, StreamTaskMessage +from agentex.lib.adk.providers._modules.sync_provider import ( + convert_openai_to_agentex_events, +) + +if TYPE_CHECKING: + from agents import ModelResponse, RunResultStreaming + +logger = make_logger(__name__) + + +def openai_usage_to_turn_usage(usage: Usage | None, model: str | None) -> TurnUsage: + """Map an ``agents.Usage`` to a harness-independent ``TurnUsage``. + + All field access is defensive (``getattr(..., None)``): different model + backends populate different subsets of the usage object, and real zeros are + valid values (e.g. 0 output tokens on a pure cache hit), so we never coerce + a present-but-zero value into ``None``. + """ + if usage is None: + return TurnUsage(model=model) + + input_details = getattr(usage, "input_tokens_details", None) + output_details = getattr(usage, "output_tokens_details", None) + + return TurnUsage( + model=model, + num_llm_calls=getattr(usage, "requests", None) or 0, + input_tokens=getattr(usage, "input_tokens", None), + cached_input_tokens=getattr(input_details, "cached_tokens", None), + output_tokens=getattr(usage, "output_tokens", None), + reasoning_tokens=getattr(output_details, "reasoning_tokens", None), + total_tokens=getattr(usage, "total_tokens", None), + ) + + +def _aggregate_usage(raw_responses: list[ModelResponse]) -> Usage | None: + """Sum the per-response ``Usage`` across a run's ``ModelResponse`` list. + + Returns ``None`` when no response carries usage so the caller can emit a + usage object with only the model name set. ``Usage.add`` accumulates + requests/tokens (including cached/reasoning detail fields). + """ + total: Usage | None = None + for response in raw_responses: + resp_usage = getattr(response, "usage", None) + if resp_usage is None: + continue + if total is None: + total = Usage() + total.add(resp_usage) + return total + + +class OpenAITurn: + """A single OpenAI Agents SDK turn adapted to the ``HarnessTurn`` protocol. + + Construct with exactly one of: + - ``result``: a ``RunResultStreaming`` from ``Runner.run_streamed``. Its + ``stream_events()`` is converted to the canonical stream, and after the + stream is exhausted ``raw_responses`` is read to compute usage. + - ``stream``: a pre-built async iterator of canonical ``StreamTaskMessage`` + events (bypasses ``convert_openai_to_agentex_events``). Useful for tests + and for callers that have already produced canonical events. Usage stays + at ``TurnUsage(model=...)`` because there is no run to read usage from. + + ``coalesce_tool_requests`` is accepted for API parity with other provider + turns but is a no-op for OpenAI: the OpenAI converter already emits a single + ``Full(ToolRequestContent)`` per tool call rather than streamed argument + deltas, so there is nothing to coalesce. + """ + + def __init__( + self, + result: RunResultStreaming | None = None, + model: str | None = None, + stream: AsyncIterator[StreamTaskMessage] | None = None, + coalesce_tool_requests: bool = False, # noqa: ARG002 - API parity, no-op for OpenAI + ) -> None: + if result is None and stream is None: + raise ValueError("OpenAITurn requires either `result` or `stream`") + self._result = result + self._model = model + self._stream = stream + self._usage: TurnUsage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._iter_events() + + async def _iter_events(self) -> AsyncIterator[StreamTaskMessage]: + if self._stream is not None: + async for event in self._stream: + yield event + return + + result = self._result + assert result is not None # guaranteed by __init__ + async for event in convert_openai_to_agentex_events(result.stream_events()): + yield event + + # Stream is exhausted: the run has finished and raw_responses is now + # populated, so usage can be aggregated and normalized. + try: + raw_responses: list[Any] = list(getattr(result, "raw_responses", None) or []) + aggregated = _aggregate_usage(raw_responses) + self._usage = openai_usage_to_turn_usage(aggregated, self._model) + except Exception as exc: # pragma: no cover - defensive: never break delivery on usage + logger.warning(f"Failed to aggregate OpenAI usage: {exc}") + self._usage = TurnUsage(model=self._model) + + def usage(self) -> TurnUsage: + """Normalized turn usage. Valid only after ``events`` is exhausted.""" + return self._usage diff --git a/src/agentex/lib/adk/providers/_modules/sync_provider.py b/src/agentex/lib/adk/providers/_modules/sync_provider.py index a34cfcda1..d1d5e1c09 100644 --- a/src/agentex/lib/adk/providers/_modules/sync_provider.py +++ b/src/agentex/lib/adk/providers/_modules/sync_provider.py @@ -32,6 +32,7 @@ from agentex import AsyncAgentex from agentex.lib.utils.logging import make_logger from agentex.lib.core.tracing.tracer import AsyncTracer +from agentex.types.reasoning_content import ReasoningContent from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import ( StreamTaskMessageDone, @@ -55,24 +56,28 @@ def _serialize_item(item: Any) -> dict[str, Any]: Uses model_dump() for Pydantic models, otherwise extracts attributes manually. Filters out internal Pydantic fields that can't be serialized. """ - if hasattr(item, 'model_dump'): + if hasattr(item, "model_dump"): # Pydantic model - use model_dump for proper serialization try: - return item.model_dump(mode='json', exclude_unset=True) + return item.model_dump(mode="json", exclude_unset=True) except Exception: # Fallback to dict conversion - return dict(item) if hasattr(item, '__iter__') else {} + return dict(item) if hasattr(item, "__iter__") else {} else: # Not a Pydantic model - extract attributes manually item_dict = {} for attr_name in dir(item): - if not attr_name.startswith('_') and attr_name not in ('model_fields', 'model_config', 'model_computed_fields'): + if not attr_name.startswith("_") and attr_name not in ( + "model_fields", + "model_config", + "model_computed_fields", + ): try: attr_value = getattr(item, attr_name, None) # Skip methods and None values if attr_value is not None and not callable(attr_value): # Convert to JSON-serializable format - if hasattr(attr_value, 'model_dump'): + if hasattr(attr_value, "model_dump"): item_dict[attr_name] = attr_value.model_dump() elif isinstance(attr_value, (str, int, float, bool, list, dict)): item_dict[attr_name] = attr_value @@ -85,9 +90,26 @@ def _serialize_item(item: Any) -> dict[str, Any]: class SyncStreamingModel(Model): - """Simple model wrapper that adds logging to stream_response and supports tracing.""" + """Simple model wrapper that adds logging to stream_response and supports tracing. + + .. deprecated:: + Prefer the unified harness surface for new OpenAI Agents integrations: + wrap a ``Runner.run_streamed`` result in + ``agentex.lib.adk.providers._modules.openai_turn.OpenAITurn`` and drive + delivery + tracing through ``UnifiedEmitter`` (see the + ``060_harness_openai`` / ``130_harness_openai`` / ``140_harness_openai`` + tutorials). This per-model tracing wrapper predates the harness and is + retained only for backwards compatibility; it will be removed in a + future release. No runtime warning is emitted. + """ - def __init__(self, original_model: Model, trace_id: str | None = None, parent_span_id: str | None = None, tracer: AsyncTracer | None = None): + def __init__( + self, + original_model: Model, + trace_id: str | None = None, + parent_span_id: str | None = None, + tracer: AsyncTracer | None = None, + ): """Initialize with the original OpenAI model to wrap. Args: original_model: The OpenAI model instance to wrap @@ -147,7 +169,7 @@ async def get_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): kwargs["conversation_id"] = conversation_id response = await self.original_model.get_response(**kwargs) @@ -158,12 +180,12 @@ async def get_response( final_output = None # Extract final output text from response - response_final_output = getattr(response, 'final_output', None) + response_final_output = getattr(response, "final_output", None) if response_final_output: final_output = response_final_output # Extract items from the response output - response_output = getattr(response, 'output', None) + response_output = getattr(response, "output", None) if response_output: output_items = response_output if isinstance(response_output, list) else [response_output] @@ -174,12 +196,12 @@ async def get_response( new_items.append(item_dict) # Extract final_output from message type if available - if item_dict.get('type') == 'message' and not final_output: - content = item_dict.get('content', []) + if item_dict.get("type") == "message" and not final_output: + content = item_dict.get("content", []) if content and isinstance(content, list): for content_part in content: - if isinstance(content_part, dict) and 'text' in content_part: - final_output = content_part['text'] + if isinstance(content_part, dict) and "text" in content_part: + final_output = content_part["text"] break except Exception as e: logger.warning(f"Failed to serialize item in get_response: {e}") @@ -207,7 +229,7 @@ async def get_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): kwargs["conversation_id"] = conversation_id return await self.original_model.get_response(**kwargs) @@ -266,7 +288,7 @@ async def stream_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): stream_kwargs["conversation_id"] = conversation_id # Get the stream response from the original model and yield each event @@ -277,11 +299,11 @@ async def stream_response( final_response_text = "" async for event in stream_response: - event_type = getattr(event, 'type', 'no-type') + event_type = getattr(event, "type", "no-type") # Handle response.output_item.done events which contain completed items - if event_type == 'response.output_item.done': - item = getattr(event, 'item', None) + if event_type == "response.output_item.done": + item = getattr(event, "item", None) if item is not None: try: item_dict = _serialize_item(item) @@ -289,12 +311,12 @@ async def stream_response( new_items.append(item_dict) # Update final_response_text from message type if available - if item_dict.get('type') == 'message': - content = item_dict.get('content', []) + if item_dict.get("type") == "message": + content = item_dict.get("content", []) if content and isinstance(content, list): for content_part in content: - if isinstance(content_part, dict) and 'text' in content_part: - final_response_text = content_part['text'] + if isinstance(content_part, dict) and "text" in content_part: + final_response_text = content_part["text"] break except Exception as e: logger.warning(f"Failed to serialize item in stream_response: {e}") @@ -326,7 +348,7 @@ async def stream_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): stream_kwargs["conversation_id"] = conversation_id # Get the stream response from the original model and yield each event @@ -336,8 +358,17 @@ async def stream_response( async for event in stream_response: yield event + class SyncStreamingProvider(OpenAIProvider): - """Simple OpenAI provider wrapper that adds logging to streaming and supports tracing.""" + """Simple OpenAI provider wrapper that adds logging to streaming and supports tracing. + + .. deprecated:: + Prefer the unified harness surface for new OpenAI Agents integrations + (see :class:`SyncStreamingModel` and the ``OpenAITurn`` + + ``UnifiedEmitter`` pattern). This provider wrapper predates the harness + and is retained only for backwards compatibility; it will be removed in + a future release. No runtime warning is emitted. + """ def __init__(self, trace_id: str | None = None, parent_span_id: str | None = None, *args, **kwargs): """Initialize the provider with tracing support. @@ -405,6 +436,7 @@ def _extract_tool_call_info(tool_call_item: Any) -> tuple[str, str, dict[str, An if tool_call_item.arguments: if isinstance(tool_call_item.arguments, str): import json + tool_arguments = json.loads(tool_call_item.arguments) if tool_call_item.arguments else {} else: tool_arguments = tool_call_item.arguments @@ -418,6 +450,7 @@ def _extract_tool_call_info(tool_call_item: Any) -> tuple[str, str, dict[str, An arguments = tool_call_item.arguments if isinstance(arguments, str): import json + tool_arguments = json.loads(arguments) if arguments else {} elif arguments is None: tool_arguments = {} @@ -466,11 +499,11 @@ def _extract_tool_response_info(tool_map: dict[str, Any], tool_output_item: Any) async def convert_openai_to_agentex_events(stream_response): """Convert OpenAI streaming events to AgentEx TaskMessageUpdate events with reasoning support. - + This is an enhanced version of the base converter that includes support for: - Reasoning content deltas (for o1 models) - Reasoning summary deltas (for o1 models) - + Args: stream_response: An async iterator of OpenAI streaming events Yields: @@ -488,8 +521,8 @@ async def convert_openai_to_agentex_events(stream_response): event_count += 1 # Check for raw response events which contain the actual OpenAI streaming events - if hasattr(event, 'type') and event.type == 'raw_response_event': - if hasattr(event, 'data'): + if hasattr(event, "type") and event.type == "raw_response_event": + if hasattr(event, "data"): raw_event = event.data # Check for ResponseOutputItemAddedEvent which signals a new message starting @@ -504,7 +537,7 @@ async def convert_openai_to_agentex_events(stream_response): if item_id in item_id_to_index: # Get the message type to decide whether to send done event message_type = item_id_to_type.get(item_id, "text") - + # Don't send done events for reasoning content/summary # They just end with their last delta if message_type not in ("reasoning_content", "reasoning_summary"): @@ -528,14 +561,20 @@ async def convert_openai_to_agentex_events(stream_response): item_id_to_index[item_id] = message_index item_id_to_type[item_id] = "reasoning_summary" - # Send a start event for this new reasoning summary message + # Send a start event for this new reasoning summary message. + # The start content must be ReasoningContent (not TextContent) + # so consumers that branch on the start event's content type + # render a reasoning/thinking indicator; the final persisted + # content is rebuilt from the reasoning deltas regardless. yield StreamTaskMessageStart( type="start", index=item_id_to_index[item_id], - content=TextContent( - type="text", + content=ReasoningContent( + type="reasoning", author="agent", - content="", # Start with empty content + summary=[], + content=[], + style="active", ), ) @@ -572,14 +611,20 @@ async def convert_openai_to_agentex_events(stream_response): item_id_to_index[item_id] = message_index item_id_to_type[item_id] = "reasoning_content" - # Send a start event for this new reasoning content message + # Send a start event for this new reasoning content message. + # The start content must be ReasoningContent (not TextContent) + # so consumers that branch on the start event's content type + # render a reasoning/thinking indicator; the final persisted + # content is rebuilt from the reasoning deltas regardless. yield StreamTaskMessageStart( type="start", index=item_id_to_index[item_id], - content=TextContent( - type="text", + content=ReasoningContent( + type="reasoning", author="agent", - content="", # Start with empty content + summary=[], + content=[], + style="active", ), ) @@ -608,7 +653,7 @@ async def convert_openai_to_agentex_events(stream_response): # Check if this is a text delta event from OpenAI elif isinstance(raw_event, ResponseTextDeltaEvent): # Check if this event has an item_id - item_id = getattr(raw_event, 'item_id', None) + item_id = getattr(raw_event, "item_id", None) # If this is a new item_id we haven't seen, it's a new message if item_id and item_id not in item_id_to_index: @@ -647,13 +692,13 @@ async def convert_openai_to_agentex_events(stream_response): ) yield delta_message - elif hasattr(event, 'type') and event.type == 'run_item_stream_event': + elif hasattr(event, "type") and event.type == "run_item_stream_event": # Skip reasoning_item events - they're handled via raw_response_event above - if hasattr(event, 'item') and event.item.type == 'reasoning_item': + if hasattr(event, "item") and event.item.type == "reasoning_item": continue # Check for tool_call_item type (this is when a tool is being called) - elif hasattr(event, 'item') and event.item.type == 'tool_call_item': + elif hasattr(event, "item") and event.item.type == "tool_call_item": # Extract tool call information using the helper method call_id, tool_name, tool_arguments = _extract_tool_call_info(event.item.raw_item) tool_map[call_id] = tool_name @@ -671,7 +716,7 @@ async def convert_openai_to_agentex_events(stream_response): ) # Check for tool_call_output_item type (this is when a tool returns output) - elif hasattr(event, 'item') and event.item.type == 'tool_call_output_item': + elif hasattr(event, "item") and event.item.type == "tool_call_output_item": # Extract tool response information using the helper method call_id, tool_name, content = _extract_tool_response_info(tool_map, event.item.raw_item) tool_response_content = ToolResponseContent( @@ -687,4 +732,3 @@ async def convert_openai_to_agentex_events(stream_response): index=message_index, content=tool_response_content, ) - diff --git a/src/agentex/lib/cli/templates/default-langgraph/project/acp.py.j2 b/src/agentex/lib/cli/templates/default-langgraph/project/acp.py.j2 index 3309dc07e..750a271ad 100644 --- a/src/agentex/lib/cli/templates/default-langgraph/project/acp.py.j2 +++ b/src/agentex/lib/cli/templates/default-langgraph/project/acp.py.j2 @@ -15,13 +15,14 @@ if _litellm_key: os.environ["OPENAI_API_KEY"] = _litellm_key import agentex.lib.adk as adk -from agentex.lib.adk import create_langgraph_tracing_handler, stream_langgraph_events +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config from agentex.lib.sdk.fastacp.fastacp import FastACP from agentex.protocol.acp import SendEventParams, CancelTaskParams, CreateTaskParams from agentex.lib.types.fastacp import AsyncACPConfig from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger +from agentex.lib.adk import LangGraphTurn from project.graph import create_graph @@ -67,24 +68,23 @@ async def handle_task_event_send(params: SendEventParams): input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - callback = create_langgraph_tracing_handler( - trace_id=task_id, - parent_span_id=turn_span.id if turn_span else None, - ) - stream = graph.astream( {"messages": [{"role": "user", "content": user_message}]}, - config={ - "configurable": {"thread_id": task_id}, - "callbacks": [callback], - }, + config={"configurable": {"thread_id": task_id}}, stream_mode=["messages", "updates"], ) - final_output = await stream_langgraph_events(stream, task_id) + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + result = await emitter.auto_send_turn(turn) if turn_span: - turn_span.output = {"final_output": final_output} + turn_span.output = {"final_output": result.final_text} @acp.on_task_create diff --git a/src/agentex/lib/cli/templates/default-pydantic-ai/project/acp.py.j2 b/src/agentex/lib/cli/templates/default-pydantic-ai/project/acp.py.j2 index 5692396b2..11d3ab476 100644 --- a/src/agentex/lib/cli/templates/default-pydantic-ai/project/acp.py.j2 +++ b/src/agentex/lib/cli/templates/default-pydantic-ai/project/acp.py.j2 @@ -19,21 +19,19 @@ from dotenv import load_dotenv load_dotenv() -from project.agent import create_agent +from project.agent import MODEL_NAME, create_agent from pydantic_ai.run import AgentRunResultEvent from pydantic_ai.messages import ModelMessagesTypeAdapter import agentex.lib.adk as adk -from agentex.lib.adk import ( - stream_pydantic_ai_events, - create_pydantic_ai_tracing_handler, -) from agentex.protocol.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.types.fastacp import AsyncACPConfig from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.utils.model_utils import BaseModel from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.adk import PydanticAITurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) @@ -125,15 +123,17 @@ async def handle_task_event_send(params: SendEventParams): input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - tracing_handler = create_pydantic_ai_tracing_handler( + # Construct the UnifiedEmitter from the ACP context so tracing is + # automatic and messages are auto-sent to the task stream (Redis). + emitter = UnifiedEmitter( + task_id=task_id, trace_id=task_id, parent_span_id=turn_span.id if turn_span else None, - task_id=task_id, ) # Wrap the pydantic-ai event stream so we can capture the final # AgentRunResultEvent (which carries the full message list for the - # next turn) without changing the streaming-helper's signature. + # next turn) before forwarding events to the emitter. captured_messages: list[Any] = [] async def tee_messages(upstream) -> AsyncIterator[Any]: @@ -143,9 +143,8 @@ async def handle_task_event_send(params: SendEventParams): yield event async with agent.run_stream_events(user_message, message_history=previous_messages) as stream: - final_output = await stream_pydantic_ai_events( - tee_messages(stream), task_id, tracing_handler=tracing_handler - ) + turn = PydanticAITurn(tee_messages(stream), model=MODEL_NAME) + result = await emitter.auto_send_turn(turn) # Save the updated message history so the next turn picks up here. if captured_messages: @@ -158,7 +157,7 @@ async def handle_task_event_send(params: SendEventParams): ) if turn_span: - turn_span.output = {"final_output": final_output} + turn_span.output = {"final_output": result.final_text} @acp.on_task_cancel diff --git a/src/agentex/lib/cli/templates/sync-langgraph/project/acp.py.j2 b/src/agentex/lib/cli/templates/sync-langgraph/project/acp.py.j2 index 54538d0c9..c6814b9c4 100644 --- a/src/agentex/lib/cli/templates/sync-langgraph/project/acp.py.j2 +++ b/src/agentex/lib/cli/templates/sync-langgraph/project/acp.py.j2 @@ -8,12 +8,13 @@ tokens and tool calls from the LangGraph graph to the Agentex frontend. from typing import AsyncGenerator import agentex.lib.adk as adk -from agentex.lib.adk import create_langgraph_tracing_handler, convert_langgraph_to_agentex_events +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config from agentex.lib.sdk.fastacp.fastacp import FastACP from agentex.protocol.acp import SendMessageParams from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger +from agentex.lib.adk import LangGraphTurn from agentex.types.task_message_content import TaskMessageContent from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import TaskMessageUpdate @@ -72,22 +73,21 @@ async def handle_message_send( input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - callback = create_langgraph_tracing_handler( - trace_id=thread_id, - parent_span_id=turn_span.id if turn_span else None, - ) - stream = graph.astream( {"messages": [{"role": "user", "content": user_message}]}, - config={ - "configurable": {"thread_id": thread_id}, - "callbacks": [callback], - }, + config={"configurable": {"thread_id": thread_id}}, stream_mode=["messages", "updates"], ) + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=thread_id, + trace_id=thread_id, + parent_span_id=turn_span.id if turn_span else None, + ) + final_text = "" - async for event in convert_langgraph_to_agentex_events(stream): + async for event in emitter.yield_turn(turn): # Accumulate text deltas for span output delta = getattr(event, "delta", None) if isinstance(delta, TextDelta) and delta.text_delta: diff --git a/src/agentex/lib/cli/templates/sync-pydantic-ai/project/acp.py.j2 b/src/agentex/lib/cli/templates/sync-pydantic-ai/project/acp.py.j2 index 4925e847f..061ae0e08 100644 --- a/src/agentex/lib/cli/templates/sync-pydantic-ai/project/acp.py.j2 +++ b/src/agentex/lib/cli/templates/sync-pydantic-ai/project/acp.py.j2 @@ -15,19 +15,17 @@ from dotenv import load_dotenv load_dotenv() -from project.agent import create_agent +from project.agent import MODEL_NAME, create_agent import agentex.lib.adk as adk -from agentex.lib.adk import ( - create_pydantic_ai_tracing_handler, - convert_pydantic_ai_to_agentex_events, -) from agentex.protocol.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter from agentex.lib.types.tracing import SGPTracingProcessorConfig from agentex.lib.utils.logging import make_logger from agentex.lib.sdk.fastacp.fastacp import FastACP from agentex.types.task_message_update import TaskMessageUpdate from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk import PydanticAITurn from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config logger = make_logger(__name__) @@ -73,7 +71,7 @@ async def handle_message_send( logger.info(f"Processing message for task {task_id}") # Open a per-message turn span. Tool calls below nest underneath this - # span via the tracing handler's parent_span_id wiring. + # span via the emitter's parent_span_id wiring. async with adk.tracing.span( trace_id=task_id, task_id=task_id, @@ -81,13 +79,14 @@ async def handle_message_send( input={"message": user_message}, data={"__span_type__": "AGENT_WORKFLOW"}, ) as turn_span: - tracing_handler = create_pydantic_ai_tracing_handler( + # Construct the UnifiedEmitter from the ACP/streaming context so tracing + # is automatic: tool spans nest under this turn's span. + emitter = UnifiedEmitter( + task_id=task_id, trace_id=task_id, parent_span_id=turn_span.id if turn_span else None, - task_id=task_id, ) async with agent.run_stream_events(user_message) as stream: - async for event in convert_pydantic_ai_to_agentex_events( - stream, tracing_handler=tracing_handler - ): - yield event + turn = PydanticAITurn(stream, model=MODEL_NAME) + async for ev in emitter.yield_turn(turn): + yield ev diff --git a/src/agentex/lib/cli/templates/temporal-pydantic-ai/project/agent.py.j2 b/src/agentex/lib/cli/templates/temporal-pydantic-ai/project/agent.py.j2 index 0aa958118..da97856ea 100644 --- a/src/agentex/lib/cli/templates/temporal-pydantic-ai/project/agent.py.j2 +++ b/src/agentex/lib/cli/templates/temporal-pydantic-ai/project/agent.py.j2 @@ -11,9 +11,9 @@ moves into recorded activities. Streaming back to Agentex happens via ``event_stream_handler``, which receives Pydantic AI ``AgentStreamEvent``s from inside the model activity -and forwards them to Redis using the ``stream_pydantic_ai_events`` helper. -The ``task_id`` and tracing parent span ID are threaded into the handler -via ``deps``. +and forwards them through the unified harness surface +(``UnifiedEmitter.auto_send_turn`` + ``PydanticAITurn``). The ``task_id`` and +tracing parent span ID are threaded into the handler via ``deps``. """ from __future__ import annotations @@ -27,10 +27,8 @@ from project.tools import get_weather from pydantic_ai.messages import AgentStreamEvent from pydantic_ai.durable_exec.temporal import TemporalAgent -from agentex.lib.adk import ( - stream_pydantic_ai_events, - create_pydantic_ai_tracing_handler, -) +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.adk import PydanticAITurn # Swap this for any Pydantic AI-supported model identifier # (e.g. "anthropic:claude-3-5-sonnet-latest", "openai:gpt-4o"). @@ -92,17 +90,18 @@ async def event_handler( activity (not the workflow), it can freely make non-deterministic Redis writes — including the tracing HTTP calls that record per-tool-call spans under the workflow's per-turn span (when ``parent_span_id`` is set). + + The UnifiedEmitter is constructed from ``deps`` (task_id + parent_span_id), + so tool spans nest under the workflow's per-turn span and messages auto-send + to the task stream. """ - tracing_handler = create_pydantic_ai_tracing_handler( + emitter = UnifiedEmitter( + task_id=run_context.deps.task_id, trace_id=run_context.deps.task_id, parent_span_id=run_context.deps.parent_span_id, - task_id=run_context.deps.task_id, - ) - await stream_pydantic_ai_events( - events, - run_context.deps.task_id, - tracing_handler=tracing_handler, ) + turn = PydanticAITurn(events, model=MODEL_NAME) + await emitter.auto_send_turn(turn) # Construct the durable agent at module load time so that the diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py new file mode 100644 index 000000000..067751d63 --- /dev/null +++ b/src/agentex/lib/core/harness/__init__.py @@ -0,0 +1,30 @@ +"""Shared, harness-independent machinery for the unified harness surface. + +The Agentex StreamTaskMessage* stream is the single source of truth; this +package derives spans from it and delivers it (yield or auto-send), so every +harness tap gets streaming + tracing + turn usage uniformly. +""" + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + SpanSignal, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter + +__all__ = [ + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", +] diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py new file mode 100644 index 000000000..2ecd6b583 --- /dev/null +++ b/src/agentex/lib/core/harness/auto_send.py @@ -0,0 +1,156 @@ +"""Auto-send delivery: canonical stream -> adk.streaming side effects + tracing.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator +from datetime import datetime + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage, TurnResult, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.core.harness.span_derivation import SpanDeriver + +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) + + +async def auto_send( + events: AsyncIterator[StreamTaskMessage], + task_id: str, + tracer: SpanTracer | None = None, + streaming: Any = None, + usage: TurnUsage | None = None, + created_at: datetime | None = None, +) -> TurnResult: + """Push the canonical stream to the task stream via adk.streaming. + + Opens a streaming context per message (keyed by index), streams deltas via + ctx.stream_update, and closes via ctx.close() on Done. Posts tool + request/response full messages by opening a context with the content and + closing it immediately (no deltas). Derives and traces spans from the same + stream. Returns the last text segment's text + usage. + + Index-keyed routing: each Start(index=i) opens a context stored in + ctx_map[i]; Delta(index=i) routes to ctx_map.get(i); Done(index=i) closes + and removes ctx_map[i]. Events with index is None are skipped. The finally + block closes all remaining open contexts. + + final_text last-segment semantics: a new Start(TextContent) resets + final_text_parts so that multi-step turns return the LAST text segment. + Full(TextContent) also overwrites final_text_parts (same semantics). + + AGX1-378: created_at is forwarded to every streaming_task_message_context + call so callers can back-date message timestamps. + + Mirrors the open/close/stream_update pattern from + src/agentex/lib/adk/_modules/_langgraph_async.py: + - context opened via streaming_task_message_context(...).__aenter__() + - context closed via ctx.close() (not __aexit__) + - deltas pushed as StreamTaskMessageDelta with parent_task_message set + from ctx.task_message + + For async + temporal agents (call from inside an activity). + """ + if streaming is None: + from agentex.lib import adk + + streaming = adk.streaming + + deriver = SpanDeriver() if tracer is not None else None + final_text_parts: list[str] = [] + ctx_map: dict[int, Any] = {} + + async def _close_all() -> None: + # Guard each close independently: a failure on one context (e.g. a + # backend hiccup during teardown) must not abandon the remaining open + # contexts, otherwise their task messages would never be finalized. + for ctx in list(ctx_map.values()): + try: + await ctx.close() + except Exception as exc: + logger.warning("[harness.auto_send] context close failed during teardown: %s", exc) + ctx_map.clear() + + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + + if isinstance(event, StreamTaskMessageStart): + if event.index is None: + continue + i = event.index + # Reset final_text_parts when a new text segment starts + if isinstance(event.content, TextContent): + final_text_parts = [] + ctx = streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + created_at=created_at, + ) + ctx_map[i] = await ctx.__aenter__() + + elif isinstance(event, StreamTaskMessageDelta): + if event.index is None: + continue + ctx = ctx_map.get(event.index) + if ctx is not None and event.delta is not None: + # Reconstruct the delta with parent_task_message set from + # the context's task_message (mirrors _langgraph_async.py + # lines 72-78 and 117-127). + delta_with_parent = StreamTaskMessageDelta( + parent_task_message=ctx.task_message, + delta=event.delta, + type="delta", + index=event.index, + ) + await ctx.stream_update(delta_with_parent) + if isinstance(event.delta, TextDelta) and event.delta.text_delta: + final_text_parts.append(event.delta.text_delta) + + elif isinstance(event, StreamTaskMessageDone): + if event.index is None: + continue + ctx = ctx_map.pop(event.index, None) + if ctx is not None: + await ctx.close() + + elif isinstance(event, StreamTaskMessageFull): + # Full messages: post the full message by opening a context + # with the content and closing it immediately (no deltas; + # StreamingTaskMessageContext.close() persists initial_content + # when the accumulator is empty). Use async with so the context + # is closed even if close() raises (__aexit__ delegates to + # close()). + # Full(TextContent) also resets final_text_parts for + # last-segment semantics. + if isinstance(event.content, TextContent): + final_text_parts = [event.content.content] + async with streaming.streaming_task_message_context( + task_id=task_id, + initial_content=event.content, + created_at=created_at, + ): + pass + + finally: + await _close_all() + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) + + return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage()) diff --git a/src/agentex/lib/core/harness/emitter.py b/src/agentex/lib/core/harness/emitter.py new file mode 100644 index 000000000..5b56793bf --- /dev/null +++ b/src/agentex/lib/core/harness/emitter.py @@ -0,0 +1,80 @@ +"""UnifiedEmitter: the single facade agent authors use for either delivery mode.""" + +from __future__ import annotations + +from typing import AsyncGenerator +from datetime import datetime + +from agentex.lib.core.harness.types import TurnResult, HarnessTurn, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.yield_delivery import yield_events + + +class UnifiedEmitter: + """Ties trace context + chosen delivery together. + + Tracing modes (the `tracer` arg): + - tracer=None (default): auto-construct a SpanTracer if `trace_id` is present. + - tracer=False: disable tracing entirely, regardless of `trace_id`. + - tracer=: use the supplied instance. + + `tracing` and `streaming` are injection escape-hatches for tests/advanced + use; leave them None in production so the real adk modules are used. + """ + + tracer: SpanTracer | None + + def __init__( + self, + task_id: str, + trace_id: str | None, + parent_span_id: str | None, + tracer: SpanTracer | bool | None = None, + tracing: object | None = None, + streaming: object | None = None, + ): + self.task_id = task_id + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self._streaming = streaming + if tracer is False: + self.tracer = None + elif isinstance(tracer, SpanTracer): + self.tracer = tracer + elif trace_id: + self.tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id=task_id, + tracing=tracing, + ) + else: + self.tracer = None + + async def yield_turn(self, turn: HarnessTurn) -> AsyncGenerator[StreamTaskMessage, None]: + """Sync HTTP ACP delivery: forward events, trace as side effect.""" + async for event in yield_events(turn.events, tracer=self.tracer): + yield event + + async def auto_send_turn(self, turn: HarnessTurn, created_at: datetime | None = None) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult. + + Pass `created_at` (e.g. `workflow.now()` under Temporal) to stamp the + turn's messages with a deterministic timestamp; it is forwarded to the + streaming contexts. Default None preserves server-side timestamps. + """ + # `turn.usage()` is only valid AFTER `turn.events` is exhausted (the + # HarnessTurn single-pass contract: real turns populate usage while the + # stream is consumed). So drive delivery first, then read usage — do NOT + # pass `usage=turn.usage()` eagerly here (that would capture the empty + # default before the stream runs). + result = await auto_send( + turn.events, + task_id=self.task_id, + tracer=self.tracer, + streaming=self._streaming, + created_at=created_at, + ) + result.usage = turn.usage() + return result diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py new file mode 100644 index 000000000..cecb24bcc --- /dev/null +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -0,0 +1,154 @@ +"""Pure reducer: canonical StreamTaskMessage* stream -> span open/close signals. + +Has no dependency on adk; unit-testable in isolation. Delivery adapters feed it +every event and act on the returned signals. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal, StreamTaskMessage +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +@dataclass +class _ToolReqMeta: + tool_call_id: str + name: str + arguments: dict[str, object] + args_buf: str = "" # accumulated streamed argument fragments + + +class SpanDeriver: + """Stateful reducer over the canonical stream. + + Tool span: open on Done of a ToolRequestContent index; close on matching + ToolResponseContent by tool_call_id. Reasoning span: open on + Start(ReasoningContent); close on that index's Done. + + Deliberate contracts: + - A `Full(ToolResponseContent)` whose tool_call_id was never opened is + ignored (no CloseSpan emitted). + - A `Done` for an index that was never a tool_request/reasoning Start is + ignored (no signal emitted). + - Events with `index is None` are skipped entirely; without a stable index + they cannot be reliably paired, and aliasing them to a sentinel would + let unrelated None-indexed events cross-match. + - `flush()` closes anything still open as incomplete; unclosed tool spans + are emitted in the order they were opened. + """ + + def __init__(self) -> None: + self._tool_by_index: dict[int, _ToolReqMeta] = {} + self._reasoning_index_open: set[int] = set() + # insertion-ordered set of open tool_call_ids (dict keys preserve order) + self._open_tool_ids: dict[str, None] = {} + + def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: + if isinstance(event, StreamTaskMessageStart): + return self._on_start(event) + if isinstance(event, StreamTaskMessageDelta): + return self._on_delta(event) + if isinstance(event, StreamTaskMessageFull): + return self._on_full(event) + if isinstance(event, StreamTaskMessageDone): + return self._on_done(event) + return [] + + def flush(self) -> list[SpanSignal]: + """Close anything still open at end of stream, marked incomplete.""" + signals: list[SpanSignal] = [] + for tcid in list(self._open_tool_ids): + signals.append(CloseSpan(key=tcid, output=None, is_complete=False)) + self._open_tool_ids.clear() + for idx in sorted(self._reasoning_index_open): + signals.append(CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=False)) + self._reasoning_index_open.clear() + return signals + + def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + content = event.content + if isinstance(content, ToolRequestContent): + self._tool_by_index[idx] = _ToolReqMeta( + tool_call_id=content.tool_call_id, + name=content.name, + arguments=dict(content.arguments or {}), + ) + return [] + if content.type == "reasoning": + self._reasoning_index_open.add(idx) + return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] + return [] + + def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + delta = event.delta + if isinstance(delta, ToolRequestDelta): + meta = self._tool_by_index.get(idx) + if meta is not None and delta.arguments_delta: + meta.args_buf += delta.arguments_delta + return [] + + def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + """Handle a Full event. + + A `Full(ToolRequestContent)` opens a tool span (keyed by tool_call_id) + if it is not already open; the matching `Full(ToolResponseContent)` + closes it. This handles harnesses (e.g. LangGraph) that emit tool calls + as a single Full rather than Start+Done. + """ + content = event.content + if isinstance(content, ToolRequestContent): + tcid = content.tool_call_id + if tcid not in self._open_tool_ids: + self._open_tool_ids[tcid] = None + args = dict(content.arguments or {}) + return [OpenSpan(key=tcid, kind="tool", name=content.name, input=args)] + return [] + if isinstance(content, ToolResponseContent): + tcid = content.tool_call_id + if tcid in self._open_tool_ids: + self._open_tool_ids.pop(tcid, None) + return [ + CloseSpan( + key=tcid, + output=content.content, + is_complete=True, + is_error=content.is_error, + ) + ] + return [] + + def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + meta = self._tool_by_index.pop(idx, None) + if meta is not None: + args = meta.arguments + if meta.args_buf: + try: + args = json.loads(meta.args_buf) + except json.JSONDecodeError: + args = {"_raw": meta.args_buf} + self._open_tool_ids[meta.tool_call_id] = None + return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] + if idx in self._reasoning_index_open: + self._reasoning_index_open.discard(idx) + return [CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=True)] + return [] diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py new file mode 100644 index 000000000..4ca4d628b --- /dev/null +++ b/src/agentex/lib/core/harness/tracer.py @@ -0,0 +1,88 @@ +"""Adapter from SpanSignals to adk.tracing spans (best-effort, overridable).""" + +from __future__ import annotations + +from typing import Any + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal + +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) + + +class SpanTracer: + """Opens/closes adk.tracing child spans in response to span signals. + + `tracing` defaults to the real `adk.tracing` module; inject a fake in tests + or a custom tracer to override. No-op when `trace_id` is falsy. Never raises. + + The real TracingModule.end_span does NOT accept an output kwarg — output is + recorded by mutating span.output before calling end_span, matching the pattern + used throughout the codebase (see _langgraph_tracing.py on_tool_end etc.). + + Span-lifecycle contract: the `_open` dict (span key -> span object) is scoped + to a single turn. Pairing is by `key`: + - A duplicate OpenSpan for a key already in `_open` silently replaces the + earlier span; the earlier span is then orphaned (never closed / leaked). + - A CloseSpan for an unknown key is a no-op. + - Unpaired opens accumulate in `_open` for the lifetime of the tracer; since + a tracer is expected to live for one turn, this is bounded and acceptable. + """ + + def __init__( + self, + trace_id: str | None, + parent_span_id: str | None, + tracing: Any = None, + task_id: str | None = None, + ): + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self.task_id = task_id + if tracing is None: + from agentex.lib import adk + + tracing = adk.tracing + self._tracing = tracing + self._open: dict[str, Any] = {} # span key -> span object + + async def handle(self, signal: SpanSignal) -> None: + if not self.trace_id: + return + try: + if isinstance(signal, OpenSpan): + span = await self._tracing.start_span( + trace_id=self.trace_id, + name=signal.name, + input=signal.input, + parent_id=self.parent_span_id, + task_id=self.task_id, + ) + if span is not None: + self._open[signal.key] = span + elif isinstance(signal, CloseSpan): + span = self._open.pop(signal.key, None) + if span is not None: + # Output is recorded by mutating span.output before end_span. + # The real TracingModule.end_span signature is: + # end_span(trace_id, span, start_to_close_timeout, heartbeat_timeout, retry_policy) + # It does not accept an output= kwarg. + span.output = signal.output + # Tool failure status (ToolResponseContent.is_error) is recorded + # on span.data when the harness reports one; Span has no dedicated + # error field. None means no status was reported, so leave data alone. + if signal.is_error is not None: + data = span.data if isinstance(span.data, dict) else {} + span.data = {**data, "is_error": signal.is_error} + await self._tracing.end_span( + trace_id=self.trace_id, + span=span, + ) + except Exception as exc: # best-effort: tracing never breaks delivery + logger.warning("[harness.tracer] span signal failed: %s", exc) diff --git a/src/agentex/lib/core/harness/types.py b/src/agentex/lib/core/harness/types.py new file mode 100644 index 000000000..74e0dc314 --- /dev/null +++ b/src/agentex/lib/core/harness/types.py @@ -0,0 +1,96 @@ +"""Types for the unified harness surface.""" + +from __future__ import annotations + +from typing import Any, Union, Literal, Protocol, AsyncIterator, runtime_checkable +from dataclasses import field, dataclass + +from pydantic import BaseModel, ConfigDict + +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + +# The canonical stream element. Taps yield these; delivery adapters consume them. +StreamTaskMessage = Union[ + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +] + +SpanKind = Literal["tool", "reasoning", "subagent"] + + +@dataclass +class OpenSpan: + """Signal to open a child span. `key` pairs an open with its close.""" + + key: str + kind: SpanKind + name: str + input: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CloseSpan: + """Signal to close the span previously opened with the same `key`.""" + + key: str + output: Any = None + is_complete: bool = True # False when closed by flush() without a result + is_error: bool | None = None # tool failure status; None when the harness reports no status + + +SpanSignal = Union[OpenSpan, CloseSpan] + + +class TurnUsage(BaseModel): + """Harness-independent turn usage/cost, attached to the turn span. + + Token field names align with agentex.lib.core.observability.llm_metrics. + """ + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + # num_llm_calls is provider-reported and may be absent (None = "not + # reported"). num_tool_calls / num_reasoning_blocks are counted locally from + # the observed stream, so 0 is always a real count. + num_llm_calls: int | None = None + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 + + +class TurnResult(BaseModel): + """Returned to the caller after a turn is delivered.""" + + model_config = ConfigDict(from_attributes=True, populate_by_name=True) + + final_text: str = "" + usage: TurnUsage = TurnUsage() + + +@runtime_checkable +class HarnessTurn(Protocol): + """A single harness turn: a canonical stream plus its normalized usage. + + Python async generators cannot cleanly return a value to their consumer, so + a tap exposes usage via `usage()` (valid only after `events` is exhausted) + rather than via StopAsyncIteration. + """ + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... diff --git a/src/agentex/lib/core/harness/yield_delivery.py b/src/agentex/lib/core/harness/yield_delivery.py new file mode 100644 index 000000000..69b39f152 --- /dev/null +++ b/src/agentex/lib/core/harness/yield_delivery.py @@ -0,0 +1,31 @@ +"""Yield delivery: pass the canonical stream through, tracing as a side effect.""" + +from __future__ import annotations + +from typing import AsyncIterator, AsyncGenerator + +from agentex.lib.core.harness.types import StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +async def yield_events( + events: AsyncIterator[StreamTaskMessage], + tracer: SpanTracer | None = None, +) -> AsyncGenerator[StreamTaskMessage, None]: + """Forward each event to the caller; derive + trace spans as a side effect. + + For sync HTTP ACP agents that yield events back over the response. When + `tracer` is None, this is a pure passthrough. + """ + deriver = SpanDeriver() if tracer is not None else None + try: + async for event in events: + if deriver is not None and tracer is not None: + for signal in deriver.observe(event): + await tracer.handle(signal) + yield event + finally: + if deriver is not None and tracer is not None: + for signal in deriver.flush(): + await tracer.handle(signal) diff --git a/src/agentex/lib/core/services/adk/providers/openai.py b/src/agentex/lib/core/services/adk/providers/openai.py index 75e507d8a..1ae29589d 100644 --- a/src/agentex/lib/core/services/adk/providers/openai.py +++ b/src/agentex/lib/core/services/adk/providers/openai.py @@ -14,15 +14,8 @@ from agents.guardrail import InputGuardrail, OutputGuardrail from agents.exceptions import InputGuardrailTripwireTriggered, OutputGuardrailTripwireTriggered from openai.types.responses import ( - ResponseCompletedEvent, - ResponseTextDeltaEvent, - ResponseFunctionToolCall, ResponseFunctionWebSearch, - ResponseOutputItemDoneEvent, ResponseCodeInterpreterToolCall, - ResponseReasoningSummaryPartDoneEvent, - ResponseReasoningSummaryPartAddedEvent, - ResponseReasoningSummaryTextDeltaEvent, ) # Local imports @@ -31,24 +24,14 @@ from agentex.lib.utils.mcp import redact_mcp_server_params from agentex.lib.utils.temporal import heartbeat_if_in_workflow from agentex.lib.core.tracing.tracer import AsyncTracer -from agentex.types.task_message_delta import ( - TextDelta, - ReasoningSummaryDelta, -) -from agentex.types.task_message_update import ( - StreamTaskMessageFull, - StreamTaskMessageDelta, -) +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import StreamTaskMessageFull from agentex.types.task_message_content import ( TextContent, - ReasoningContent, ToolRequestContent, ToolResponseContent, ) -from agentex.lib.core.services.adk.streaming import ( - StreamingService, - StreamingTaskMessageContext, -) +from agentex.lib.core.services.adk.streaming import StreamingService logger = logging.make_logger(__name__) @@ -695,7 +678,7 @@ async def run_agent_streamed_auto_send( input_guardrails: list[InputGuardrail] | None = None, output_guardrails: list[OutputGuardrail] | None = None, max_turns: int | None = None, - previous_response_id: str | None = None, # noqa: ARG002 + previous_response_id: str | None = None, created_at: datetime | None = None, ) -> RunResultStreaming: """ @@ -733,8 +716,6 @@ async def run_agent_streamed_auto_send( if self.agentex_client is None: raise ValueError("Agentex client must be provided for auto_send methods") - tool_call_map: dict[str, ResponseFunctionToolCall] = {} - if self.tracer is None: raise RuntimeError("Tracer not initialized - ensure tracer is provided to OpenAIService") trace = self.tracer.trace(trace_id) @@ -761,12 +742,13 @@ async def run_agent_streamed_auto_send( ) as span: heartbeat_if_in_workflow("run agent streamed auto send") - # Consume the workflow-supplied created_at on the FIRST message - # opened by this activity (whichever streaming context opens first - # for this turn). That's the message that races the workflow's - # user-echo at the server. Subsequent messages in the same turn are - # separated by network/processing latency and rely on the server's - # wall clock. + # AGX1-378 restored: created_at is now threaded through + # UnifiedEmitter.auto_send_turn -> auto_send -> every + # streaming_task_message_context call, so the first agent message of + # the turn is stamped with the workflow-supplied timestamp (e.g. + # workflow.now()) just as the original inline loop did. + # The dispenser is still used below for guardrail-rejection messages, + # which open their own streaming contexts directly. _take_created_at = _make_created_at_dispenser(created_at) async with mcp_server_context(mcp_server_params, mcp_timeout_seconds) as servers: @@ -803,204 +785,48 @@ async def run_agent_streamed_auto_send( agent = Agent(**agent_kwargs) - # Run with streaming - if max_turns is not None: + # Run with streaming. Forward previous_response_id so callers that + # continue a Responses-API conversation resume the prior response + # instead of silently starting a fresh one (mirrors the non-auto-send + # run_agent_streamed path). + if max_turns is not None and previous_response_id is not None: + result = Runner.run_streamed( + starting_agent=agent, + input=input_list, + max_turns=max_turns, + previous_response_id=previous_response_id, + ) + elif max_turns is not None: result = Runner.run_streamed(starting_agent=agent, input=input_list, max_turns=max_turns) + elif previous_response_id is not None: + result = Runner.run_streamed( + starting_agent=agent, input=input_list, previous_response_id=previous_response_id + ) else: result = Runner.run_streamed(starting_agent=agent, input=input_list) - item_id_to_streaming_context: dict[str, StreamingTaskMessageContext] = {} - unclosed_item_ids: set[str] = set() - # Simple string to accumulate reasoning summary - current_reasoning_summary: str = "" + # Migrate onto the unified harness surface: wrap the streamed run + # as an OpenAITurn (provider -> canonical StreamTaskMessage* + # adapter) and let UnifiedEmitter.auto_send_turn drive delivery + + # tracing + usage. The previous ~270-line inline loop that hand- + # rolled per-item streaming contexts, reasoning handling, and + # span derivation now lives in the shared harness modules. + # Imported lazily: openai_turn pulls in agentex.lib.adk, which + # imports this service module, so an eager import would create a + # circular import at package init. + from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + turn = OpenAITurn(result=result, model=model) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=self.tracer, + streaming=self.streaming_service, + ) try: - # Process streaming events with TaskMessage creation - async for event in result.stream_events(): - heartbeat_if_in_workflow("processing stream event with auto send") - - if event.type == "run_item_stream_event": - if event.item.type == "tool_call_item": - tool_call_item = event.item.raw_item - - # Extract tool call information using the helper method - call_id, tool_name, tool_arguments = self._extract_tool_call_info(tool_call_item) - tool_call_map[call_id] = tool_call_item - - tool_request_content = ToolRequestContent( - author="agent", - tool_call_id=call_id, - name=tool_name, - arguments=tool_arguments, - ) - - # Create tool request using streaming context (immediate completion) - async with self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=tool_request_content, - created_at=_take_created_at(), - ) as streaming_context: - # The message has already been persisted, but we still need to send an upda - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=tool_request_content, - type="full", - ), - ) - - elif event.item.type == "tool_call_output_item": - tool_output_item = event.item.raw_item - - # Extract tool response information using the helper method - call_id, tool_name, content = self._extract_tool_response_info( - tool_call_map, tool_output_item - ) - - tool_response_content = ToolResponseContent( - author="agent", - tool_call_id=call_id, - name=tool_name, - content=content, - ) - - # Create tool response using streaming context (immediate completion) - async with self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=tool_response_content, - created_at=_take_created_at(), - ) as streaming_context: - # The message has already been persisted, but we still need to send an update - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=tool_response_content, - type="full", - ), - ) - - elif event.type == "raw_response_event": - if isinstance(event.data, ResponseTextDeltaEvent): - # Handle text delta - item_id = event.data.item_id - - # Check if we already have a streaming context for this item - if item_id not in item_id_to_streaming_context: - # Create a new streaming context for this item - streaming_context = self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - ), - created_at=_take_created_at(), - ) - # Open the streaming context - item_id_to_streaming_context[item_id] = await streaming_context.open() - unclosed_item_ids.add(item_id) - else: - streaming_context = item_id_to_streaming_context[item_id] - - # Stream the delta through the streaming service - await streaming_context.stream_update( - update=StreamTaskMessageDelta( - parent_task_message=streaming_context.task_message, - delta=TextDelta(text_delta=event.data.delta, type="text"), - type="delta", - ), - ) - # Reasoning step one: new summary part added - elif isinstance(event.data, ResponseReasoningSummaryPartAddedEvent): - # We need to create a new streaming context for this reasoning item - item_id = event.data.item_id - - # Reset the reasoning summary string - current_reasoning_summary = "" - - streaming_context = self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - created_at=_take_created_at(), - ) - - # Replace the existing streaming context (if it exists) - # Why do we replace? Cause all the reasoning parts use the same item_id! - item_id_to_streaming_context[item_id] = await streaming_context.open() - unclosed_item_ids.add(item_id) - - # Reasoning step two: handling summary text delta - elif isinstance(event.data, ResponseReasoningSummaryTextDeltaEvent): - # Accumulate the delta into the string - current_reasoning_summary += event.data.delta - streaming_context = item_id_to_streaming_context[item_id] - - # Stream the summary delta through the streaming service - await streaming_context.stream_update( - update=StreamTaskMessageDelta( - parent_task_message=streaming_context.task_message, - delta=ReasoningSummaryDelta( - summary_index=event.data.summary_index, - summary_delta=event.data.delta, - type="reasoning_summary", - ), - type="delta", - ), - ) - - # Reasoning step three: handling summary text done, closing the streaming context - elif isinstance(event.data, ResponseReasoningSummaryPartDoneEvent): - # Handle reasoning summary text completion - streaming_context = item_id_to_streaming_context[item_id] - - # Create the complete reasoning content with the accumulated summary - complete_reasoning_content = ReasoningContent( - author="agent", - summary=[current_reasoning_summary], - content=[], - type="reasoning", - style="static", - ) - - # Send a full message update with the complete reasoning content - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=complete_reasoning_content, - type="full", - ), - ) - - await streaming_context.close() - unclosed_item_ids.discard(item_id) - - elif isinstance(event.data, ResponseOutputItemDoneEvent): - # Handle item completion - item_id = event.data.item.id - - # Finish the streaming context (sends DONE event and updates message) - if item_id in item_id_to_streaming_context: - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - if item_id in unclosed_item_ids: - unclosed_item_ids.remove(item_id) - - elif isinstance(event.data, ResponseCompletedEvent): - # All items complete, finish all remaining streaming contexts for this session - # Create a copy to avoid modifying set during iteration - remaining_items = list(unclosed_item_ids) - for item_id in remaining_items: - if ( - item_id in unclosed_item_ids and item_id in item_id_to_streaming_context - ): # Check if still unclosed - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - unclosed_item_ids.discard(item_id) + await emitter.auto_send_turn(turn, created_at=created_at) except InputGuardrailTripwireTriggered as e: # Handle guardrail trigger by sending a rejection message @@ -1080,18 +906,6 @@ async def run_agent_streamed_auto_send( # Re-raise to let the activity handle it raise - finally: - # Cleanup: ensure all streaming contexts for this session are properly finished - # Create a copy to avoid modifying set during iteration - remaining_items = list(unclosed_item_ids) - for item_id in remaining_items: - if ( - item_id in unclosed_item_ids and item_id in item_id_to_streaming_context - ): # Check if still unclosed - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - unclosed_item_ids.discard(item_id) - if span: span.output = { "new_items": [ diff --git a/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py b/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py index 7ccc6627a..75dc0f053 100644 --- a/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py +++ b/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py @@ -1,6 +1,7 @@ """Custom Temporal Model Provider with streaming support for OpenAI agents.""" from __future__ import annotations +import json import time import uuid from typing import Any, List, Union, Optional, override @@ -63,9 +64,9 @@ from agentex.lib import adk from agentex.lib.utils.logging import make_logger from agentex.lib.core.tracing.tracer import AsyncTracer -from agentex.types.task_message_delta import TextDelta, ReasoningContentDelta, ReasoningSummaryDelta +from agentex.types.task_message_delta import TextDelta, ToolRequestDelta, ReasoningContentDelta, ReasoningSummaryDelta from agentex.types.task_message_update import StreamTaskMessageFull, StreamTaskMessageDelta -from agentex.types.task_message_content import TextContent, ReasoningContent +from agentex.types.task_message_content import TextContent, ReasoningContent, ToolRequestContent from agentex.lib.adk.utils._modules.client import create_async_agentex_client from agentex.lib.core.temporal.plugins.openai_agents.interceptors.context_interceptor import ( streaming_task_id, @@ -722,12 +723,27 @@ async def get_response( streaming_mode=self.streaming_mode, ).__aenter__() elif item and getattr(item, 'type', None) == 'function_call': - # Track the function call being streamed + # Open a streaming context per function call so argument + # deltas can be published incrementally. Coalescing and + # mode dispatch are handled by the streaming layer. + call_id = getattr(item, 'call_id', '') + tool_name = getattr(item, 'name', '') + call_context = await adk.streaming.streaming_task_message_context( + task_id=task_id, + initial_content=ToolRequestContent( + author="agent", + tool_call_id=call_id, + name=tool_name, + arguments={}, + ), + streaming_mode=self.streaming_mode, + ).__aenter__() function_calls_in_progress[output_index] = { 'id': getattr(item, 'id', ''), - 'call_id': getattr(item, 'call_id', ''), - 'name': getattr(item, 'name', ''), + 'call_id': call_id, + 'name': tool_name, 'arguments': getattr(item, 'arguments', ''), + 'context': call_context, } logger.debug(f"[TemporalStreamingModel] Starting function call: {item.name}") @@ -748,8 +764,24 @@ async def get_response( output_index = getattr(event, 'output_index', 0) delta = getattr(event, 'delta', '') - if output_index in function_calls_in_progress: - function_calls_in_progress[output_index]['arguments'] += delta + call_data = function_calls_in_progress.get(output_index) + if call_data is not None: + call_data['arguments'] += delta + call_context = call_data.get('context') + if call_context is not None: + try: + await call_context.stream_update(StreamTaskMessageDelta( + parent_task_message=call_context.task_message, + delta=ToolRequestDelta( + tool_call_id=call_data['call_id'], + name=call_data['name'], + arguments_delta=delta, + type="tool_request", + ), + type="delta", + )) + except Exception as e: + logger.warning(f"Failed to send tool request delta: {e}") logger.debug(f"[TemporalStreamingModel] Function call args delta: {delta[:50]}...") elif isinstance(event, ResponseFunctionCallArgumentsDoneEvent): @@ -874,6 +906,42 @@ async def get_response( ) output_items.append(tool_call) + # Emit the final ToolRequestContent and close the + # per-call streaming context. If the model produced + # invalid JSON args (truncation, hallucination), fall + # back to an empty dict so the streaming layer can + # still persist a message. + call_context = call_data.get('context') + if call_context is not None: + raw_args = call_data['arguments'] or '' + try: + parsed_args = json.loads(raw_args) if raw_args else {} + except json.JSONDecodeError: + logger.warning( + f"Failed to parse tool call arguments for {call_data['name']} " + f"(raw_args_bytes={len(raw_args)})" + ) + parsed_args = {} + try: + await call_context.stream_update(StreamTaskMessageFull( + parent_task_message=call_context.task_message, + content=ToolRequestContent( + author="agent", + tool_call_id=call_data['call_id'], + name=call_data['name'], + arguments=parsed_args, + ), + type="full", + )) + except Exception as e: + logger.warning(f"Failed to send tool request full update: {e}") + try: + await call_context.close() + except Exception as e: + logger.warning(f"Failed to close tool request context: {e}") + finally: + call_data['context'] = None + elif isinstance(event, ResponseReasoningSummaryPartAddedEvent): # New reasoning part/summary started - reset accumulator part = getattr(event, 'part', None) @@ -907,6 +975,17 @@ async def get_response( await streaming_context.close() streaming_context = None + # Defensive: close any function call contexts that didn't see a + # ResponseOutputItemDoneEvent (truncated stream, error mid-call). + for call_data in function_calls_in_progress.values(): + call_context = call_data.get('context') + if call_context is not None: + try: + await call_context.close() + except Exception as e: + logger.warning(f"Failed to close orphaned tool request context: {e}") + call_data['context'] = None + # Build the response from output items collected during streaming # Create output from the items we collected response_output = [] diff --git a/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py b/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py index 97dda0e61..26c0b7c4b 100644 --- a/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py +++ b/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py @@ -12,8 +12,11 @@ from openai.types.responses import ( ResponseCompletedEvent, ResponseTextDeltaEvent, + ResponseOutputItemDoneEvent, ResponseOutputItemAddedEvent, + ResponseFunctionCallArgumentsDoneEvent, ResponseReasoningSummaryTextDeltaEvent, + ResponseFunctionCallArgumentsDeltaEvent, ) @@ -851,6 +854,197 @@ async def test_missing_task_id_error(self, streaming_model): ) +class TestStreamingModelFunctionCallArgsStreaming: + """Verify ``ResponseFunctionCallArgumentsDeltaEvent``s are surfaced as + ``ToolRequestDelta`` updates and that a final ``ToolRequestContent`` Full is + emitted on ``ResponseOutputItemDoneEvent``. + + Without this, write-heavy tools (``write_file``, ``apply_patch``) buffer their + entire argument body inside ``invoke_model_activity`` and the UI sees a + multi-second freeze while the model is actively producing tokens. + """ + + @staticmethod + def _build_function_call_stream(arguments_text: str): + """Construct a streaming event sequence for a single function_call. + + Mirrors the production order: Added → N × ArgumentsDelta → ArgumentsDone + → OutputItemDone → ResponseCompleted. ``spec=...`` makes ``isinstance`` + dispatch in production work without triggering pydantic validation. + """ + call_item = MagicMock() + call_item.type = "function_call" + call_item.id = "fc_abc" + call_item.call_id = "call_abc" + call_item.name = "write_file" + call_item.arguments = "" + + item_added = MagicMock(spec=ResponseOutputItemAddedEvent) + item_added.item = call_item + item_added.output_index = 0 + + # Split the argument text into a few chunks to exercise the per-delta loop + chunk_size = max(1, len(arguments_text) // 3) if arguments_text else 1 + chunks = [arguments_text[i:i + chunk_size] for i in range(0, len(arguments_text), chunk_size)] or [""] + delta_events = [] + for chunk in chunks: + ev = MagicMock(spec=ResponseFunctionCallArgumentsDeltaEvent) + ev.delta = chunk + ev.output_index = 0 + delta_events.append(ev) + + args_done = MagicMock(spec=ResponseFunctionCallArgumentsDoneEvent) + args_done.arguments = arguments_text + args_done.output_index = 0 + + item_done = MagicMock(spec=ResponseOutputItemDoneEvent) + item_done.item = call_item + item_done.output_index = 0 + + completed = MagicMock(spec=ResponseCompletedEvent) + completed.response = MagicMock(output=[], usage=MagicMock(), id=None) + + return [item_added, *delta_events, args_done, item_done, completed], chunks + + @staticmethod + def _install_real_task_message(mock_adk_streaming, task_id: str): + """Replace the autouse fixture's MagicMock ``task_message`` with a real + ``TaskMessage`` so production's ``StreamTaskMessageDelta(parent_task_message=...)`` + construction passes pydantic validation. The default mock works for tests + that only assert on the context's ``__aenter__`` call but breaks tests + that exercise ``stream_update`` end-to-end. + """ + from agentex.types.task_message import TaskMessage + from agentex.types.task_message_content import ToolRequestContent + + ctx = mock_adk_streaming.streaming_task_message_context.return_value + ctx.task_message = TaskMessage( + id="msg_test", + task_id=task_id, + content=ToolRequestContent( + author="agent", + tool_call_id="call_abc", + name="write_file", + arguments={}, + ), + streaming_status="IN_PROGRESS", + ) + return ctx + + @pytest.mark.asyncio + async def test_function_call_emits_argument_deltas_and_final_full( + self, streaming_model, mock_adk_streaming, _streaming_context_vars, sample_task_id + ): + """A function_call with well-formed JSON args should produce: + (1) one streaming context opened with ``ToolRequestContent`` initial_content, + (2) one ``StreamTaskMessageDelta`` per ``ArgumentsDelta`` event carrying a + ``ToolRequestDelta`` with the right ``tool_call_id`` and ``arguments_delta``, + (3) one final ``StreamTaskMessageFull`` with ``ToolRequestContent`` whose + ``arguments`` is the parsed JSON dict. + """ + from agentex.types.task_message_delta import ToolRequestDelta + from agentex.types.task_message_update import StreamTaskMessageFull, StreamTaskMessageDelta + from agentex.types.task_message_content import ToolRequestContent + + ctx = self._install_real_task_message(mock_adk_streaming, sample_task_id) + + args_text = '{"path": "/tmp/foo.txt", "contents": "hello world"}' + events, chunks = self._build_function_call_stream(args_text) + + mock_stream = AsyncMock() + mock_stream.__aiter__.return_value = iter(events) + streaming_model.client.responses.create = AsyncMock(return_value=mock_stream) + + await streaming_model.get_response( + system_instructions=None, + input="please write foo", + model_settings=ModelSettings(), + tools=[], + output_schema=None, + handoffs=[], + tracing=None, + ) + + # 1. A streaming context was opened with ToolRequestContent. + opens = [ + c for c in mock_adk_streaming.streaming_task_message_context.call_args_list + if isinstance(c.kwargs.get("initial_content"), ToolRequestContent) + ] + assert len(opens) == 1, f"expected one ToolRequest context, got {len(opens)}" + initial = opens[0].kwargs["initial_content"] + assert initial.tool_call_id == "call_abc" + assert initial.name == "write_file" + + # 2. One StreamTaskMessageDelta(ToolRequestDelta) was streamed per + # ArgumentsDelta event, preserving the delta text exactly. + delta_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageDelta) + and isinstance(call.args[0].delta, ToolRequestDelta)) + ] + assert len(delta_updates) == len(chunks) + for update, expected_chunk in zip(delta_updates, chunks): + assert update.delta.tool_call_id == "call_abc" + assert update.delta.name == "write_file" + assert update.delta.arguments_delta == expected_chunk + + # 3. A final StreamTaskMessageFull(ToolRequestContent) was streamed with + # parsed args. + full_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageFull) + and isinstance(call.args[0].content, ToolRequestContent)) + ] + assert len(full_updates) == 1 + final = full_updates[0].content + assert final.tool_call_id == "call_abc" + assert final.name == "write_file" + assert final.arguments == {"path": "/tmp/foo.txt", "contents": "hello world"} + + @pytest.mark.asyncio + async def test_function_call_malformed_args_fall_back_to_empty_dict( + self, streaming_model, mock_adk_streaming, _streaming_context_vars, sample_task_id, caplog + ): + """If the model produces invalid JSON for the args, the final + ``ToolRequestContent`` should carry ``arguments={}`` and a warning should + be logged. The raw delta stream is preserved either way. + """ + from agentex.types.task_message_update import StreamTaskMessageFull + from agentex.types.task_message_content import ToolRequestContent + + ctx = self._install_real_task_message(mock_adk_streaming, sample_task_id) + + # Missing closing brace — invalid JSON. + events, _ = self._build_function_call_stream('{"path": "/tmp/foo.txt", "contents":') + + mock_stream = AsyncMock() + mock_stream.__aiter__.return_value = iter(events) + streaming_model.client.responses.create = AsyncMock(return_value=mock_stream) + + with caplog.at_level("WARNING"): + await streaming_model.get_response( + system_instructions=None, + input="please write foo", + model_settings=ModelSettings(), + tools=[], + output_schema=None, + handoffs=[], + tracing=None, + ) + + full_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageFull) + and isinstance(call.args[0].content, ToolRequestContent)) + ] + assert len(full_updates) == 1 + assert full_updates[0].content.arguments == {} + assert any("Failed to parse tool call arguments" in r.getMessage() for r in caplog.records) + + class TestStreamingModelUsageResponseIdAndCacheKey: """Cover real-Usage capture, real response_id, span emission, and opt-in prompt_cache_key.""" diff --git a/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py b/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py index 98d50546b..448d013e9 100644 --- a/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py +++ b/src/agentex/lib/core/tracing/processors/agentex_tracing_processor.py @@ -1,3 +1,4 @@ +import os import asyncio import weakref from typing import TYPE_CHECKING, Any, Dict, override @@ -5,6 +6,7 @@ from agentex import Agentex from agentex.types.span import Span from agentex.lib.types.tracing import AgentexTracingProcessorConfig +from agentex.lib.utils.logging import make_logger from agentex.lib.adk.utils._modules.client import create_async_agentex_client from agentex.lib.core.tracing.processors.tracing_processor_interface import ( SyncTracingProcessor, @@ -14,28 +16,88 @@ if TYPE_CHECKING: from agentex import AsyncAgentex +logger = make_logger(__name__) + + +# NOTE: This is the Agentex-backend toggle (writes to the agentex `spans` +# table via the Agentex API). It is intentionally SEPARATE from the SGP/EGP +# processor's ``AGENTEX_TRACING_SKIP_SPAN_START`` so the two backends can be +# controlled independently. +_SKIP_SPAN_START_ENV = "AGENTEX_TRACING_SKIP_AGENTEX_SPAN_START" + + +def _skip_span_start_enabled() -> bool: + """Whether to skip the Agentex span-start write and persist each span only on end. + + The Agentex processor otherwise writes every span twice: a ``spans.create`` + on start (no ``end_time``/``output`` yet) and a ``spans.update`` on end. + The start row is overwritten by the end write moments later, so persisting + it doubles the per-span HTTP/DB write volume against the Agentex control + plane — the load that timed out span-start activities and pressured the + Agentex Postgres connection pool under load. + + When enabled (the default), the start write is skipped and the END write + becomes a single ``spans.create`` carrying the complete span — one INSERT + per span instead of an INSERT + UPDATE. (A plain ``spans.update`` on end + would 404 because the row was never created.) + + Default ON. Set ``AGENTEX_TRACING_SKIP_AGENTEX_SPAN_START`` to + ``0``/``false``/``no``/``off`` to restore the start write — e.g. if you + need in-flight spans visible before they complete, or spans that never end + (process crash) to still be persisted. + """ + raw = os.environ.get(_SKIP_SPAN_START_ENV, "1").strip().lower() + return raw not in ("0", "false", "no", "off") + + +def _create_kwargs(span: Span) -> Dict[str, Any]: + """Full-span kwargs for ``spans.create`` — used on start (skip disabled) and + on end (skip enabled, single-INSERT path).""" + return { + "name": span.name, + "start_time": span.start_time, + "end_time": span.end_time, + "id": span.id, + "trace_id": span.trace_id, + "parent_id": span.parent_id, + "input": span.input, + "output": span.output, + "data": span.data, + "task_id": span.task_id, + } + class AgentexSyncTracingProcessor(SyncTracingProcessor): def __init__(self, config: AgentexTracingProcessorConfig): # noqa: ARG002 self.client = Agentex() + # Capture the skip decision once at init: both halves of a span's + # lifecycle MUST agree, otherwise a start-skip + end-update lands on a + # non-existent row (404) — or the reverse double-creates. Re-reading the + # env per event would let a mid-span toggle (tests, config reload) split + # the decision. Deploy-time flag, so a single read is correct. + self._skip_span_start = _skip_span_start_enabled() + logger.info( + "Agentex tracing span-start write %s (%s)", + "disabled — end-only ingest" if self._skip_span_start else "enabled", + _SKIP_SPAN_START_ENV, + ) @override def on_span_start(self, span: Span) -> None: - self.client.spans.create( - name=span.name, - start_time=span.start_time, - end_time=span.end_time, - trace_id=span.trace_id, - id=span.id, - data=span.data, - input=span.input, - output=span.output, - parent_id=span.parent_id, - task_id=span.task_id, - ) + # End-only ingest: by default the start write is skipped (see + # _skip_span_start_enabled) so each span is persisted once, on end. + if self._skip_span_start: + return + self.client.spans.create(**_create_kwargs(span)) @override def on_span_end(self, span: Span) -> None: + # End-only ingest: the start create was skipped, so persist the complete + # span as a single INSERT here (a bare spans.update would 404 — no row). + if self._skip_span_start: + self.client.spans.create(**_create_kwargs(span)) + return + update: Dict[str, Any] = {} if span.trace_id: update["trace_id"] = span.trace_id @@ -82,6 +144,17 @@ def __init__(self, config: AgentexTracingProcessorConfig): # noqa: ARG002 self._clients_by_loop: weakref.WeakKeyDictionary[ asyncio.AbstractEventLoop, "AsyncAgentex" ] = weakref.WeakKeyDictionary() + # Capture the skip decision once at init: both halves of a span's + # lifecycle MUST agree, otherwise a start-skip + end-update lands on a + # non-existent row (404) — or the reverse double-creates. Re-reading the + # env per event would let a mid-span toggle (tests, config reload) split + # the decision. Deploy-time flag, so a single read is correct. + self._skip_span_start = _skip_span_start_enabled() + logger.info( + "Agentex tracing span-start write %s (%s)", + "disabled — end-only ingest" if self._skip_span_start else "enabled", + _SKIP_SPAN_START_ENV, + ) def _build_client(self) -> "AsyncAgentex": import httpx @@ -111,21 +184,20 @@ def client(self) -> "AsyncAgentex": # https://linear.app/scale-epd/issue/AGX1-199/add-agentex-batch-endpoint-for-traces @override async def on_span_start(self, span: Span) -> None: - await self.client.spans.create( - name=span.name, - start_time=span.start_time, - end_time=span.end_time, - id=span.id, - trace_id=span.trace_id, - parent_id=span.parent_id, - input=span.input, - output=span.output, - data=span.data, - task_id=span.task_id, - ) + # End-only ingest: by default the start write is skipped (see + # _skip_span_start_enabled) so each span is persisted once, on end. + if self._skip_span_start: + return + await self.client.spans.create(**_create_kwargs(span)) @override async def on_span_end(self, span: Span) -> None: + # End-only ingest: the start create was skipped, so persist the complete + # span as a single INSERT here (a bare spans.update would 404 — no row). + if self._skip_span_start: + await self.client.spans.create(**_create_kwargs(span)) + return + update: Dict[str, Any] = {} if span.trace_id: update["trace_id"] = span.trace_id diff --git a/src/agentex/lib/sdk/state_machine/state_machine.py b/src/agentex/lib/sdk/state_machine/state_machine.py index f1e5c4239..5679a6bd8 100644 --- a/src/agentex/lib/sdk/state_machine/state_machine.py +++ b/src/agentex/lib/sdk/state_machine/state_machine.py @@ -113,6 +113,7 @@ async def reset_to_initial_state(self): """ Reset the state machine to its initial state. """ + span = None if self._trace_transitions: if self._task_id is None: raise ValueError( @@ -126,7 +127,7 @@ async def reset_to_initial_state(self): await self.transition(self._initial_state) - if self._trace_transitions: + if self._trace_transitions and span is not None: span.output = {"output_state": self._initial_state} # type: ignore[assignment,union-attr] await adk.tracing.end_span(trace_id=self._task_id, span=span) diff --git a/src/agentex/lib/sdk/utils/webhooks.py b/src/agentex/lib/sdk/utils/webhooks.py new file mode 100644 index 000000000..d4b7b43e1 --- /dev/null +++ b/src/agentex/lib/sdk/utils/webhooks.py @@ -0,0 +1,389 @@ +"""Drive an agent turn from an inbound webhook, inside a forward-route handler. + +The Agentex server already exposes a webhook ingress: a request to +``/agents/forward/name/{agent}/{path}`` is signature-verified (GitHub ``sha256=`` / +Slack ``v0:`` HMAC via the agent's registered keys) and proxied to the agent's own +HTTP route. This helper is what that route handler calls to turn the inbound payload +into an agent turn — without each agent re-implementing payload shaping, config +resolution, session continuity, and reply handling. + +Typical use inside an agent:: + + from fastapi import Request + from agentex.lib.sdk.utils.webhooks import handle_webhook + + + @acp.post("/github-pr") + async def github_pr(request: Request): + body = await request.json() + result = await handle_webhook( + agent_name="my-agent", + payload=body, + acp_type="sync", + shaper="github_pr", + params_source="https:///public/v5/agent_configs//resolve", + params_source_headers={"x-api-key": ..., "x-selected-account-id": ...}, + wait=True, + ) + return {"task_id": result.task_id, "reply": result.reply} + +Config-by-id: pass ``params_source`` pointing at the platform's config-resolve +endpoint; the resolved params (e.g. system_prompt / harness / model / tools) are +forwarded opaquely to ``task/create``. Or pass inline ``params`` for a one-off. +""" + +from __future__ import annotations + +import json +import hashlib +from typing import Any, Literal +from dataclasses import field, dataclass +from collections.abc import Mapping, Callable, Awaitable + +from agentex.lib import adk +from agentex.lib.utils.logging import make_logger +from agentex.types.task_message_content import TextContent + +logger = make_logger(__name__) + +# Injectable params fetcher (url -> JSON). Default uses httpx; tests inject a fake. +ParamsFetcher = Callable[[str], Awaitable[dict[str, Any]]] + +MAX_BODY_CHARS = 4000 +MAX_DIFF_CHARS = 30000 + + +class WebhookError(RuntimeError): + """Raised when a webhook turn cannot be driven (e.g. params resolution failed).""" + + +@dataclass +class WebhookResult: + task_id: str + # Sync agents reply inline. For async agents, ``reply`` is None unless ``wait`` was + # set, in which case it is the polled reply (or None if it didn't settle in time). + reply: str | None = None + task_metadata: dict[str, str] = field(default_factory=dict) + + +# --------------------------------------------------------------------------- shaping + + +def session_key(agent_name: str, channel: str, peer_id: str) -> str: + """Stable per-conversation task name → reused for get-or-create on task/create, so + repeat events from the same source fold into one task instead of spawning new ones.""" + basis = peer_id or "main" + digest = hashlib.sha1(f"{agent_name}:{channel}:{basis}".encode()).hexdigest()[:16] + return f"wh-{channel}-{digest}" + + +# Top-level fields a generic webhook payload might carry its prompt in, in priority +# order. Matched case-insensitively against the payload's keys. +GENERIC_PROMPT_KEYS = ( + "text", + "message", + "prompt", + "goal", + "content", + "body", + "description", + "title", +) + + +def render_generic(body: dict[str, Any]) -> str: + """Generic payload → prompt text: first non-empty string among GENERIC_PROMPT_KEYS + (case-insensitive), else raw JSON.""" + lowered = {key.lower(): value for key, value in body.items() if isinstance(key, str)} + for key in GENERIC_PROMPT_KEYS: + value = lowered.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + return json.dumps(body, indent=2)[:8000] + + +def shape_github_pr(body: dict[str, Any]) -> tuple[str, str | None, str]: + """Shape a GitHub/Gitea pull-request webhook into (prompt, peer_id, sender). + + ``peer_id`` is ``repo#number`` so repeated events for the same PR (opened, + synchronize, ...) fold into one task. Falls back to generic rendering for non-PR + payloads (ping, issue, ...). + """ + pull_request = body.get("pull_request") + if not isinstance(pull_request, dict): + return render_generic(body), None, _github_actor(body) + + repo = _repo_full_name(body) + number = pull_request.get("number") + title = (pull_request.get("title") or "").strip() + action = (body.get("action") or "").strip() + description = (pull_request.get("body") or "").strip() + html_url = pull_request.get("html_url") or pull_request.get("url") + + header = "Pull request" + if repo and number is not None: + header = f"Pull request {repo}#{number}" + elif number is not None: + header = f"Pull request #{number}" + + lines = [f"{header}: {title}" if title else header] + if action: + lines.append(f"Action: {action}") + if html_url: + lines.append(f"URL: {html_url}") + if description: + lines.extend(["", "Description:", description[:MAX_BODY_CHARS]]) + + diff = _inline_diff(body, pull_request) + if diff: + lines.extend(["", "Diff:", diff[:MAX_DIFF_CHARS]]) + else: + # Standard GitHub/Gitea payloads carry a diff/patch URL, not the patch body. + # Surface it so a tool-enabled agent (or the caller) can fetch the diff; inline + # `diff` wins. Gitea sends patch_url alongside diff_url, so accept either. + diff_url = pull_request.get("diff_url") or pull_request.get("patch_url") + if diff_url: + lines.extend(["", f"Diff URL: {diff_url}"]) + + peer_id = None + if repo and number is not None: + peer_id = f"{repo}#{number}" + elif number is not None: + peer_id = f"pr#{number}" + return "\n".join(lines), peer_id, _github_actor(body) + + +def _repo_full_name(body: dict[str, Any]) -> str | None: + repo = body.get("repository") + if isinstance(repo, dict) and isinstance(repo.get("full_name"), str): + return repo["full_name"] or None + return None + + +def _github_actor(body: dict[str, Any]) -> str: + sender = body.get("sender") + if isinstance(sender, dict) and isinstance(sender.get("login"), str) and sender["login"]: + return sender["login"] + return "webhook" + + +def _inline_diff(body: dict[str, Any], pull_request: dict[str, Any]) -> str | None: + for source in (body, pull_request): + diff = source.get("diff") + if isinstance(diff, str) and diff.strip(): + return diff.strip() + return None + + +# ------------------------------------------------------------------- params resolution + + +async def _default_fetch(url: str, headers: dict[str, str]) -> dict[str, Any]: + """GET a params source over HTTP. Imported lazily so callers that only pass inline + params carry no httpx dependency.""" + import httpx + + request_headers = {"accept": "application/json", **headers} + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(url, headers=request_headers) + response.raise_for_status() + return response.json() + except httpx.HTTPError as exc: + raise WebhookError(f"params source request failed: {exc}") from exc + except ValueError as exc: # json.JSONDecodeError subclasses ValueError + raise WebhookError(f"params source returned invalid JSON: {exc}") from exc + + +async def resolve_remote_params( + url: str, + headers: dict[str, str] | None = None, + *, + fetch: ParamsFetcher | None = None, +) -> tuple[dict[str, Any], dict[str, str]]: + """Fetch params (+ optional task_metadata) from a config-resolve URL. + + Response shape (lenient):: + + {"params": {...}, "task_metadata": {...}} + + A bare object with no ``params`` key is treated as the params dict itself (minus a + top-level ``task_metadata``, which is returned separately for stamping). + """ + do_fetch = fetch or (lambda u: _default_fetch(u, headers or {})) + payload = await do_fetch(url) + if not isinstance(payload, dict): + raise WebhookError("params source returned a non-object response") + + metadata_raw = payload.get("task_metadata") + task_metadata = {str(k): str(v) for k, v in metadata_raw.items()} if isinstance(metadata_raw, dict) else {} + params = payload.get("params") + if not isinstance(params, dict): + params = {k: v for k, v in payload.items() if k != "task_metadata"} + return params, task_metadata + + +# ------------------------------------------------------------------------- dispatch + + +def _agent_reply_text(messages: object) -> str | None: + """Join agent-authored text from a message list (sync result or polled stream).""" + if not isinstance(messages, list): + return None + parts = [] + for message in messages: + content = getattr(message, "content", None) + if ( + content is not None + and getattr(content, "author", None) == "agent" + and getattr(content, "type", None) == "text" + ): + text = (getattr(content, "content", "") or "").strip() + if text: + parts.append(text) + return "\n\n".join(parts) if parts else None + + +async def handle_webhook( + *, + agent_name: str, + payload: dict[str, Any], + acp_type: Literal["sync", "async"] = "sync", + shaper: Literal["generic", "github_pr"] = "generic", + channel: str | None = None, + params: dict[str, Any] | None = None, + params_source: str | None = None, + params_source_headers: dict[str, str] | None = None, + peer_id: str | None = None, + extra_task_metadata: dict[str, str] | None = None, + wait: bool = False, + fetch: ParamsFetcher | None = None, +) -> WebhookResult: + """Drive an agent turn from a webhook payload, agent-side, via the ADK client. + + - Shapes the payload (generic or GitHub PR) into a prompt + conversation scope. + - Resolves task params: inline ``params``, or fetched from ``params_source`` + (config-by-id). The platform never interprets params — they're forwarded to the + agent as ``task/create`` params. + - Get-or-creates a task keyed on a stable session key, so repeat events fold in. + - Sends the turn (sync → message/send returns the reply inline; async → event/send, + with optional ``wait`` to poll for the reply). + """ + channel = channel or shaper + if shaper == "github_pr": + text, derived_peer, sender = shape_github_pr(payload) + peer_id = peer_id or derived_peer + else: + text, sender = render_generic(payload), "webhook" + + task_metadata: dict[str, str] = {"channel": channel, "sender_id": sender} + if peer_id: + task_metadata["peer_id"] = peer_id + + resolved_params = dict(params) if params else {} + if params_source: + resolved_params, source_metadata = await resolve_remote_params( + params_source, params_source_headers, fetch=fetch + ) + # Source metadata + caller extras never override the canonical fields above. + for key, value in {**source_metadata, **(extra_task_metadata or {})}.items(): + task_metadata.setdefault(key, str(value)) + elif extra_task_metadata: + for key, value in extra_task_metadata.items(): + task_metadata.setdefault(key, str(value)) + + name = session_key(agent_name, channel, peer_id or "") + # task/create carries only name/params (CreateTaskParams has no task_metadata field), + # so we create first, then stamp task_metadata via a follow-up update below. + task = await adk.acp.create_task( + name=name, + agent_name=agent_name, + params=resolved_params or None, + ) + + # Best-effort: stamp the resolved task_metadata (channel/sender/peer_id, plus the + # display_name etc. from params_source) onto the task so it's labeled in the UI. + # Failure must never break the run — the metadata is also returned on the result. + if task_metadata: + try: + merged_task_metadata = { + **_task_metadata_dict(getattr(task, "task_metadata", None)), + **task_metadata, + } + await adk.tasks.update(task_id=task.id, task_metadata=merged_task_metadata) + except Exception: + logger.warning("Failed to stamp task_metadata on task %s", task.id, exc_info=True) + + content = TextContent(author="user", content=text, format="markdown") + + if acp_type == "sync": + messages = await adk.acp.send_message(task_id=task.id, agent_name=agent_name, content=content) + return WebhookResult(task_id=task.id, reply=_agent_reply_text(messages), task_metadata=task_metadata) + + # Async: when we'll wait for the reply, snapshot existing message ids BEFORE the + # event so a reused task's prior reply (session continuity) isn't mistaken for it. + if wait: + seen_ids, seen_count = await _message_snapshot(task.id) + await adk.acp.send_event(task_id=task.id, agent_name=agent_name, content=content) + reply = await _await_reply(task.id, seen_ids, seen_count=seen_count) + else: + await adk.acp.send_event(task_id=task.id, agent_name=agent_name, content=content) + reply = None + return WebhookResult(task_id=task.id, reply=reply, task_metadata=task_metadata) + + +def _task_metadata_dict(value: object) -> dict[str, Any]: + if isinstance(value, Mapping): + return dict(value) + return {} + + +async def _message_snapshot(task_id: str) -> tuple[set[str], int]: + messages = await adk.messages.list(task_id=task_id) + messages = messages or [] + return {mid for m in messages if (mid := getattr(m, "id", None)) is not None}, len(messages) + + +async def _message_ids(task_id: str) -> set[str]: + # Only track real ids. Keeping None in the set would let a later id-less message + # collide with it and be wrongly treated as already-seen (dropping a fresh reply). + seen_ids, _ = await _message_snapshot(task_id) + return seen_ids + + +async def _await_reply( + task_id: str, + seen_ids: set[str | None], + *, + seen_count: int | None = None, + timeout_s: float = 120.0, + interval_s: float = 2.0, + quiescence_s: float = 6.0, +) -> str | None: + """Poll for THIS turn's reply — agent text in messages that weren't present before + the event — until it settles (unchanged for ``quiescence_s``) or times out. Filtering + on new message ids avoids returning a stale prior reply on a reused task.""" + import asyncio + + waited = 0.0 + last: str | None = None + stable_for = 0.0 + while waited < timeout_s: + await asyncio.sleep(interval_s) + waited += interval_s + messages = await adk.messages.list(task_id=task_id) + new = [] + for index, message in enumerate(messages or []): + mid = getattr(message, "id", None) + if mid is not None and mid not in seen_ids: + new.append(message) + elif mid is None and seen_count is not None and index >= seen_count: + new.append(message) + text = _agent_reply_text(new) + if text and text == last: + stable_for += interval_s + if stable_for >= quiescence_s: + return text + elif text: + last, stable_for = text, 0.0 + return last diff --git a/src/agentex/resources/__init__.py b/src/agentex/resources/__init__.py index 00e0bfea8..43dbdbdb4 100644 --- a/src/agentex/resources/__init__.py +++ b/src/agentex/resources/__init__.py @@ -56,6 +56,14 @@ MessagesResourceWithStreamingResponse, AsyncMessagesResourceWithStreamingResponse, ) +from .webhooks import ( + WebhooksResource, + AsyncWebhooksResource, + WebhooksResourceWithRawResponse, + AsyncWebhooksResourceWithRawResponse, + WebhooksResourceWithStreamingResponse, + AsyncWebhooksResourceWithStreamingResponse, +) from .checkpoints import ( CheckpointsResource, AsyncCheckpointsResource, @@ -128,4 +136,10 @@ "AsyncCheckpointsResourceWithRawResponse", "CheckpointsResourceWithStreamingResponse", "AsyncCheckpointsResourceWithStreamingResponse", + "WebhooksResource", + "AsyncWebhooksResource", + "WebhooksResourceWithRawResponse", + "AsyncWebhooksResourceWithRawResponse", + "WebhooksResourceWithStreamingResponse", + "AsyncWebhooksResourceWithStreamingResponse", ] diff --git a/src/agentex/resources/tasks.py b/src/agentex/resources/tasks.py index 67704e36b..56ad84d2c 100644 --- a/src/agentex/resources/tasks.py +++ b/src/agentex/resources/tasks.py @@ -554,6 +554,7 @@ def update_by_id( self, task_id: str, *, + merge_params: Optional[Dict[str, object]] | Omit = omit, task_metadata: Optional[Dict[str, object]] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -578,7 +579,13 @@ def update_by_id( raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") return self._put( path_template("/tasks/{task_id}", task_id=task_id), - body=maybe_transform({"task_metadata": task_metadata}, task_update_by_id_params.TaskUpdateByIDParams), + body=maybe_transform( + { + "merge_params": merge_params, + "task_metadata": task_metadata, + }, + task_update_by_id_params.TaskUpdateByIDParams, + ), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -589,6 +596,7 @@ def update_by_name( self, task_name: str, *, + merge_params: Optional[Dict[str, object]] | Omit = omit, task_metadata: Optional[Dict[str, object]] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -613,7 +621,13 @@ def update_by_name( raise ValueError(f"Expected a non-empty value for `task_name` but received {task_name!r}") return self._put( path_template("/tasks/name/{task_name}", task_name=task_name), - body=maybe_transform({"task_metadata": task_metadata}, task_update_by_name_params.TaskUpdateByNameParams), + body=maybe_transform( + { + "merge_params": merge_params, + "task_metadata": task_metadata, + }, + task_update_by_name_params.TaskUpdateByNameParams, + ), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -1136,6 +1150,7 @@ async def update_by_id( self, task_id: str, *, + merge_params: Optional[Dict[str, object]] | Omit = omit, task_metadata: Optional[Dict[str, object]] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -1161,7 +1176,11 @@ async def update_by_id( return await self._put( path_template("/tasks/{task_id}", task_id=task_id), body=await async_maybe_transform( - {"task_metadata": task_metadata}, task_update_by_id_params.TaskUpdateByIDParams + { + "merge_params": merge_params, + "task_metadata": task_metadata, + }, + task_update_by_id_params.TaskUpdateByIDParams, ), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -1173,6 +1192,7 @@ async def update_by_name( self, task_name: str, *, + merge_params: Optional[Dict[str, object]] | Omit = omit, task_metadata: Optional[Dict[str, object]] | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -1198,7 +1218,11 @@ async def update_by_name( return await self._put( path_template("/tasks/name/{task_name}", task_name=task_name), body=await async_maybe_transform( - {"task_metadata": task_metadata}, task_update_by_name_params.TaskUpdateByNameParams + { + "merge_params": merge_params, + "task_metadata": task_metadata, + }, + task_update_by_name_params.TaskUpdateByNameParams, ), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout diff --git a/src/agentex/resources/webhooks.py b/src/agentex/resources/webhooks.py new file mode 100644 index 000000000..f565a7870 --- /dev/null +++ b/src/agentex/resources/webhooks.py @@ -0,0 +1,242 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal + +import httpx + +from ..types import webhook_create_webhook_trigger_params +from .._types import Body, Omit, Query, Headers, NotGiven, omit, not_given +from .._utils import maybe_transform, async_maybe_transform +from .._compat import cached_property +from .._resource import SyncAPIResource, AsyncAPIResource +from .._response import ( + to_raw_response_wrapper, + to_streamed_response_wrapper, + async_to_raw_response_wrapper, + async_to_streamed_response_wrapper, +) +from .._base_client import make_request_options +from ..types.webhook_create_webhook_trigger_response import WebhookCreateWebhookTriggerResponse + +__all__ = ["WebhooksResource", "AsyncWebhooksResource"] + + +class WebhooksResource(SyncAPIResource): + @cached_property + def with_raw_response(self) -> WebhooksResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/scaleapi/scale-agentex-python#accessing-raw-response-data-eg-headers + """ + return WebhooksResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> WebhooksResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/scaleapi/scale-agentex-python#with_streaming_response + """ + return WebhooksResourceWithStreamingResponse(self) + + def create_webhook_trigger( + self, + *, + agent_name: str, + forward_path: str, + name: str, + base_url: Optional[str] | Omit = omit, + secret: Optional[str] | Omit = omit, + source: Literal["internal", "external", "github", "slack"] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> WebhookCreateWebhookTriggerResponse: + """ + Wire a webhook trigger in one call. + + Registers the source's signature-verification key (github/slack) for the agent + and returns the ready-to-paste forward webhook URL plus the signing secret + (shown once). The webhook then flows through the existing /agents/forward + ingress, which verifies the signature against this key. Bundles the existing + key-create + URL composition so a UI (or a curl) can set up a trigger without + two steps. + + Args: + agent_name: The agent the webhook drives. + + forward_path: Subpath the agent's own route handles, e.g. 'github-pr/'. Appended to + /agents/forward/name/{agent_name}/ to form the webhook URL. + + name: Signature-lookup key: the repo full_name (github) or api_app_id (slack) that the + forward ingress matches the incoming webhook against. + + base_url: Optional public agentex base URL for the returned webhook_url; defaults to the + AGENTEX_PUBLIC_URL env var. + + secret: Signing secret. For GitHub, omit to generate one, or provide an existing webhook + secret. For Slack, this is required and must be the Slack app's Signing Secret. + + source: Webhook source whose signature is verified (github or slack). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + return self._post( + "/agent_api_keys/webhook-trigger", + body=maybe_transform( + { + "agent_name": agent_name, + "forward_path": forward_path, + "name": name, + "base_url": base_url, + "secret": secret, + "source": source, + }, + webhook_create_webhook_trigger_params.WebhookCreateWebhookTriggerParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=WebhookCreateWebhookTriggerResponse, + ) + + +class AsyncWebhooksResource(AsyncAPIResource): + @cached_property + def with_raw_response(self) -> AsyncWebhooksResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/scaleapi/scale-agentex-python#accessing-raw-response-data-eg-headers + """ + return AsyncWebhooksResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncWebhooksResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/scaleapi/scale-agentex-python#with_streaming_response + """ + return AsyncWebhooksResourceWithStreamingResponse(self) + + async def create_webhook_trigger( + self, + *, + agent_name: str, + forward_path: str, + name: str, + base_url: Optional[str] | Omit = omit, + secret: Optional[str] | Omit = omit, + source: Literal["internal", "external", "github", "slack"] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> WebhookCreateWebhookTriggerResponse: + """ + Wire a webhook trigger in one call. + + Registers the source's signature-verification key (github/slack) for the agent + and returns the ready-to-paste forward webhook URL plus the signing secret + (shown once). The webhook then flows through the existing /agents/forward + ingress, which verifies the signature against this key. Bundles the existing + key-create + URL composition so a UI (or a curl) can set up a trigger without + two steps. + + Args: + agent_name: The agent the webhook drives. + + forward_path: Subpath the agent's own route handles, e.g. 'github-pr/'. Appended to + /agents/forward/name/{agent_name}/ to form the webhook URL. + + name: Signature-lookup key: the repo full_name (github) or api_app_id (slack) that the + forward ingress matches the incoming webhook against. + + base_url: Optional public agentex base URL for the returned webhook_url; defaults to the + AGENTEX_PUBLIC_URL env var. + + secret: Signing secret. For GitHub, omit to generate one, or provide an existing webhook + secret. For Slack, this is required and must be the Slack app's Signing Secret. + + source: Webhook source whose signature is verified (github or slack). + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + return await self._post( + "/agent_api_keys/webhook-trigger", + body=await async_maybe_transform( + { + "agent_name": agent_name, + "forward_path": forward_path, + "name": name, + "base_url": base_url, + "secret": secret, + "source": source, + }, + webhook_create_webhook_trigger_params.WebhookCreateWebhookTriggerParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=WebhookCreateWebhookTriggerResponse, + ) + + +class WebhooksResourceWithRawResponse: + def __init__(self, webhooks: WebhooksResource) -> None: + self._webhooks = webhooks + + self.create_webhook_trigger = to_raw_response_wrapper( + webhooks.create_webhook_trigger, + ) + + +class AsyncWebhooksResourceWithRawResponse: + def __init__(self, webhooks: AsyncWebhooksResource) -> None: + self._webhooks = webhooks + + self.create_webhook_trigger = async_to_raw_response_wrapper( + webhooks.create_webhook_trigger, + ) + + +class WebhooksResourceWithStreamingResponse: + def __init__(self, webhooks: WebhooksResource) -> None: + self._webhooks = webhooks + + self.create_webhook_trigger = to_streamed_response_wrapper( + webhooks.create_webhook_trigger, + ) + + +class AsyncWebhooksResourceWithStreamingResponse: + def __init__(self, webhooks: AsyncWebhooksResource) -> None: + self._webhooks = webhooks + + self.create_webhook_trigger = async_to_streamed_response_wrapper( + webhooks.create_webhook_trigger, + ) diff --git a/src/agentex/types/__init__.py b/src/agentex/types/__init__.py index f04daeb3b..8bb76fa04 100644 --- a/src/agentex/types/__init__.py +++ b/src/agentex/types/__init__.py @@ -85,3 +85,9 @@ from .checkpoint_delete_thread_params import CheckpointDeleteThreadParams as CheckpointDeleteThreadParams from .message_list_paginated_response import MessageListPaginatedResponse as MessageListPaginatedResponse from .deployment_history_list_response import DeploymentHistoryListResponse as DeploymentHistoryListResponse +from .webhook_create_webhook_trigger_params import ( + WebhookCreateWebhookTriggerParams as WebhookCreateWebhookTriggerParams, +) +from .webhook_create_webhook_trigger_response import ( + WebhookCreateWebhookTriggerResponse as WebhookCreateWebhookTriggerResponse, +) diff --git a/src/agentex/types/task_update_by_id_params.py b/src/agentex/types/task_update_by_id_params.py index 8b0f04f11..8d6aa6516 100644 --- a/src/agentex/types/task_update_by_id_params.py +++ b/src/agentex/types/task_update_by_id_params.py @@ -9,4 +9,6 @@ class TaskUpdateByIDParams(TypedDict, total=False): + merge_params: Optional[Dict[str, object]] + task_metadata: Optional[Dict[str, object]] diff --git a/src/agentex/types/task_update_by_name_params.py b/src/agentex/types/task_update_by_name_params.py index 07d48df9b..20e1a624c 100644 --- a/src/agentex/types/task_update_by_name_params.py +++ b/src/agentex/types/task_update_by_name_params.py @@ -9,4 +9,6 @@ class TaskUpdateByNameParams(TypedDict, total=False): + merge_params: Optional[Dict[str, object]] + task_metadata: Optional[Dict[str, object]] diff --git a/src/agentex/types/webhook_create_webhook_trigger_params.py b/src/agentex/types/webhook_create_webhook_trigger_params.py new file mode 100644 index 000000000..f6a1358bc --- /dev/null +++ b/src/agentex/types/webhook_create_webhook_trigger_params.py @@ -0,0 +1,42 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["WebhookCreateWebhookTriggerParams"] + + +class WebhookCreateWebhookTriggerParams(TypedDict, total=False): + agent_name: Required[str] + """The agent the webhook drives.""" + + forward_path: Required[str] + """Subpath the agent's own route handles, e.g. + + 'github-pr/'. Appended to /agents/forward/name/{agent_name}/ to form + the webhook URL. + """ + + name: Required[str] + """ + Signature-lookup key: the repo full_name (github) or api_app_id (slack) that the + forward ingress matches the incoming webhook against. + """ + + base_url: Optional[str] + """ + Optional public agentex base URL for the returned webhook_url; defaults to the + AGENTEX_PUBLIC_URL env var. + """ + + secret: Optional[str] + """Signing secret. + + For GitHub, omit to generate one, or provide an existing webhook secret. For + Slack, this is required and must be the Slack app's Signing Secret. + """ + + source: Literal["internal", "external", "github", "slack"] + """Webhook source whose signature is verified (github or slack).""" diff --git a/src/agentex/types/webhook_create_webhook_trigger_response.py b/src/agentex/types/webhook_create_webhook_trigger_response.py new file mode 100644 index 000000000..745ce68a1 --- /dev/null +++ b/src/agentex/types/webhook_create_webhook_trigger_response.py @@ -0,0 +1,31 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from .._models import BaseModel + +__all__ = ["WebhookCreateWebhookTriggerResponse"] + + +class WebhookCreateWebhookTriggerResponse(BaseModel): + agent_name: str + """The agent the webhook drives.""" + + key_id: str + """The created agent API key id.""" + + name: str + """Signature-lookup key (repo full_name / api_app_id).""" + + secret: str + """The signing secret — shown once; paste into the source's webhook config.""" + + source: Literal["internal", "external", "github", "slack"] + """Webhook source (github or slack).""" + + webhook_path: str + """The forward path to POST webhooks to.""" + + webhook_url: Optional[str] = None + """Full webhook URL to paste into the source (None if no base URL configured).""" diff --git a/tests/api_resources/test_tasks.py b/tests/api_resources/test_tasks.py index 0e70529dd..766175fa4 100644 --- a/tests/api_resources/test_tasks.py +++ b/tests/api_resources/test_tasks.py @@ -657,6 +657,7 @@ def test_method_update_by_id(self, client: Agentex) -> None: def test_method_update_by_id_with_all_params(self, client: Agentex) -> None: task = client.tasks.update_by_id( task_id="task_id", + merge_params={"foo": "bar"}, task_metadata={"foo": "bar"}, ) assert_matches_type(Task, task, path=["response"]) @@ -708,6 +709,7 @@ def test_method_update_by_name(self, client: Agentex) -> None: def test_method_update_by_name_with_all_params(self, client: Agentex) -> None: task = client.tasks.update_by_name( task_name="task_name", + merge_params={"foo": "bar"}, task_metadata={"foo": "bar"}, ) assert_matches_type(Task, task, path=["response"]) @@ -1384,6 +1386,7 @@ async def test_method_update_by_id(self, async_client: AsyncAgentex) -> None: async def test_method_update_by_id_with_all_params(self, async_client: AsyncAgentex) -> None: task = await async_client.tasks.update_by_id( task_id="task_id", + merge_params={"foo": "bar"}, task_metadata={"foo": "bar"}, ) assert_matches_type(Task, task, path=["response"]) @@ -1435,6 +1438,7 @@ async def test_method_update_by_name(self, async_client: AsyncAgentex) -> None: async def test_method_update_by_name_with_all_params(self, async_client: AsyncAgentex) -> None: task = await async_client.tasks.update_by_name( task_name="task_name", + merge_params={"foo": "bar"}, task_metadata={"foo": "bar"}, ) assert_matches_type(Task, task, path=["response"]) diff --git a/tests/api_resources/test_webhooks.py b/tests/api_resources/test_webhooks.py new file mode 100644 index 000000000..ff32dd719 --- /dev/null +++ b/tests/api_resources/test_webhooks.py @@ -0,0 +1,131 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +import os +from typing import Any, cast + +import pytest + +from agentex import Agentex, AsyncAgentex +from agentex.types import WebhookCreateWebhookTriggerResponse + +from ..utils import assert_matches_type + +base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") + + +class TestWebhooks: + parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + def test_method_create_webhook_trigger(self, client: Agentex) -> None: + webhook = client.webhooks.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + def test_method_create_webhook_trigger_with_all_params(self, client: Agentex) -> None: + webhook = client.webhooks.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + base_url="base_url", + secret="secret", + source="internal", + ) + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + def test_raw_response_create_webhook_trigger(self, client: Agentex) -> None: + response = client.webhooks.with_raw_response.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + webhook = response.parse() + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + def test_streaming_response_create_webhook_trigger(self, client: Agentex) -> None: + with client.webhooks.with_streaming_response.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + webhook = response.parse() + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + assert cast(Any, response.is_closed) is True + + +class TestAsyncWebhooks: + parametrize = pytest.mark.parametrize( + "async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"] + ) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + async def test_method_create_webhook_trigger(self, async_client: AsyncAgentex) -> None: + webhook = await async_client.webhooks.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + async def test_method_create_webhook_trigger_with_all_params(self, async_client: AsyncAgentex) -> None: + webhook = await async_client.webhooks.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + base_url="base_url", + secret="secret", + source="internal", + ) + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + async def test_raw_response_create_webhook_trigger(self, async_client: AsyncAgentex) -> None: + response = await async_client.webhooks.with_raw_response.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + webhook = await response.parse() + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + @pytest.mark.skip(reason="Mock server tests are disabled") + @parametrize + async def test_streaming_response_create_webhook_trigger(self, async_client: AsyncAgentex) -> None: + async with async_client.webhooks.with_streaming_response.create_webhook_trigger( + agent_name="agent_name", + forward_path="forward_path", + name="name", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + webhook = await response.parse() + assert_matches_type(WebhookCreateWebhookTriggerResponse, webhook, path=["response"]) + + assert cast(Any, response.is_closed) is True diff --git a/tests/lib/adk/providers/test_openai_activities.py b/tests/lib/adk/providers/test_openai_activities.py index c933b6ce4..2f89308a9 100644 --- a/tests/lib/adk/providers/test_openai_activities.py +++ b/tests/lib/adk/providers/test_openai_activities.py @@ -335,23 +335,61 @@ async def mock_stream_events(): expected_params.tools = [CodeInterpreterTool(tool_config={"type": "code_interpreter"})] self._assert_starting_agent_params(starting_agent, expected_params) - # Verify streaming context received tool request and response updates - # Should have been called twice - once for tool request, once for response - assert mock_streaming_context.stream_update.call_count == 2 + # Under the unified harness, the OpenAI events are converted to canonical + # StreamTaskMessageFull events and auto_send posts each full tool message + # by opening a streaming context with the content as initial_content and + # closing it (no stream_update). So assert on the opened contents. + opened = mock_streaming_context.opened_contents + tool_contents = [c for c in opened if getattr(c, "type", None) in ("tool_request", "tool_response")] + assert len(tool_contents) == 2 + + # First opened context is the tool request. + first = tool_contents[0] + assert first.type == "tool_request" + assert first.name == "code_interpreter" + assert first.tool_call_id == "code_interpreter_call_123" + + # Second opened context is the tool response. + second = tool_contents[1] + assert second.type == "tool_response" + assert second.tool_call_id == "code_interpreter_call_123" - # First call should be tool request - first_call = mock_streaming_context.stream_update.call_args_list[0] - first_update = first_call[1]["update"] # keyword argument - assert hasattr(first_update, "content") - assert first_update.content.name == "code_interpreter" - assert first_update.content.tool_call_id == "code_interpreter_call_123" + @patch("agents.Runner.run_streamed") + async def test_run_agent_streamed_auto_send_forwards_previous_response_id(self, mock_runner_run_streamed): + """previous_response_id must reach Runner.run_streamed so a Responses-API + conversation continues instead of silently starting fresh.""" + from agentex.lib.core.temporal.activities.adk.providers.openai_activities import ( + RunAgentStreamedAutoSendParams, + ) - # Second call should be tool response - second_call = mock_streaming_context.stream_update.call_args_list[1] - second_update = second_call[1]["update"] # keyword argument - assert hasattr(second_update, "content") - assert second_update.content.name == "code_interpreter_call" - assert second_update.content.tool_call_id == "code_interpreter_call_123" + mock_streaming_result = self._create_streaming_result_mock() + + async def _no_events(): + return + yield + + mock_streaming_result.stream_events = _no_events + mock_runner_run_streamed.return_value = mock_streaming_result + + mock_tracer = self._create_mock_tracer() + openai_service, openai_activities, env = self._create_test_setup(mock_tracer) + self._setup_streaming_service_mocks(openai_service) + + params = RunAgentStreamedAutoSendParams( + input_list=[{"role": "user", "content": "continue"}], + mcp_server_params=[], + agent_name="test_agent", + agent_instructions="You are a helpful assistant", + trace_id="test-trace-id", + parent_span_id="test-span-id", + task_id="test-task-id", + previous_response_id="response_123", + ) + + await env.run(openai_activities.run_agent_streamed_auto_send, params) + + mock_runner_run_streamed.assert_called_once() + assert mock_runner_run_streamed.call_args.kwargs.get("previous_response_id") == "response_123" def _create_mock_tracer(self): """Helper method to create a properly mocked tracer with async context manager support.""" @@ -613,6 +651,60 @@ def _assert_tools_conversion(self, starting_agent, tools_case, _original_tools): else: raise ValueError(f"Unknown tools_case: {tools_case}") + @patch("agents.Runner.run_streamed") + async def test_run_agent_streamed_auto_send_forwards_created_at(self, mock_runner_run_streamed): + """created_at is forwarded to every streaming context opened by auto_send_turn (AGX1-378).""" + from datetime import datetime, timezone + + from agentex.lib.core.temporal.activities.adk.providers.openai_activities import ( + RunAgentStreamedAutoSendParams, + ) + + deterministic_ts = datetime(2025, 1, 15, 12, 0, 0, tzinfo=timezone.utc) + + mock_streaming_result = self._create_streaming_result_mock() + + # Emit a tool call + tool response so auto_send actually opens streaming + # contexts; an empty stream opens none, making the assertion below + # vacuously true and unable to catch a created_at regression. + async def mock_stream_events(): + tool_call_event = Mock() + tool_call_event.type = "run_item_stream_event" + tool_call_event.item = self._create_tool_call_item_mock(self._create_code_interpreter_tool_call_mock()) + yield tool_call_event + + tool_response_event = Mock() + tool_response_event.type = "run_item_stream_event" + tool_response_event.item = self._create_tool_output_item_mock() + yield tool_response_event + + mock_streaming_result.stream_events = mock_stream_events + mock_runner_run_streamed.return_value = mock_streaming_result + + mock_tracer = self._create_mock_tracer() + openai_service, openai_activities, env = self._create_test_setup(mock_tracer) + mock_ctx, recorded_created_ats = self._setup_streaming_service_mocks_with_created_at(openai_service) + + params = RunAgentStreamedAutoSendParams( + input_list=[{"role": "user", "content": "hello"}], + mcp_server_params=[], + agent_name="test_agent", + agent_instructions="You are a helpful assistant", + trace_id="test-trace-id", + parent_span_id="test-span-id", + task_id="test-task-id", + created_at=deterministic_ts, + ) + + await env.run(openai_activities.run_agent_streamed_auto_send, params) + + # Guard against a vacuous pass: at least one streaming context must have + # been opened so the per-context created_at assertion is meaningful. + assert recorded_created_ats, "expected at least one streaming context to be opened" + assert all(ts == deterministic_ts for ts in recorded_created_ats), ( + f"Expected all streaming contexts to receive created_at={deterministic_ts!r}, got: {recorded_created_ats!r}" + ) + def _setup_streaming_service_mocks(self, openai_service): """Helper method to setup streaming service mocks for run_agent_auto_send.""" from unittest.mock import AsyncMock @@ -635,21 +727,64 @@ def _setup_streaming_service_mocks(self, openai_service): mock_streaming_context.task_message = mock_task_message mock_streaming_context.stream_update = AsyncMock() + # Record the initial_content passed to each opened streaming context. + # The unified harness auto_send path posts full tool messages by opening + # a context with initial_content and closing it (no stream_update), so + # assertions inspect the opened contents rather than stream_update calls. + opened_contents: list = [] + # Create a proper async context manager mock from contextlib import asynccontextmanager from unittest.mock import AsyncMock @asynccontextmanager - async def mock_streaming_context_manager(*_args, **_kwargs): + async def mock_streaming_context_manager(*_args, **kwargs): + if "initial_content" in kwargs: + opened_contents.append(kwargs["initial_content"]) yield mock_streaming_context mock_streaming_service.streaming_task_message_context = mock_streaming_context_manager + # Expose the recorded contents on the returned context mock for assertions. + mock_streaming_context.opened_contents = opened_contents openai_service.streaming_service = mock_streaming_service openai_service.agentex_client = mock_agentex_client return mock_streaming_context + def _setup_streaming_service_mocks_with_created_at(self, openai_service): + """Like _setup_streaming_service_mocks but also records every created_at kwarg.""" + from contextlib import asynccontextmanager + from unittest.mock import AsyncMock + + from agentex.types.task_message import TaskMessage + + mock_streaming_service = AsyncMock() + mock_agentex_client = AsyncMock() + + mock_streaming_context = AsyncMock() + mock_task_message = Mock(spec=TaskMessage) + mock_task_message.id = "test-task-message-id" + mock_task_message.task_id = "test-task-id" + mock_task_message.content = {"type": "text", "content": "test"} + mock_streaming_context.task_message = mock_task_message + mock_streaming_context.stream_update = AsyncMock() + + recorded_created_ats: list = [] + + @asynccontextmanager + async def mock_ctx_manager(*_args, **kwargs): + recorded_created_ats.append(kwargs.get("created_at")) + yield mock_streaming_context + + mock_streaming_service.streaming_task_message_context = mock_ctx_manager + mock_streaming_context.opened_contents = [] + + openai_service.streaming_service = mock_streaming_service + openai_service.agentex_client = mock_agentex_client + + return mock_streaming_context, recorded_created_ats + def _create_code_interpreter_tool_call_mock(self, call_id="code_interpreter_call_123"): """Helper to create ResponseCodeInterpreterToolCall mock objects.""" return ResponseCodeInterpreterToolCall( @@ -680,6 +815,9 @@ def _create_streaming_result_mock(self, final_output="Code executed successfully mock_streaming_result = Mock(spec=RunResultStreaming) mock_streaming_result.final_output = final_output mock_streaming_result.new_items = [] + # OpenAITurn reads raw_responses after stream exhaustion to aggregate + # usage; provide an empty list so usage normalizes to model-only. + mock_streaming_result.raw_responses = [] mock_streaming_result.final_input_list = [ {"role": "user", "content": "Run some Python code"}, {"role": "assistant", "content": final_output}, diff --git a/tests/lib/adk/providers/test_openai_turn.py b/tests/lib/adk/providers/test_openai_turn.py new file mode 100644 index 000000000..47a9ba9fe --- /dev/null +++ b/tests/lib/adk/providers/test_openai_turn.py @@ -0,0 +1,248 @@ +"""Tests for OpenAITurn and its usage mapping. + +OpenAITurn adapts an OpenAI Agents SDK streamed run onto the harness +``HarnessTurn`` protocol. These tests cover: +- ``openai_usage_to_turn_usage`` (full usage, None, real zeros) +- ``_aggregate_usage`` (empty, single, multiple ModelResponses) +- ``OpenAITurn.events`` driven by an injected canonical stream (bypassing the + OpenAI->canonical converter), plus ``usage()`` before/after exhaustion +- the ``ValueError`` guard when neither ``result`` nor ``stream`` is supplied +""" + +import types as _types + +import pytest +from agents.usage import Usage +from openai.types.responses.response_usage import InputTokensDetails, OutputTokensDetails + +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + + +def _import_target(): + from agentex.lib.adk.providers._modules.openai_turn import ( + OpenAITurn, + _aggregate_usage, + openai_usage_to_turn_usage, + ) + + return OpenAITurn, _aggregate_usage, openai_usage_to_turn_usage + + +# --------------------------------------------------------------------------- +# openai_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +def test_usage_mapping_full(): + _, _, openai_usage_to_turn_usage = _import_target() + usage = Usage( + requests=3, + input_tokens=100, + input_tokens_details=InputTokensDetails(cached_tokens=20), + output_tokens=50, + output_tokens_details=OutputTokensDetails(reasoning_tokens=10), + total_tokens=150, + ) + turn_usage = openai_usage_to_turn_usage(usage, model="gpt-4o") + + assert turn_usage.model == "gpt-4o" + assert turn_usage.num_llm_calls == 3 + assert turn_usage.input_tokens == 100 + assert turn_usage.cached_input_tokens == 20 + assert turn_usage.output_tokens == 50 + assert turn_usage.reasoning_tokens == 10 + assert turn_usage.total_tokens == 150 + + +def test_usage_mapping_none_usage(): + _, _, openai_usage_to_turn_usage = _import_target() + turn_usage = openai_usage_to_turn_usage(None, model="gpt-4o") + + assert turn_usage.model == "gpt-4o" + # num_llm_calls is None ("not reported") when no usage is present, matching + # the token fields below; a real 0 is only reported when the provider says so. + assert turn_usage.num_llm_calls is None + assert turn_usage.input_tokens is None + assert turn_usage.output_tokens is None + assert turn_usage.total_tokens is None + + +def test_usage_mapping_real_zeros_are_preserved(): + # A cache hit can legitimately produce 0 output tokens; a present-but-zero + # value must survive as 0, not be coerced to None. + _, _, openai_usage_to_turn_usage = _import_target() + usage = Usage( + requests=1, + input_tokens=0, + input_tokens_details=InputTokensDetails(cached_tokens=0), + output_tokens=0, + output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + total_tokens=0, + ) + turn_usage = openai_usage_to_turn_usage(usage, model="m") + + assert turn_usage.input_tokens == 0 + assert turn_usage.cached_input_tokens == 0 + assert turn_usage.output_tokens == 0 + assert turn_usage.reasoning_tokens == 0 + assert turn_usage.total_tokens == 0 + assert turn_usage.num_llm_calls == 1 + + +# --------------------------------------------------------------------------- +# _aggregate_usage +# --------------------------------------------------------------------------- + + +def _resp(usage): + return _types.SimpleNamespace(usage=usage) + + +def test_aggregate_usage_empty(): + _, _aggregate_usage, _ = _import_target() + assert _aggregate_usage([]) is None + + +def test_aggregate_usage_single(): + _, _aggregate_usage, _ = _import_target() + usage = Usage(requests=1, input_tokens=10, output_tokens=5, total_tokens=15) + total = _aggregate_usage([_resp(usage)]) + + assert total is not None + assert total.requests == 1 + assert total.input_tokens == 10 + assert total.output_tokens == 5 + assert total.total_tokens == 15 + + +def test_aggregate_usage_multiple(): + _, _aggregate_usage, _ = _import_target() + u1 = Usage( + requests=1, + input_tokens=10, + input_tokens_details=InputTokensDetails(cached_tokens=2), + output_tokens=5, + output_tokens_details=OutputTokensDetails(reasoning_tokens=1), + total_tokens=15, + ) + u2 = Usage( + requests=2, + input_tokens=20, + input_tokens_details=InputTokensDetails(cached_tokens=3), + output_tokens=7, + output_tokens_details=OutputTokensDetails(reasoning_tokens=4), + total_tokens=27, + ) + # A response without usage must be skipped, not crash the aggregation. + total = _aggregate_usage([_resp(u1), _resp(None), _resp(u2)]) + + assert total is not None + assert total.requests == 3 + assert total.input_tokens == 30 + assert total.output_tokens == 12 + assert total.total_tokens == 42 + assert total.input_tokens_details.cached_tokens == 5 + assert total.output_tokens_details.reasoning_tokens == 5 + + +# --------------------------------------------------------------------------- +# OpenAITurn.events / usage / construction +# --------------------------------------------------------------------------- + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_turn_events_forwards_injected_stream(): + OpenAITurn, _, _ = _import_target() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + + out = [e async for e in turn.events] + assert out == events + + +@pytest.mark.asyncio +async def test_turn_usage_before_and_after_exhaustion_with_injected_stream(): + OpenAITurn, _, _ = _import_target() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + + # Before exhaustion: usage carries only the model name. + before = turn.usage() + assert before.model == "gpt-4o" + assert before.input_tokens is None + + async for _ in turn.events: + pass + + # With an injected stream there is no run to read usage from, so usage + # stays model-only after exhaustion. + after = turn.usage() + assert after.model == "gpt-4o" + assert after.input_tokens is None + + +@pytest.mark.asyncio +async def test_turn_usage_populated_from_result_after_exhaustion(): + OpenAITurn, _, _ = _import_target() + + canonical = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDone(type="done", index=0), + ] + + class _FakeResult: + def __init__(self): + self.raw_responses = [ + _resp(Usage(requests=1, input_tokens=8, output_tokens=4, total_tokens=12)), + ] + + def stream_events(self): + # OpenAITurn passes this to convert_openai_to_agentex_events; we + # monkeypatch that converter below so this can yield canonical events. + return _canonical_stream(canonical) + + import agentex.lib.adk.providers._modules.openai_turn as mod + + async def _passthrough(stream): + async for e in stream: + yield e + + original = mod.convert_openai_to_agentex_events + mod.convert_openai_to_agentex_events = _passthrough + try: + turn = OpenAITurn(result=_FakeResult(), model="gpt-4o") + out = [e async for e in turn.events] + finally: + mod.convert_openai_to_agentex_events = original + + assert out == canonical + usage = turn.usage() + assert usage.model == "gpt-4o" + assert usage.num_llm_calls == 1 + assert usage.input_tokens == 8 + assert usage.output_tokens == 4 + assert usage.total_tokens == 12 + + +def test_turn_requires_result_or_stream(): + OpenAITurn, _, _ = _import_target() + with pytest.raises(ValueError, match="either"): + OpenAITurn() diff --git a/tests/lib/adk/test_claude_code_sync.py b/tests/lib/adk/test_claude_code_sync.py new file mode 100644 index 000000000..6dd36d973 --- /dev/null +++ b/tests/lib/adk/test_claude_code_sync.py @@ -0,0 +1,637 @@ +"""Tests for the claude-code stream-json -> Agentex StreamTaskMessage* converter.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta +from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +# --------------------------------------------------------------------------- +# Text content +# --------------------------------------------------------------------------- + + +class TestTextContent: + async def test_text_block_in_assistant_message_emits_start_delta_done(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello world"}]}, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 3 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[0].content, TextContent) + assert out[0].content.content == "" + assert isinstance(out[1], StreamTaskMessageDelta) + assert isinstance(out[1].delta, TextDelta) + assert out[1].delta.text_delta == "Hello world" + assert isinstance(out[2], StreamTaskMessageDone) + assert out[0].index == out[1].index == out[2].index + + async def test_empty_text_block_is_skipped(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": ""}]}, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + async def test_streamed_text_via_stream_event_emits_start_deltas_done(self): + envelopes = [ + { + "type": "stream_event", + "event": {"type": "content_block_start", "index": 0, "content_block": {"type": "text"}}, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "Hello"}, + }, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " world"}, + }, + }, + { + "type": "stream_event", + "event": {"type": "content_block_stop", "index": 0}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, TextContent) + assert len(deltas) == 2 + assert isinstance(deltas[0].delta, TextDelta) + assert deltas[0].delta.text_delta == "Hello" + assert isinstance(deltas[1].delta, TextDelta) + assert deltas[1].delta.text_delta == " world" + assert len(dones) == 1 + + async def test_streamed_text_not_re_emitted_by_assistant_block(self): + """After stream_event triple, the final assistant block must not re-emit the text.""" + envelopes = [ + { + "type": "stream_event", + "event": { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "text"}, + }, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "streamed"}, + }, + }, + { + "type": "stream_event", + "event": {"type": "content_block_stop", "index": 0}, + }, + # Final assistant message with same text — must NOT be re-emitted + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "streamed"}]}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + text_starts = [e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, TextContent)] + assert len(text_starts) == 1, "Text block must not be emitted twice" + + async def test_later_turn_non_streamed_text_not_dropped(self): + """A non-streamed text block in a later turn must not be dropped because an + earlier turn streamed a block at the same index.""" + envelopes = [ + # Turn 1: streamed text at index 0 (dedup'd against the materialised msg). + { + "type": "stream_event", + "event": {"type": "content_block_start", "index": 0, "content_block": {"type": "text"}}, + }, + { + "type": "stream_event", + "event": {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "first"}}, + }, + {"type": "stream_event", "event": {"type": "content_block_stop", "index": 0}}, + {"type": "assistant", "message": {"content": [{"type": "text", "text": "first"}]}}, + # Turn 2: a NON-streamed text block, also at index 0. + {"type": "assistant", "message": {"content": [{"type": "text", "text": "second"}]}}, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + deltas = [ + e.delta.text_delta for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, TextDelta) + ] + assert deltas == ["first", "second"], "Later turn's non-streamed text must still be delivered" + + +# --------------------------------------------------------------------------- +# Thinking / reasoning content +# --------------------------------------------------------------------------- + + +class TestThinkingContent: + async def test_thinking_block_emits_reasoning_start_delta_done(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "thinking", "thinking": "Let me reason..."}]}, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 3 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[0].content, ReasoningContent) + # Summary must be populated from the thinking text + assert out[0].content.summary == ["Let me reason..."] + assert isinstance(out[1], StreamTaskMessageDelta) + assert isinstance(out[1].delta, ReasoningContentDelta) + assert out[1].delta.content_delta == "Let me reason..." + assert out[1].delta.content_index == 0 + assert isinstance(out[2], StreamTaskMessageDone) + + async def test_empty_thinking_block_is_skipped(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "thinking", "thinking": ""}]}, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + async def test_streamed_thinking_emits_reasoning_start_deltas_done(self): + envelopes = [ + { + "type": "stream_event", + "event": { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "thinking"}, + }, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "thinking_delta", "thinking": "step one"}, + }, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "thinking_delta", "thinking": " step two"}, + }, + }, + { + "type": "stream_event", + "event": {"type": "content_block_stop", "index": 0}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent) + assert len(deltas) == 2 + assert isinstance(deltas[0].delta, ReasoningContentDelta) + assert deltas[0].delta.content_delta == "step one" + assert isinstance(deltas[1].delta, ReasoningContentDelta) + assert deltas[1].delta.content_delta == " step two" + assert len(dones) == 1 + + async def test_two_streamed_thinking_blocks_not_re_emitted(self): + """A turn that streams two thinking blocks must claim both indices, so the + final assistant envelope does not re-emit the second one.""" + + def _thinking_block(idx: int, text: str) -> list: + return [ + { + "type": "stream_event", + "event": {"type": "content_block_start", "index": idx, "content_block": {"type": "thinking"}}, + }, + { + "type": "stream_event", + "event": { + "type": "content_block_delta", + "index": idx, + "delta": {"type": "thinking_delta", "thinking": text}, + }, + }, + {"type": "stream_event", "event": {"type": "content_block_stop", "index": idx}}, + ] + + envelopes = [ + *_thinking_block(0, "first thought"), + *_thinking_block(1, "second thought"), + # Final assistant envelope repeats both thinking blocks — neither should re-emit. + { + "type": "assistant", + "message": { + "content": [ + {"type": "thinking", "thinking": "first thought"}, + {"type": "thinking", "thinking": "second thought"}, + ] + }, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + reasoning_starts = [ + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ReasoningContent) + ] + assert len(reasoning_starts) == 2, "each streamed thinking block emitted exactly once (no duplicate)" + + async def test_thinking_block_start_with_no_deltas_allows_assistant_to_fill(self): + """A thinking block_start without any deltas leaves the final assistant block + free to emit the thinking text (the block index is not claimed as streamed).""" + envelopes = [ + { + "type": "stream_event", + "event": { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "thinking"}, + }, + }, + # No thinking_delta — close block immediately + { + "type": "stream_event", + "event": {"type": "content_block_stop", "index": 0}, + }, + # Final assistant message has the thinking text + { + "type": "assistant", + "message": {"content": [{"type": "thinking", "thinking": "delayed thinking"}]}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + # The assistant block should produce a full thinking message (Start+Delta+Done) + reasoning_starts = [ + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ReasoningContent) + ] + # There will be the empty start from stream_event, plus the one from assistant block + reasoning_deltas = [ + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + assert len(reasoning_deltas) >= 1 + assert any( + isinstance(d.delta, ReasoningContentDelta) and d.delta.content_delta == "delayed thinking" + for d in reasoning_deltas + ) + + +# --------------------------------------------------------------------------- +# Tool calls and results +# --------------------------------------------------------------------------- + + +class TestToolCallsAndResults: + async def test_tool_use_block_emits_start_done(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_abc", + "name": "Bash", + "input": {"command": "ls /"}, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 2 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[0].content, ToolRequestContent) + assert out[0].content.tool_call_id == "call_abc" + assert out[0].content.name == "Bash" + assert out[0].content.arguments == {"command": "ls /"} + assert isinstance(out[1], StreamTaskMessageDone) + + async def test_tool_result_block_emits_full(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_abc", + "content": "file1.txt\nfile2.txt", + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + assert out[0].content.tool_call_id == "call_abc" + assert "file1.txt" in str(out[0].content.content) + + async def test_tool_result_list_content_joined(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tid", + "content": [ + {"type": "text", "text": "line1"}, + {"type": "text", "text": "line2"}, + ], + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + payload = str(out[0].content.content) + assert "line1" in payload + assert "line2" in payload + + async def test_tool_result_error_flag_passed_through(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "err_call", + "content": "Permission denied", + "is_error": True, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + assert isinstance(out[0].content.content, dict) + assert out[0].content.content.get("is_error") is True + + async def test_tool_result_truncation(self): + long_result = "x" * 5000 + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "t", + "content": long_result, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + result_str = out[0].content.content.get("result", "") + assert len(result_str) <= 4000 + + +# --------------------------------------------------------------------------- +# on_result callback +# --------------------------------------------------------------------------- + + +class TestOnResult: + async def test_on_result_called_with_result_envelope(self): + captured: list[dict] = [] + + async def capture(envelope): + captured.append(envelope) + + envelopes = [ + { + "type": "result", + "session_id": "sess123", + "cost_usd": 0.012, + "usage": {"input_tokens": 100, "output_tokens": 50}, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes), on_result=capture)) + + # result envelope does not emit any StreamTaskMessage + assert out == [] + assert len(captured) == 1 + assert captured[0]["session_id"] == "sess123" + assert captured[0]["cost_usd"] == pytest.approx(0.012) + + async def test_on_result_not_called_when_no_result_envelope(self): + captured: list[dict] = [] + + async def capture(envelope): + captured.append(envelope) + + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hi"}]}, + } + ] + await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes), on_result=capture)) + assert captured == [] + + async def test_no_on_result_does_not_raise(self): + envelopes = [ + { + "type": "result", + "cost_usd": 0.001, + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + ] + # Should not raise even without a callback + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + +# --------------------------------------------------------------------------- +# Message indexing +# --------------------------------------------------------------------------- + + +class TestMessageIndexing: + async def test_multiple_blocks_get_distinct_indices(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "text", "text": "First"}, + { + "type": "tool_use", + "id": "c1", + "name": "Read", + "input": {"path": "/tmp"}, + }, + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "c1", + "content": "some content", + } + ] + }, + }, + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Done"}]}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + # Gather all Start/Full events and check indices are monotonically increasing + anchors = [e for e in out if isinstance(e, (StreamTaskMessageStart, StreamTaskMessageFull))] + indices = [e.index for e in anchors] + assert indices == sorted(indices), "Indices must be monotonically increasing" + assert len(set(indices)) == len(indices), "All indices must be distinct" + + async def test_system_init_and_unknown_envelopes_produce_no_output(self): + envelopes = [ + {"type": "system", "subtype": "init", "session_id": "sess"}, + {"type": "unknown_future_type", "data": "whatever"}, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + async def test_non_json_string_lines_are_skipped(self): + lines = [ + "not json at all", + '{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}', + ] + + async def _str_iter(): + for line in lines: + yield line + + out = await _collect(convert_claude_code_to_agentex_events(_str_iter())) + assert len(out) == 3 # Start + Delta + Done for the text block + + async def test_empty_lines_are_skipped(self): + lines = ["", " ", '{"type": "system", "subtype": "init"}'] + + async def _str_iter(): + for line in lines: + yield line + + out = await _collect(convert_claude_code_to_agentex_events(_str_iter())) + assert out == [] + + +# --------------------------------------------------------------------------- +# Author +# --------------------------------------------------------------------------- + + +class TestContentAuthors: + @pytest.mark.parametrize( + "envelope", + [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "hi"}]}, + }, + { + "type": "assistant", + "message": {"content": [{"type": "thinking", "thinking": "thoughts"}]}, + }, + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "c", + "name": "t", + "input": {}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "c", + "content": "ok", + } + ] + }, + }, + ], + ) + async def test_all_content_authored_by_agent(self, envelope: dict): + out = await _collect(convert_claude_code_to_agentex_events(_aiter([envelope]))) + for e in out: + content = getattr(e, "content", None) + if content is not None and hasattr(content, "author"): + assert content.author == "agent" diff --git a/tests/lib/adk/test_claude_code_turn.py b/tests/lib/adk/test_claude_code_turn.py new file mode 100644 index 000000000..4fbb2f913 --- /dev/null +++ b/tests/lib/adk/test_claude_code_turn.py @@ -0,0 +1,283 @@ +"""Tests for ClaudeCodeTurn and claude_code_usage_to_turn_usage.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import TurnUsage, HarnessTurn +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._claude_code_turn import ( + ClaudeCodeTurn, + claude_code_usage_to_turn_usage, +) + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _drain(turn: ClaudeCodeTurn) -> list[Any]: + return [e async for e in turn.events] + + +# --------------------------------------------------------------------------- +# Usage normalization +# --------------------------------------------------------------------------- + + +class TestClaudeCodeUsageToTurnUsage: + def test_full_usage_fields(self): + result = { + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "cache_read_input_tokens": 20, + "cache_creation_input_tokens": 5, + }, + "cost_usd": 0.025, + "duration_ms": 3200, + "num_turns": 3, + } + usage = claude_code_usage_to_turn_usage(result) + + assert usage.input_tokens == 100 + assert usage.output_tokens == 50 + assert usage.cached_input_tokens == 25 # 20 + 5 + assert usage.total_tokens == 150 + assert usage.cost_usd == pytest.approx(0.025) + assert usage.duration_ms == 3200 + assert usage.num_llm_calls == 3 + + def test_total_cost_usd_fallback(self): + """total_cost_usd should be used when cost_usd is absent.""" + result = { + "usage": {"input_tokens": 10, "output_tokens": 5}, + "total_cost_usd": 0.001, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cost_usd == pytest.approx(0.001) + + def test_cost_usd_takes_precedence_over_total_cost_usd(self): + result = { + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.002, + "total_cost_usd": 0.999, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cost_usd == pytest.approx(0.002) + + def test_missing_usage_key_returns_nones(self): + result: dict[str, Any] = {} + usage = claude_code_usage_to_turn_usage(result) + assert usage.input_tokens is None + assert usage.output_tokens is None + assert usage.cached_input_tokens is None + assert usage.total_tokens is None + assert usage.cost_usd is None + assert usage.duration_ms is None + assert usage.num_llm_calls is None + + def test_real_zeros_preserved(self): + result = { + "usage": { + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + }, + "cost_usd": 0.0, + "duration_ms": 0, + "num_turns": 0, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.input_tokens == 0 + assert usage.output_tokens == 0 + assert usage.cached_input_tokens == 0 + assert usage.total_tokens == 0 + assert usage.cost_usd == pytest.approx(0.0) + assert usage.duration_ms == 0 + assert usage.num_llm_calls == 0 + + def test_only_cache_read_no_creation(self): + result = { + "usage": { + "input_tokens": 50, + "output_tokens": 25, + "cache_read_input_tokens": 15, + } + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens == 15 + + def test_only_cache_creation_no_read(self): + result = { + "usage": { + "input_tokens": 50, + "output_tokens": 25, + "cache_creation_input_tokens": 10, + } + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens == 10 + + def test_no_cache_fields_gives_none(self): + result = {"usage": {"input_tokens": 10, "output_tokens": 5}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens is None + + def test_total_tokens_computed_from_input_output(self): + result = {"usage": {"input_tokens": 70, "output_tokens": 30}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.total_tokens == 100 + + def test_missing_output_tokens_leaves_total_none(self): + result = {"usage": {"input_tokens": 70}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.total_tokens is None + + def test_returns_turn_usage_instance(self): + result = {"usage": {"input_tokens": 1, "output_tokens": 1}} + usage = claude_code_usage_to_turn_usage(result) + assert isinstance(usage, TurnUsage) + + +# --------------------------------------------------------------------------- +# ClaudeCodeTurn protocol +# --------------------------------------------------------------------------- + + +class TestClaudeCodeTurnProtocol: + def test_satisfies_harness_turn_protocol(self): + """ClaudeCodeTurn must satisfy the HarnessTurn structural protocol.""" + turn = ClaudeCodeTurn(_aiter([])) + assert isinstance(turn, HarnessTurn) + + async def test_events_yields_stream_task_messages(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hi there"}]}, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + out = await _drain(turn) + assert len(out) == 3 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[1], StreamTaskMessageDelta) + assert isinstance(out[2], StreamTaskMessageDone) + + async def test_usage_before_drain_returns_empty(self): + envelopes = [ + { + "type": "result", + "usage": {"input_tokens": 100, "output_tokens": 50}, + "cost_usd": 0.01, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + # usage() called before events drained — no result envelope yet + usage = turn.usage() + assert isinstance(usage, TurnUsage) + assert usage.input_tokens is None + + async def test_usage_after_drain_reflects_result(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "response"}]}, + }, + { + "type": "result", + "usage": {"input_tokens": 200, "output_tokens": 80}, + "cost_usd": 0.015, + "num_turns": 2, + }, + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + await _drain(turn) + usage = turn.usage() + + assert usage.input_tokens == 200 + assert usage.output_tokens == 80 + assert usage.cost_usd == pytest.approx(0.015) + assert usage.num_llm_calls == 2 + + async def test_usage_empty_when_no_result_envelope(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "no result"}]}, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + await _drain(turn) + usage = turn.usage() + assert usage.input_tokens is None + assert usage.cost_usd is None + + async def test_tool_call_and_result_round_trip(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_1", + "name": "Read", + "input": {"path": "/etc/hosts"}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_1", + "content": "127.0.0.1 localhost", + } + ] + }, + }, + { + "type": "result", + "usage": {"input_tokens": 50, "output_tokens": 20}, + "cost_usd": 0.005, + }, + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + out = await _drain(turn) + usage = turn.usage() + + tool_starts = [ + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolResponseContent) + ] + tool_fulls = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(tool_fulls) == 1 + full_content = tool_fulls[0].content + assert isinstance(full_content, ToolResponseContent) + assert full_content.tool_call_id == "call_1" + + assert usage.input_tokens == 50 + assert usage.output_tokens == 20 + + async def test_events_property_returns_same_iterator(self): + """Accessing .events multiple times returns the same iterator (not a new one each call).""" + turn = ClaudeCodeTurn(_aiter([])) + it1 = turn.events + it2 = turn.events + assert it1 is it2 diff --git a/tests/lib/adk/test_codex_sync.py b/tests/lib/adk/test_codex_sync.py new file mode 100644 index 000000000..644688dfb --- /dev/null +++ b/tests/lib/adk/test_codex_sync.py @@ -0,0 +1,720 @@ +"""Offline tests for the codex event-stream parser tap. + +Tests cover: +- Text streaming (agent_message items) +- Tool call streaming (command_execution, mcp_tool_call, file_change) +- Reasoning streaming (reasoning items) +- Multi-step turns +- Error events (top-level + item-level) +- Edge cases: empty events, non-JSON lines, unknown types +- on_result callback (session_id, usage, counters) +- file_change synthesized start (no item.started emitted by codex) +""" + +from __future__ import annotations + +import json +from typing import Any, AsyncIterator + +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.task_message_content import TextContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._codex_sync import ( + _truncate, + _tool_args_for, + _tool_name_for, + _tool_output_for, + convert_codex_to_agentex_events, +) +from agentex.types.reasoning_content_delta import ReasoningContentDelta +from agentex.types.reasoning_summary_delta import ReasoningSummaryDelta + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class TestHelpers: + def test_truncate_short(self) -> None: + assert _truncate("hello", max_len=10) == "hello" + + def test_truncate_long(self) -> None: + assert _truncate("a" * 5000) == "a" * 4000 + + def test_tool_name_command_execution(self) -> None: + assert _tool_name_for("command_execution", {}) == "bash" + + def test_tool_name_file_change(self) -> None: + assert _tool_name_for("file_change", {}) == "file_change" + + def test_tool_name_mcp_with_server_and_tool(self) -> None: + assert _tool_name_for("mcp_tool_call", {"server": "fs", "tool": "read"}) == "fs.read" + + def test_tool_name_mcp_empty(self) -> None: + assert _tool_name_for("mcp_tool_call", {}) == "mcp_tool_call" + + def test_tool_name_unknown(self) -> None: + assert _tool_name_for("", {}) == "unknown" + + def test_tool_args_command(self) -> None: + assert _tool_args_for("command_execution", {"command": "ls"}) == {"command": "ls"} + + def test_tool_args_file_change(self) -> None: + assert _tool_args_for("file_change", {"changes": ["a"]}) == {"changes": ["a"]} + + def test_tool_args_mcp_dict(self) -> None: + assert _tool_args_for("mcp_tool_call", {"arguments": {"k": "v"}}) == {"k": "v"} + + def test_tool_args_mcp_non_dict(self) -> None: + assert _tool_args_for("mcp_tool_call", {"arguments": "str"}) == {"value": "str"} + + def test_tool_output_command_success(self) -> None: + text, is_err = _tool_output_for("command_execution", {"aggregated_output": "hello", "exit_code": 0}) + assert text == "hello" + assert is_err is False + + def test_tool_output_command_error(self) -> None: + _, is_err = _tool_output_for("command_execution", {"aggregated_output": "boom", "exit_code": 1}) + assert is_err is True + + def test_tool_output_mcp_error(self) -> None: + text, is_err = _tool_output_for("mcp_tool_call", {"error": {"message": "not found"}}) + assert "not found" in text + assert is_err is True + + def test_tool_output_mcp_result(self) -> None: + text, is_err = _tool_output_for("mcp_tool_call", {"result": {"data": 1}}) + assert json.loads(text) == {"data": 1} + assert is_err is False + + def test_tool_output_file_change_failed(self) -> None: + _, is_err = _tool_output_for("file_change", {"status": "failed", "changes": []}) + assert is_err is True + + def test_tool_output_file_change_ok(self) -> None: + text, is_err = _tool_output_for("file_change", {"status": "ok", "changes": [1, 2]}) + assert "2 changes" in text + assert is_err is False + + +# --------------------------------------------------------------------------- +# Text streaming +# --------------------------------------------------------------------------- + + +class TestTextStreaming: + async def test_text_start_delta_done(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "Hi"}}, + {"type": "item.updated", "item": {"id": "m1", "type": "agent_message", "text": "Hi!"}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "Hi! Done"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, TextContent) + assert len(deltas) >= 1 + all_delta_text = "".join( + d.delta.text_delta for d in deltas if isinstance(d.delta, TextDelta) and d.delta.text_delta is not None + ) + assert "Hi" in all_delta_text + assert len(dones) == 1 + + async def test_text_indices_are_monotonic(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "A"}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "A"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + anchor = [e for e in out if isinstance(e, StreamTaskMessageStart)] + done = [e for e in out if isinstance(e, StreamTaskMessageDone)] + assert anchor[0].index == done[0].index + + async def test_empty_text_no_delta(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": ""}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": ""}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + assert deltas == [] + + async def test_text_author_is_agent(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "X"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + for e in out: + content = getattr(e, "content", None) + if content and hasattr(content, "author"): + assert content.author == "agent" + + +# --------------------------------------------------------------------------- +# Tool call streaming +# --------------------------------------------------------------------------- + + +class TestToolCallStreaming: + async def test_command_execution_start_done_full(self) -> None: + events = [ + { + "type": "item.started", + "item": { + "id": "t1", + "type": "command_execution", + "command": "echo hello", + }, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "echo hello", + "aggregated_output": "hello", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, ToolRequestContent) + assert starts[0].content.name == "bash" + assert starts[0].content.arguments == {"command": "echo hello"} + assert starts[0].content.tool_call_id == "t1" + + assert len(dones) == 1 + + assert len(fulls) == 1 + assert isinstance(fulls[0].content, ToolResponseContent) + resp_content = fulls[0].content.content + assert isinstance(resp_content, dict) + assert resp_content["result"] == "hello" + assert fulls[0].content.tool_call_id == "t1" + + async def test_empty_item_id_request_response_ids_match(self) -> None: + """A tool with an empty item_id must use the SAME fallback tool_call_id + on the request (started) and response (completed) halves.""" + events = [ + {"type": "item.started", "item": {"id": "", "type": "command_execution", "command": "ls"}}, + { + "type": "item.completed", + "item": { + "id": "", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + # Pull tool_call_id inside the comprehension so the isinstance narrows the + # content union (the narrowing would not survive a later attribute access). + req_ids = [ + e.content.tool_call_id + for e in out + if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolRequestContent) + ] + resp_ids = [ + e.content.tool_call_id + for e in out + if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(req_ids) == 1 and len(resp_ids) == 1 + assert req_ids[0] == resp_ids[0] + + async def test_file_change_synthesizes_start(self) -> None: + """file_change items may only emit item.completed (no started).""" + events = [ + { + "type": "item.completed", + "item": { + "id": "fc1", + "type": "file_change", + "changes": ["a.py"], + "status": "ok", + }, + } + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + tool_req = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolRequestContent) + ] + tool_resp = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(tool_req) == 1 + assert isinstance(tool_req[0].content, ToolRequestContent) + assert tool_req[0].content.name == "file_change" + assert len(tool_resp) == 1 + + async def test_mcp_tool_call_name(self) -> None: + events = [ + { + "type": "item.started", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "fs", + "tool": "read", + "arguments": {"path": "/x"}, + }, + }, + { + "type": "item.completed", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "fs", + "tool": "read", + "arguments": {"path": "/x"}, + "result": "content", + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + req = next( + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolRequestContent) + ) + assert isinstance(req.content, ToolRequestContent) + assert req.content.name == "fs.read" + + async def test_tool_error_marks_is_error(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "cmd1", "type": "command_execution", "command": "bad"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd1", + "type": "command_execution", + "command": "bad", + "aggregated_output": "error output", + "exit_code": 127, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + resp = next( + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ) + assert isinstance(resp.content, ToolResponseContent) + resp_body = resp.content.content + assert isinstance(resp_body, dict) + assert resp_body.get("is_error") is True + + async def test_tool_indices_request_before_response(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "cmd2", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd2", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + req = next(e for e in out if isinstance(e, StreamTaskMessageStart)) + resp = next( + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ) + assert req.index is not None and resp.index is not None + assert req.index < resp.index + + +# --------------------------------------------------------------------------- +# Reasoning +# --------------------------------------------------------------------------- + + +class TestReasoningStreaming: + async def test_reasoning_start_deltas_done(self) -> None: + """A reasoning block opens with a Start, streams the final text as + summary + content deltas, and closes with a Done. + + It must NOT emit a Full at the open Start's index: auto_send routes a + Full into a throwaway streaming context (ignoring the index), which + would leave the Start context dangling and persist a duplicate, empty + reasoning message (AGX1 codex reasoning duplicate bug). + """ + events = [ + {"type": "item.started", "item": {"id": "r1", "type": "reasoning", "text": ""}}, + { + "type": "item.updated", + "item": {"id": "r1", "type": "reasoning", "text": "thinking..."}, + }, + { + "type": "item.completed", + "item": {"id": "r1", "type": "reasoning", "text": "thinking... done"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + reasoning_fulls = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ReasoningContent) + ] + content_deltas = [ + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + summary_deltas = [ + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningSummaryDelta) + ] + + # Exactly one message: Start + deltas + Done, all on the same index, no Full. + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent) + assert reasoning_fulls == [] + assert len(content_deltas) == 1 + content_delta = content_deltas[0].delta + assert isinstance(content_delta, ReasoningContentDelta) + assert content_delta.content_delta == "thinking... done" + assert len(summary_deltas) == 1 + summary_delta = summary_deltas[0].delta + assert isinstance(summary_delta, ReasoningSummaryDelta) + assert summary_delta.summary_delta == "thinking... done" + assert len(dones) == 1 + idx = starts[0].index + assert content_deltas[0].index == idx + assert summary_deltas[0].index == idx + assert dones[0].index == idx + + async def test_reasoning_no_started_opens_and_closes_one_message(self) -> None: + """If item.completed arrives without item.started, the converter opens a + Start lazily and closes it with a Done (still one clean message, no Full).""" + events = [ + { + "type": "item.completed", + "item": {"id": "r_orphan", "type": "reasoning", "text": "orphan thought"}, + } + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + reasoning_fulls = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ReasoningContent) + ] + content_deltas = [ + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent) + assert reasoning_fulls == [] + assert len(content_deltas) == 1 + content_delta = content_deltas[0].delta + assert isinstance(content_delta, ReasoningContentDelta) + assert content_delta.content_delta == "orphan thought" + assert len(dones) == 1 + assert dones[0].index == starts[0].index + + async def test_reasoning_summary_is_first_line(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "r2", "type": "reasoning", "text": ""}}, + { + "type": "item.completed", + "item": {"id": "r2", "type": "reasoning", "text": "line one\nline two"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + summary_event = next( + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningSummaryDelta) + ) + summary_delta = summary_event.delta + assert isinstance(summary_delta, ReasoningSummaryDelta) + assert summary_delta.summary_delta == "line one" + + async def test_reasoning_empty_block_closes_with_done_only(self) -> None: + """A reasoning block that completes with no text still closes its Start.""" + events = [ + {"type": "item.started", "item": {"id": "r3", "type": "reasoning", "text": ""}}, + {"type": "item.completed", "item": {"id": "r3", "type": "reasoning", "text": ""}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + + assert len(starts) == 1 + assert deltas == [] + assert len(dones) == 1 + assert dones[0].index == starts[0].index + + +# --------------------------------------------------------------------------- +# Error events +# --------------------------------------------------------------------------- + + +class TestErrorEvents: + async def test_turn_failed_emits_error_text(self) -> None: + events = [{"type": "turn.failed", "error": {"message": "context length exceeded"}}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, TextContent) + assert "context length exceeded" in out[0].content.content + + async def test_top_level_error_emits_text(self) -> None: + events = [{"type": "error", "message": "unexpected EOF"}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) == 1 + assert isinstance(out[0].content, TextContent) + assert "unexpected EOF" in out[0].content.content + + async def test_item_error_emits_on_completed_only(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "e1", "type": "error", "message": "bad"}}, + {"type": "item.completed", "item": {"id": "e1", "type": "error", "message": "bad"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + # Only item.completed emits an event for error items + assert len(out) == 1 + assert isinstance(out[0].content, TextContent) + assert "bad" in out[0].content.content + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + async def test_empty_stream(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter([]))) + assert out == [] + + async def test_non_json_lines_skipped(self) -> None: + events: list[str] = ["not json", "also not json"] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_blank_lines_skipped(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter(["", " ", "\n"]))) + assert out == [] + + async def test_pre_decoded_dict_events(self) -> None: + """Events passed as dicts (pre-decoded) should work without JSON parsing.""" + events: list[dict[str, Any]] = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + { + "type": "item.completed", + "item": {"id": "m1", "type": "agent_message", "text": "hi"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) > 0 + + async def test_thread_started_no_message(self) -> None: + events = [{"type": "thread.started", "thread_id": "t1"}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_turn_started_no_message(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter([{"type": "turn.started"}]))) + assert out == [] + + async def test_turn_completed_no_message(self) -> None: + out = await _collect( + convert_codex_to_agentex_events(_aiter([{"type": "turn.completed", "usage": {"input_tokens": 1}}])) + ) + assert out == [] + + async def test_unknown_event_type_no_message(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter([{"type": "some.future.event"}]))) + assert out == [] + + async def test_unknown_item_type_no_message(self) -> None: + out = await _collect( + convert_codex_to_agentex_events( + _aiter([{"type": "item.started", "item": {"id": "x", "type": "future_item"}}]) + ) + ) + assert out == [] + + +# --------------------------------------------------------------------------- +# on_result callback +# --------------------------------------------------------------------------- + + +class TestOnResult: + async def test_session_id_captured(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + {"type": "thread.started", "thread_id": "sess-xyz"}, + { + "type": "turn.completed", + "usage": {"input_tokens": 5, "output_tokens": 3, "total_tokens": 8}, + }, + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["session_id"] == "sess-xyz" + + async def test_usage_forwarded(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + } + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["usage"] == {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + + async def test_tool_count(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": None}, + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["tool_call_count"] == 1 + + async def test_no_callback_when_none(self) -> None: + """Passing on_result=None should not raise.""" + events = [{"type": "turn.completed", "usage": None}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=None)) + assert out == [] + + async def test_on_result_called_even_without_turn_completed(self) -> None: + """on_result fires at end of stream even if turn.completed never arrived.""" + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events: list[Any] = [] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result.get("usage") is None + assert result.get("session_id") is None + + +# --------------------------------------------------------------------------- +# Multi-step turn: tool → text +# --------------------------------------------------------------------------- + + +class TestMultiStepTurn: + async def test_tool_then_text_monotonic_indices(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "cmd1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd1", + "type": "command_execution", + "command": "ls", + "aggregated_output": "file.txt", + "exit_code": 0, + }, + }, + { + "type": "item.started", + "item": {"id": "msg1", "type": "agent_message", "text": ""}, + }, + { + "type": "item.completed", + "item": {"id": "msg1", "type": "agent_message", "text": "Done"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + indices = [e.index for e in out] + assert indices == sorted(indices), "indices must be monotonically non-decreasing" + + async def test_two_text_blocks_distinct_indices(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "a", "type": "agent_message", "text": "first"}, + }, + {"type": "item.completed", "item": {"id": "a", "type": "agent_message", "text": "first"}}, + { + "type": "item.started", + "item": {"id": "b", "type": "agent_message", "text": "second"}, + }, + {"type": "item.completed", "item": {"id": "b", "type": "agent_message", "text": "second"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 2 + assert starts[0].index != starts[1].index + + async def test_json_string_events(self) -> None: + """Events may arrive as raw newline-delimited JSON strings.""" + raw_events = [ + json.dumps({"type": "item.started", "item": {"id": "s1", "type": "agent_message", "text": "hello"}}), + json.dumps({"type": "item.completed", "item": {"id": "s1", "type": "agent_message", "text": "hello"}}), + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(raw_events))) + assert len(out) > 0 + assert any(isinstance(e, StreamTaskMessageStart) for e in out) diff --git a/tests/lib/adk/test_codex_turn.py b/tests/lib/adk/test_codex_turn.py new file mode 100644 index 000000000..f6a046478 --- /dev/null +++ b/tests/lib/adk/test_codex_turn.py @@ -0,0 +1,282 @@ +"""Offline tests for CodexTurn and codex_usage_to_turn_usage. + +Tests cover: +- TurnUsage normalization from raw codex usage dicts +- Defensive handling of missing/invalid usage fields +- CodexTurn: events property yields canonical StreamTaskMessage* +- CodexTurn: usage() before and after stream exhaustion +- CodexTurn: on_result wiring (session_id, counts propagate to usage()) +- CodexTurn satisfies HarnessTurn protocol +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import TurnUsage, HarnessTurn +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk._modules._codex_turn import ( + CodexTurn, + codex_usage_to_turn_usage, +) + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(turn: CodexTurn) -> list[Any]: + return [msg async for msg in turn.events] + + +# --------------------------------------------------------------------------- +# codex_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +class TestCodexUsageToTurnUsage: + def test_none_raw_all_none_tokens(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.input_tokens is None + assert u.output_tokens is None + assert u.total_tokens is None + assert u.cost_usd is None + + def test_empty_dict_all_none_tokens(self) -> None: + u = codex_usage_to_turn_usage({}) + assert u.input_tokens is None + assert u.output_tokens is None + + def test_standard_usage(self) -> None: + raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + u = codex_usage_to_turn_usage(raw, model="o4-mini") + assert u.input_tokens == 100 + assert u.output_tokens == 50 + assert u.total_tokens == 150 + assert u.model == "o4-mini" + + def test_reasoning_tokens(self) -> None: + raw = {"input_tokens": 200, "output_tokens": 80, "reasoning_tokens": 60, "total_tokens": 340} + u = codex_usage_to_turn_usage(raw) + assert u.reasoning_tokens == 60 + + def test_real_zero_preserved(self) -> None: + """Explicit zeros in the payload must survive (not be treated as missing).""" + raw = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + u = codex_usage_to_turn_usage(raw) + assert u.input_tokens == 0 + assert u.output_tokens == 0 + + def test_cached_input_tokens(self) -> None: + raw = {"input_tokens": 100, "cached_input_tokens": 20, "output_tokens": 40} + u = codex_usage_to_turn_usage(raw) + assert u.cached_input_tokens == 20 + + def test_invalid_token_values_become_none(self) -> None: + raw = {"input_tokens": "not_a_number", "output_tokens": None} + u = codex_usage_to_turn_usage(raw) + assert u.input_tokens is None + assert u.output_tokens is None + + def test_cost_explicit(self) -> None: + u = codex_usage_to_turn_usage(None, cost_usd=0.0042) + assert u.cost_usd == pytest.approx(0.0042) + + def test_cost_from_raw(self) -> None: + u = codex_usage_to_turn_usage({"cost_usd": 0.001}) + assert u.cost_usd == pytest.approx(0.001) + + def test_explicit_cost_overrides_raw(self) -> None: + """Explicit cost_usd kwarg takes precedence over raw dict value.""" + u = codex_usage_to_turn_usage({"cost_usd": 0.001}, cost_usd=0.002) + assert u.cost_usd == pytest.approx(0.002) + + def test_tool_and_reasoning_counts(self) -> None: + u = codex_usage_to_turn_usage(None, tool_call_count=3, reasoning_count=2) + assert u.num_tool_calls == 3 + assert u.num_reasoning_blocks == 2 + + def test_num_llm_calls_always_one(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.num_llm_calls == 1 + + def test_duration_ms(self) -> None: + u = codex_usage_to_turn_usage(None, duration_ms=1234) + assert u.duration_ms == 1234 + + def test_model_none_when_not_provided(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.model is None + + def test_non_dict_raw_treated_as_empty(self) -> None: + u = codex_usage_to_turn_usage("bad input") # type: ignore[arg-type] + assert u.input_tokens is None + + def test_returns_turn_usage_instance(self) -> None: + u = codex_usage_to_turn_usage({}) + assert isinstance(u, TurnUsage) + + +# --------------------------------------------------------------------------- +# CodexTurn protocol conformance +# --------------------------------------------------------------------------- + + +class TestCodexTurnProtocol: + def test_implements_harness_turn_protocol(self) -> None: + turn = CodexTurn(_aiter([]), model="o4-mini") + assert isinstance(turn, HarnessTurn) + + def test_usage_before_exhaustion_returns_zero_turn_usage(self) -> None: + turn = CodexTurn(_aiter([]), model="test-model") + u = turn.usage() + assert isinstance(u, TurnUsage) + assert u.model == "test-model" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +# --------------------------------------------------------------------------- +# CodexTurn events +# --------------------------------------------------------------------------- + + +class TestCodexTurnEvents: + async def test_events_yield_stream_task_messages(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + out = await _collect(turn) + assert len(out) > 0 + for msg in out: + assert isinstance( + msg, + (StreamTaskMessageStart, StreamTaskMessageDelta, StreamTaskMessageFull, StreamTaskMessageDone), + ) + + async def test_usage_after_exhaustion_has_tokens(self) -> None: + events = [ + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + } + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + u = turn.usage() + assert u.input_tokens == 10 + assert u.output_tokens == 5 + assert u.total_tokens == 15 + + async def test_usage_model_propagated(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="codex-model-x") + await _collect(turn) + assert turn.usage().model == "codex-model-x" + + async def test_tool_count_in_usage(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": None}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().num_tool_calls == 1 + + async def test_events_property_stable_across_accesses(self) -> None: + """`.events` returns the same generator; usage survives a second access.""" + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + assert turn.events is turn.events # same generator, not a fresh wrapper + await _collect(turn) + # A second access must NOT re-wrap the exhausted iterator and reset usage. + _ = turn.events + assert turn.usage().total_tokens == 15 + assert turn.usage().num_tool_calls == 1 + + async def test_reasoning_count_in_usage(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "r1", "type": "reasoning", "text": ""}}, + { + "type": "item.completed", + "item": {"id": "r1", "type": "reasoning", "text": "thought"}, + }, + {"type": "turn.completed", "usage": None}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().num_reasoning_blocks == 1 + + async def test_duration_ms_passed_through(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="o4-mini", duration_ms=999) + await _collect(turn) + assert turn.usage().duration_ms == 999 + + async def test_cost_usd_passed_through(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="o4-mini", cost_usd=0.007) + await _collect(turn) + assert turn.usage().cost_usd == pytest.approx(0.007) + + async def test_empty_stream_usage_still_valid(self) -> None: + turn = CodexTurn(_aiter([]), model="o4-mini") + await _collect(turn) + u = turn.usage() + assert isinstance(u, TurnUsage) + assert u.num_llm_calls == 1 + + async def test_reasoning_tokens_propagated(self) -> None: + events = [ + { + "type": "turn.completed", + "usage": { + "input_tokens": 100, + "output_tokens": 60, + "reasoning_tokens": 40, + "total_tokens": 200, + }, + } + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().reasoning_tokens == 40 diff --git a/tests/lib/adk/test_langgraph_async.py b/tests/lib/adk/test_langgraph_async.py new file mode 100644 index 000000000..682bd43bc --- /dev/null +++ b/tests/lib/adk/test_langgraph_async.py @@ -0,0 +1,282 @@ +"""Characterization tests for stream_langgraph_events (unified surface). + +These tests verify the behavior of ``stream_langgraph_events`` after it was +reimplemented on top of ``LangGraphTurn`` + ``UnifiedEmitter.auto_send_turn`` +(Task 4). They serve as a contract test for the public signature. + +Key behavioral notes (unified surface vs. old bespoke implementation): +- Tool calls/responses are posted via ``streaming_task_message_context`` (not + ``adk.messages.create``); they appear as contexts with no stream_update calls. +- ``final_text`` accumulates ALL text across the turn (the old bespoke impl + only returned the last text segment — behavior varied across models). + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock. +""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import StreamTaskMessageDelta +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +TASK_ID = "task-test" + + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming infrastructure (mirrors test_pydantic_ai_async.py pattern) +# --------------------------------------------------------------------------- + + +@dataclass +class FakeContext: + initial_content: Any + task_message: TaskMessage + closed: bool = False + updates: list[StreamTaskMessageDelta] = field(default_factory=list) + + async def __aenter__(self) -> "FakeContext": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool: + await self.close() + return False + + async def stream_update(self, update: StreamTaskMessageDelta) -> None: + if self.closed: + raise AssertionError("stream_update called after close") + self.updates.append(update) + + async def close(self) -> None: + self.closed = True + + +class FakeStreamingModule: + def __init__(self) -> None: + self.contexts: list[FakeContext] = [] + + def streaming_task_message_context(self, *, task_id: str, initial_content: Any, **kw: Any) -> FakeContext: + tm = TaskMessage( + id=f"m{len(self.contexts) + 1}", + task_id=task_id, + content=initial_content, + streaming_status="IN_PROGRESS", + ) + ctx = FakeContext(initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +class FakeMessagesModule: + def __init__(self) -> None: + self.created: list[dict[str, Any]] = [] + + async def create(self, *, task_id: str, content: Any) -> TaskMessage: + self.created.append({"task_id": task_id, "content": content}) + return TaskMessage( + id=f"created-{len(self.created)}", + task_id=task_id, + content=content, + streaming_status="DONE", + ) + + +@pytest.fixture +def fake_adk(monkeypatch): + from agentex.lib import adk as adk_module + + streaming = FakeStreamingModule() + messages = FakeMessagesModule() + monkeypatch.setattr(adk_module, "streaming", streaming) + monkeypatch.setattr(adk_module, "messages", messages) + return streaming, messages + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +def _text_deltas(ctx: FakeContext) -> list[str]: + out: list[str] = [] + for u in ctx.updates: + if isinstance(u.delta, TextDelta): + out.append(u.delta.text_delta or "") + return out + + +# --------------------------------------------------------------------------- +# Characterization tests (unified surface behavior) +# --------------------------------------------------------------------------- + + +class TestCharacterization: + async def test_plain_text_streams_and_returns_final_text( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import AIMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk = AIMessageChunk(content="Hello, world!") + ai_msg = AIMessage(content="Hello, world!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) + + final = await stream_langgraph_events(stream, TASK_ID) + + assert final == "Hello, world!" + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + assert isinstance(ctx.initial_content, TextContent) + assert _text_deltas(ctx) == ["Hello, world!"] + assert ctx.closed is True + # Unified surface: no messages.create for text + assert messages.created == [] + + async def test_empty_stream_returns_empty_string( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + streaming, _ = fake_adk + final = await stream_langgraph_events(_make_stream([]), TASK_ID) + assert final == "" + assert streaming.contexts == [] + + async def test_tool_call_posted_via_streaming_context( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: tool calls go through streaming_task_message_context, + not adk.messages.create. The context is opened and immediately closed + (no deltas) so the initial_content is the tool request.""" + from langchain_core.messages import AIMessage + + streaming, messages = fake_adk + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + # Unified surface: tool messages go via streaming_task_message_context + assert len(streaming.contexts) == 1 + assert messages.created == [], "Unified surface uses streaming_task_message_context, not messages.create" + + from agentex.types.tool_request_content import ToolRequestContent + + content = streaming.contexts[0].initial_content + assert isinstance(content, ToolRequestContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + # Full messages close immediately (no delta updates) + assert streaming.contexts[0].closed is True + assert streaming.contexts[0].updates == [] + + async def test_tool_response_posted_via_streaming_context( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: tool responses go through streaming_task_message_context.""" + from langchain_core.messages import ToolMessage + + streaming, messages = fake_adk + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + stream = _make_stream([("updates", {"tools": {"messages": [tool_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + assert len(streaming.contexts) == 1 + assert messages.created == [] + + from agentex.types.tool_response_content import ToolResponseContent + + content = streaming.contexts[0].initial_content + assert isinstance(content, ToolResponseContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + assert streaming.contexts[0].closed is True + + async def test_multi_step_text_then_tool_then_text_last_segment( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: final_text uses last-segment semantics. + + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + Both text contexts are still opened and streamed to Redis; only the + return value is last-segment. This matches stream_pydantic_ai_events. + """ + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk1 = AIMessageChunk(content="Looking up...") + ai_msg1 = AIMessage(content="Looking up...", tool_calls=[{"id": "c1", "name": "search", "args": {}}]) + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + stream = _make_stream( + [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + ) + + final = await stream_langgraph_events(stream, TASK_ID) + + # Last segment only — first text segment is NOT in final_text + assert final == "Found it!" + # Two text streaming contexts (one per text segment) — both streamed to Redis + text_ctxs = [c for c in streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + assert all(ctx.closed for ctx in text_ctxs) + # Tool request + tool response via streaming_task_message_context (not messages.create) + assert messages.created == [] + + async def test_context_closed_on_exception(self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule]) -> None: + from langchain_core.messages import AIMessageChunk + + streaming, _ = fake_adk + + async def _boom(): + chunk = AIMessageChunk(content="partial") + yield ("messages", (chunk, {})) + raise RuntimeError("upstream exploded") + + with pytest.raises(RuntimeError, match="upstream exploded"): + await stream_langgraph_events(_boom(), TASK_ID) + + assert streaming.contexts[0].closed is True diff --git a/tests/lib/adk/test_langgraph_sync.py b/tests/lib/adk/test_langgraph_sync.py new file mode 100644 index 000000000..248d18f68 --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync.py @@ -0,0 +1,247 @@ +"""Tests for the sync LangGraph -> Agentex stream event converter. + +Covers: +- Basic text, tool call, and tool response emission +- on_final_ai_message callback for usage capture +- create_langgraph_tracing_handler symbol is importable and functional + (runtime DeprecationWarning removed; deprecation is docstring-only) + +NOTE: langchain_core imports must be deferred to test-function scope because +conftest.py stubs out ``langchain_core.messages`` with MagicMock for ADK +package-level tests. The real classes are imported lazily inside each test. +""" + +from __future__ import annotations + +import sys +from typing import Any, AsyncIterator + +import pytest + +from agentex.types.task_message_update import ( + StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +def _make_stream(events: list[tuple[str, Any]]) -> AsyncIterator[tuple[str, Any]]: + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Remove the conftest stubs for langchain_core so real classes are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + """Remove conftest MagicMock stubs so real langchain_core types are used.""" + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + # Re-import the real modules + import importlib + + importlib.import_module("langchain_core.messages") + yield + # Restore stubs after the test + sys.modules.update(saved) + + +class TestTextStreaming: + async def test_plain_text_emits_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello, world!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [AIMessage(content="Hello, world!")]}}), + ] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_empty_chunk_content_is_skipped(self): + from langchain_core.messages import AIMessageChunk + + chunk = AIMessageChunk(content="") + events = [("messages", (chunk, {}))] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert out == [] + + async def test_reasoning_block_start_wraps_reasoning_content(self): + """A Responses-API reasoning block opens a Start wrapping ReasoningContent, + not TextContent (the deltas are ReasoningContentDelta).""" + from langchain_core.messages import AIMessageChunk + + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.task_message_update import StreamTaskMessageDelta, StreamTaskMessageStart + from agentex.types.reasoning_content_delta import ReasoningContentDelta + + chunk = AIMessageChunk( + content=[{"type": "reasoning", "summary": [{"type": "summary_text", "text": "thinking hard"}]}] + ) + events = [("messages", (chunk, {}))] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent), "reasoning Start must wrap ReasoningContent" + # `style` must be a non-null MessageStyle: the AgentEx server's + # StreamTaskMessageStartEntity rejects `reasoning.style=None` (enum), which + # would kill the stream. Match the conformance fixture's canonical value. + assert starts[0].content.style == "active", "reasoning Start must set a non-null style ('active')" + # Pull content_delta inside the comprehension so the isinstance narrows the + # delta union (narrowing would not survive a later attribute access). + reasoning_delta_texts = [ + e.delta.content_delta + for e in out + if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + assert reasoning_delta_texts == ["thinking hard"] + + +class TestToolCallEmission: + async def test_tool_call_emits_full_message(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolRequestContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + assert content.author == "agent" + + async def test_tool_response_emits_full_message(self): + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolResponseContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + assert content.author == "agent" + + +class TestOnFinalAiMessageCallback: + async def test_callback_called_for_ai_message_in_agent_node(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg = AIMessage(content="Hello!") + + events = [("updates", {"agent": {"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0] is ai_msg + + async def test_callback_not_called_for_tool_messages(self): + from langchain_core.messages import ToolMessage + + captured: list[Any] = [] + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="t") + + events = [("updates", {"tools": {"messages": [tool_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert captured == [] + + async def test_callback_receives_usage_metadata(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + usage = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Answer.", usage_metadata=usage) + + events = [("updates", {"agent": {"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0].usage_metadata == usage + + async def test_no_callback_is_noop(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hello!") + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert isinstance(out, list) + + async def test_callback_called_multiple_times_for_multi_step(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg_1 = AIMessage(content="Step 1") + ai_msg_2 = AIMessage(content="Step 2") + + events = [ + ("updates", {"agent": {"messages": [ai_msg_1]}}), + ("updates", {"agent": {"messages": [ai_msg_2]}}), + ] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 2 + assert captured[0] is ai_msg_1 + assert captured[1] is ai_msg_2 + + async def test_callback_called_after_tool_call_events_yielded(self): + """The callback fires after all events for that AIMessage are yielded.""" + from langchain_core.messages import AIMessage + + yield_order: list[str] = [] + + async def _gen(): + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + yield ("updates", {"agent": {"messages": [ai_msg]}}) + + def _cb(msg): + yield_order.append("callback") + + async for _ in convert_langgraph_to_agentex_events(_gen(), on_final_ai_message=_cb): + yield_order.append("event") + + # The tool call Full event is emitted before the callback fires + assert yield_order.index("event") < yield_order.index("callback") + + +class TestLangGraphTracingHandlerBackwardCompat: + def test_create_langgraph_tracing_handler_no_runtime_warning(self): + """Deprecated symbol remains importable and emits no runtime DeprecationWarning. + + The runtime warnings.warn was removed (docstring-only deprecation) to + align with PR 4/6 and avoid breaking callers under warnings-as-errors. + Using ``warnings.simplefilter("error", DeprecationWarning)`` verifies + that calling the function is safe under -W error conditions. + """ + import warnings + + from agentex.lib.adk._modules._langgraph_tracing import create_langgraph_tracing_handler + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error", DeprecationWarning) + create_langgraph_tracing_handler(trace_id="t1", parent_span_id="p1") + + assert w == [], "create_langgraph_tracing_handler must NOT emit a runtime DeprecationWarning" diff --git a/tests/lib/adk/test_langgraph_sync_unified.py b/tests/lib/adk/test_langgraph_sync_unified.py new file mode 100644 index 000000000..cfd522828 --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync_unified.py @@ -0,0 +1,214 @@ +"""Unified sync path tests for LangGraphTurn + UnifiedEmitter. + +Verifies: +1. Passthrough: events from emitter.yield_turn(LangGraphTurn(stream)) equal + LangGraphTurn(stream).events collected directly. +2. Span derivation: with trace_id + fake tracer, tool spans are derived from + the event stream. + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock. +""" + +from __future__ import annotations + +import sys +from typing import Any +from datetime import datetime, timezone +from dataclasses import field, dataclass + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Fake SpanTracer +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeTracingBackend: + spans_started: list[dict[str, Any]] = field(default_factory=list) + spans_ended: list[str] = field(default_factory=list) + + async def start_span(self, **kw) -> Any: + from agentex.types.span import Span + + sp = Span( + id=f"span-{len(self.spans_started) + 1}", + trace_id=kw.get("trace_id", "trace1"), + name=kw.get("name", ""), + start_time=datetime.now(tz=timezone.utc), + ) + self.spans_started.append(kw) + return sp + + async def end_span(self, *, trace_id: str, span: Any) -> None: + self.spans_ended.append(span.id if span else "") + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestPassthrough: + async def test_yield_turn_events_equal_direct_events(self): + """Events from emitter.yield_turn(LangGraphTurn(stream)) must equal + LangGraphTurn(stream).events collected directly — the emitter must not + add, drop, or reorder events in yield mode.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + # Build two identical streams + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + # Direct collection + direct = [e async for e in LangGraphTurn(_make_stream(events_raw)).events] + + # Via emitter.yield_turn + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert len(direct) == len(via_emitter), "yield_turn must not add or drop events relative to direct iteration" + for a, b in zip(direct, via_emitter, strict=True): + assert type(a) == type(b), f"Event type mismatch: {type(a).__name__} vs {type(b).__name__}" + + async def test_yield_turn_passes_all_event_types(self): + """Start, Delta, Done, Full — each type is preserved.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="hi") + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="hi", tool_calls=[tc]) + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + types = {type(e).__name__ for e in out} + # text chunk emits Start + Delta + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + # tool call emits Full + assert "StreamTaskMessageFull" in types + + async def test_empty_stream_yields_no_events(self): + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream([])))] + assert out == [] + + +class TestSpanDerivation: + @pytest.fixture + def fake_tracer(self): + backend = _FakeTracingBackend() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id=None, + task_id="t", + tracing=backend, # type: ignore[arg-type] + ) + return tracer, backend + + async def test_tool_span_derived_from_full_events(self, fake_tracer): + """AGX1-377: SpanDeriver now handles Full tool events for LangGraph. + + Full(ToolRequestContent) opens a tool span keyed by tool_call_id; + Full(ToolResponseContent) closes it. This bridges the previous gap where + LangGraph's Full-event path produced no spans, aligning it with + Start+Done harnesses (pydantic-ai, openai-agents). + """ + from langchain_core.messages import AIMessage, ToolMessage + + tracer, backend = fake_tracer + tc = {"id": "c1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny", tool_call_id="c1", name="get_weather") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert len(backend.spans_started) == 1, "Full(ToolRequestContent) opens one tool span" + started = backend.spans_started[0] + assert started["name"] == "get_weather" + assert started["input"] == {"city": "Paris"} + + async def test_no_spans_when_no_tool_calls(self, fake_tracer): + """yield_turn with tracer but no tool calls emits no spans.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + tracer, backend = fake_tracer + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert backend.spans_started == [], "No tool spans when there are no tool calls" + + async def test_tracer_none_means_no_spans(self): + """With tracer=False, no spans should be emitted.""" + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=False) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + # No assertion on spans since tracer=False means emitter.tracer is None + assert emitter.tracer is None diff --git a/tests/lib/adk/test_langgraph_turn.py b/tests/lib/adk/test_langgraph_turn.py new file mode 100644 index 000000000..23aa34ba3 --- /dev/null +++ b/tests/lib/adk/test_langgraph_turn.py @@ -0,0 +1,265 @@ +"""Tests for LangGraphTurn and langgraph_usage_to_turn_usage.""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn, langgraph_usage_to_turn_usage + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _drain(turn: LangGraphTurn) -> list[Any]: + return [e async for e in turn.events] + + +# --------------------------------------------------------------------------- +# langgraph_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +class TestLangGraphUsageToTurnUsage: + def test_none_usage_returns_empty_turn_usage(self): + result = langgraph_usage_to_turn_usage(None, model="gpt-4") + assert result == TurnUsage(model="gpt-4") + + def test_basic_token_fields_mapped(self): + usage = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + result = langgraph_usage_to_turn_usage(usage, model="gpt-4") + assert result.input_tokens == 10 + assert result.output_tokens == 5 + assert result.total_tokens == 15 + assert result.model == "gpt-4" + + def test_zero_output_tokens_preserved_not_coerced_to_none(self): + """Real zero counts must be preserved as 0, not None.""" + usage = {"input_tokens": 10, "output_tokens": 0, "total_tokens": 10} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.output_tokens == 0 + + def test_cache_read_mapped_to_cached_input_tokens(self): + usage = { + "input_tokens": 20, + "output_tokens": 5, + "total_tokens": 25, + "input_token_details": {"cache_read": 8}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens == 8 + + def test_reasoning_mapped_to_reasoning_tokens(self): + usage = { + "input_tokens": 10, + "output_tokens": 15, + "total_tokens": 25, + "output_token_details": {"reasoning": 6}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens == 6 + + def test_missing_optional_fields_are_none(self): + usage = {"input_tokens": 5, "output_tokens": 3, "total_tokens": 8} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + assert result.reasoning_tokens is None + + def test_full_usage_object(self): + usage = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + result = langgraph_usage_to_turn_usage(usage, model="claude-3-5-sonnet") + assert result == TurnUsage( + model="claude-3-5-sonnet", + input_tokens=100, + output_tokens=50, + total_tokens=150, + cached_input_tokens=30, + reasoning_tokens=20, + ) + + def test_model_none_is_preserved(self): + result = langgraph_usage_to_turn_usage({"input_tokens": 1}, model=None) + assert result.model is None + + def test_empty_input_token_details_does_not_crash(self): + usage = {"input_tokens": 5, "input_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + + def test_empty_output_token_details_does_not_crash(self): + usage = {"output_tokens": 5, "output_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens is None + + +# --------------------------------------------------------------------------- +# LangGraphTurn +# --------------------------------------------------------------------------- + + +class TestLangGraphTurn: + async def test_events_yields_from_sync_converter(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) + turn = LangGraphTurn(stream) + events = await _drain(turn) + assert len(events) > 0 + + async def test_usage_is_empty_before_stream_consumed(self): + turn = LangGraphTurn(_make_stream([])) + # usage() before events consumed should return a default TurnUsage + usage = turn.usage() + assert isinstance(usage, TurnUsage) + + async def test_usage_captured_from_ai_message(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + assert usage.model == "gpt-4" + + async def test_usage_accumulates_across_multiple_ai_messages(self): + """A multi-step turn (>1 LLM call) sums usage instead of keeping only the last.""" + from langchain_core.messages import AIMessage + + first = AIMessage( + content="thinking", + usage_metadata={ + "input_tokens": 10, + "output_tokens": 5, + "total_tokens": 15, + "input_token_details": {"cache_read": 2}, + "output_token_details": {"reasoning": 1}, + }, + ) + second = AIMessage( + content="answer", + usage_metadata={ + "input_tokens": 20, + "output_tokens": 7, + "total_tokens": 27, + "input_token_details": {"cache_read": 3}, + "output_token_details": {"reasoning": 4}, + }, + ) + stream = _make_stream( + [ + ("updates", {"agent": {"messages": [first]}}), + ("updates", {"agent": {"messages": [second]}}), + ] + ) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage.input_tokens == 30 + assert usage.output_tokens == 12 + assert usage.total_tokens == 42 + assert usage.cached_input_tokens == 5 + assert usage.reasoning_tokens == 5 + assert usage.model == "gpt-4" + + async def test_usage_not_updated_when_no_usage_metadata(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hi!") + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage == TurnUsage(model="gpt-4") + + async def test_usage_captures_cache_read_and_reasoning(self): + from langchain_core.messages import AIMessage + + usage_meta = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + ai_msg = AIMessage(content="Result", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="claude-3-5-sonnet") + await _drain(turn) + + usage = turn.usage() + assert usage.cached_input_tokens == 30 + assert usage.reasoning_tokens == 20 + + async def test_harness_turn_protocol_conformance(self): + """LangGraphTurn satisfies the HarnessTurn Protocol.""" + from agentex.lib.core.harness.types import HarnessTurn + + turn = LangGraphTurn(_make_stream([])) + assert isinstance(turn, HarnessTurn), "LangGraphTurn must satisfy HarnessTurn Protocol" + + async def test_empty_stream_yields_no_events(self): + turn = LangGraphTurn(_make_stream([])) + events = await _drain(turn) + assert events == [] + + async def test_model_none_default(self): + turn = LangGraphTurn(_make_stream([])) + assert turn.usage().model is None + + async def test_model_passed_through_to_usage(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="ok", usage_metadata={"input_tokens": 1, "output_tokens": 0, "total_tokens": 1}) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="my-model") + await _drain(turn) + assert turn.usage().model == "my-model" diff --git a/tests/lib/adk/test_pydantic_ai_async.py b/tests/lib/adk/test_pydantic_ai_async.py index dadda5914..49cb6054c 100644 --- a/tests/lib/adk/test_pydantic_ai_async.py +++ b/tests/lib/adk/test_pydantic_ai_async.py @@ -82,7 +82,9 @@ class FakeStreamingModule: def __init__(self) -> None: self.contexts: list[FakeContext] = [] - def streaming_task_message_context(self, *, task_id: str, initial_content: Any) -> FakeContext: + def streaming_task_message_context( + self, *, task_id: str, initial_content: Any, streaming_mode: str = "coalesced", created_at: Any = None + ) -> FakeContext: tm = TaskMessage( id=f"m{len(self.contexts) + 1}", task_id=task_id, @@ -255,16 +257,36 @@ async def test_empty_thinking_delta_is_skipped( class TestToolCallEmission: - async def test_tool_call_emits_full_tool_request_message_on_part_end( + async def test_tool_call_opens_streaming_context_with_identity( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Async helper uses Option A: tool requests are full messages, not delta streams.""" + """Tool requests are delivered as a streaming context (Start+Delta+Done). + + AGX1-377 fix: auto_send now delivers streamed tool-request messages + natively (Start+ToolRequestDelta+Done). The streaming context is opened + at the Start event with the initial ToolRequestContent (tool_call_id + + name + empty arguments), argument tokens are streamed as deltas, and the + context is closed on Done. + + This test uses a realistic pydantic-ai event sequence: args arrive as a + PartDeltaEvent fragment (the way OpenAI/Anthropic actually stream JSON + tool-call arguments). + """ + from pydantic_ai.messages import ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + streaming, messages = fake_adk events = [ PartStartEvent( index=1, part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1"), ), + # Realistic: args arrive as delta tokens (JSON string fragments). + PartDeltaEvent( + index=1, + delta=ToolCallPartDelta(args_delta='{"city":"Paris"}'), + ), PartEndEvent( index=1, part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="c1"), @@ -272,21 +294,28 @@ async def test_tool_call_emits_full_tool_request_message_on_part_end( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert streaming.contexts == [], "Tool calls do not open a streaming context" - assert len(messages.created) == 1 - msg = messages.created[0] - assert msg["task_id"] == TASK_ID - content = msg["content"] + # AGX1-373: tool messages arrive via streaming_task_message_context. + assert messages.created == [], "adk.messages.create must not be called" + assert len(streaming.contexts) == 1, "tool_request opens a streaming context" + ctx = streaming.contexts[0] + assert ctx.closed is True + content = ctx.initial_content assert isinstance(content, ToolRequestContent) assert content.tool_call_id == "c1" assert content.name == "get_weather" - assert content.arguments == {"city": "Paris"} assert content.author == "agent" + # AGX1-377 streamed shape: initial_content has empty args (args come via delta) + assert content.arguments == {} + # The arg delta is delivered as a stream_update + assert len(ctx.updates) == 1 + assert isinstance(ctx.updates[0].delta, ToolRequestDelta) + assert ctx.updates[0].delta.arguments_delta == '{"city":"Paris"}' async def test_tool_call_with_dict_args_passes_through( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + """When args arrive pre-populated as a dict in PartStart, they're in initial_content.""" + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -299,23 +328,40 @@ async def test_tool_call_with_dict_args_passes_through( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {"q": "weather"} + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + # Dict args present at PartStart land directly in initial_content.arguments + assert streaming.contexts[0].initial_content.arguments == {"q": "weather"} + assert streaming.contexts[0].updates == [], "no delta for pre-populated dict args" async def test_tool_call_with_invalid_json_args_surfaces_raw( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Don't drop the tool call when the model emits malformed JSON args. + """Malformed JSON arg delta is surfaced as a ToolRequestDelta with the raw string. + + The argument delta is delivered as-is by auto_send; the client-side + accumulator or the streaming backend handles malformed JSON gracefully. - The arguments field is preserved under ``_raw`` so the failure is - visible to the UI rather than silently truncated. + Parts-manager invariant: PartEnd.part is the accumulated snapshot; real + pydantic-ai conveys args via PartStart + PartDeltaEvent, so a + PartStart(None)+PartEnd(json) with no delta is not realizable. """ - _, messages = fake_adk + from pydantic_ai.messages import ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + + streaming, messages = fake_adk events = [ PartStartEvent( index=0, part=ToolCallPart(tool_name="t", args=None, tool_call_id="c"), ), + # Malformed JSON arrives as a delta token. + PartDeltaEvent( + index=0, + delta=ToolCallPartDelta(args_delta="not-json{"), + ), PartEndEvent( index=0, part=ToolCallPart(tool_name="t", args="not-json{", tool_call_id="c"), @@ -323,13 +369,21 @@ async def test_tool_call_with_invalid_json_args_surfaces_raw( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {"_raw": "not-json{"} + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + # Initial content has empty args (args come via delta) + assert ctx.initial_content.arguments == {} + # The malformed JSON is surfaced verbatim in the ToolRequestDelta + assert len(ctx.updates) == 1 + assert isinstance(ctx.updates[0].delta, ToolRequestDelta) + assert ctx.updates[0].delta.arguments_delta == "not-json{" async def test_tool_call_with_none_args_defaults_to_empty_dict( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -342,15 +396,20 @@ async def test_tool_call_with_none_args_defaults_to_empty_dict( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {} + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + assert streaming.contexts[0].initial_content.arguments == {} + assert streaming.contexts[0].updates == [], "no delta when args are absent" class TestToolResult: async def test_tool_return_emits_full_tool_response_message( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + # AGX1-373: tool responses arrive via streaming_task_message_context + # (open+close pair), NOT via adk.messages.create. + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart(tool_name="get_weather", content="Sunny, 72F", tool_call_id="c1"), @@ -358,13 +417,17 @@ async def test_tool_return_emits_full_tool_response_message( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + assert messages.created == [], "adk.messages.create must not be called after reimplementation" + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + assert ctx.closed is True + content = ctx.initial_content assert isinstance(content, ToolResponseContent) assert content.tool_call_id == "c1" assert content.name == "get_weather" assert content.content == "Sunny, 72F" assert content.author == "agent" + assert ctx.updates == [], "open+close only — no deltas for tool messages" async def test_tool_return_with_dict_content_preserves_structure( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] @@ -377,7 +440,7 @@ async def test_tool_return_with_dict_content_preserves_structure( and divergent from the sync converter which uses ``_tool_return_content`` to return dicts as-is. """ - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart(tool_name="t", content={"temp": 72, "sky": "clear"}, tool_call_id="c"), @@ -385,7 +448,10 @@ async def test_tool_return_with_dict_content_preserves_structure( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - out = messages.created[0]["content"].content + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + out = streaming.contexts[0].initial_content.content assert out == {"temp": 72, "sky": "clear"}, ( f"Expected the dict to survive verbatim; got {out!r}. " "If this is a Python repr string, the helper regressed to str(content)." @@ -402,7 +468,7 @@ class WeatherResult(BaseModel): temp: int sky: str - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart( @@ -414,13 +480,16 @@ class WeatherResult(BaseModel): ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - out = messages.created[0]["content"].content + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + out = streaming.contexts[0].initial_content.content assert out == {"temp": 72, "sky": "clear"} async def test_retry_prompt_part_surfaces_as_tool_response( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=RetryPromptPart( @@ -432,8 +501,10 @@ async def test_retry_prompt_part_surfaces_as_tool_response( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + content = streaming.contexts[0].initial_content assert isinstance(content, ToolResponseContent) assert content.tool_call_id == "c1" # RetryPromptPart.content stringifies to the error description @@ -446,9 +517,9 @@ async def test_text_then_tool_then_text_uses_separate_contexts_in_order( ) -> None: """End-to-end multi-step shape: text → tool call → tool result → more text. - Each text/reasoning segment must get its own streaming context that is - closed before the next one opens, and tool messages must interleave - correctly via ``adk.messages.create``. + AGX1-373 envelope change: tool messages now arrive via + streaming_task_message_context (open+close pairs) instead of + adk.messages.create. All four message types open streaming contexts. """ streaming, messages = fake_adk events = [ @@ -474,18 +545,30 @@ async def test_text_then_tool_then_text_uses_separate_contexts_in_order( ] final = await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(streaming.contexts) == 2, "One context per text part — tool calls don't open streaming contexts" + # AGX1-373: all 4 messages (text, tool_request, tool_response, text) + # arrive via streaming_task_message_context. + assert messages.created == [], "adk.messages.create must not be called after reimplementation" + assert len(streaming.contexts) == 4 assert all(ctx.closed for ctx in streaming.contexts) - assert _text_deltas(streaming.contexts[0]) == ["Looking up..."] - assert _text_deltas(streaming.contexts[1]) == ["It's sunny."] - # Two messages: tool request, then tool response — in that order. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] - assert messages.created[0]["content"].tool_call_id == "c1" - assert messages.created[1]["content"].tool_call_id == "c1" + text_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, TextContent)] + tool_req_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolRequestContent)] + tool_resp_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolResponseContent)] + assert len(text_ctxs) == 2 + assert len(tool_req_ctxs) == 1 + assert len(tool_resp_ctxs) == 1 + + assert _text_deltas(text_ctxs[0]) == ["Looking up..."] + assert _text_deltas(text_ctxs[1]) == ["It's sunny."] + + # Tool content is preserved verbatim. + assert tool_req_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_resp_ctxs[0].initial_content.tool_call_id == "c1" + + # Tool contexts carry no deltas (open+close only). + assert tool_req_ctxs[0].updates == [] + assert tool_resp_ctxs[0].updates == [] + assert final == "It's sunny." async def test_new_text_part_after_text_closes_previous( @@ -533,7 +616,11 @@ async def test_reasoning_then_text_closes_reasoning_context( async def test_tool_result_closes_any_open_streaming_context( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """A tool result arriving while a text context is open must close that context first.""" + """A tool result arriving while a text context is open must close that context first. + + AGX1-373: the tool response itself now also opens a streaming context + (open+close pair) rather than going through adk.messages.create. + """ streaming, messages = fake_adk events = [ PartStartEvent(index=0, part=TextPart(content="")), @@ -548,7 +635,10 @@ async def test_tool_result_closes_any_open_streaming_context( assert streaming.contexts[0].closed is True, ( "Helper must close any open streaming context before emitting a tool result message" ) - assert len(messages.created) == 1 + # AGX1-373: tool response arrives via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 2 + assert isinstance(streaming.contexts[1].initial_content, ToolResponseContent) class TestDeltaForOrphanIndexIgnored: @@ -584,7 +674,7 @@ async def on_tool_end(self, tool_call_id: str, result: Any) -> None: async def test_handler_records_start_and_end_for_each_tool_call( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk handler = self._RecordingHandler() events = [ PartStartEvent( @@ -605,11 +695,12 @@ async def test_handler_records_start_and_end_for_each_tool_call( tracing_handler=handler, # type: ignore[arg-type] ) - # Streaming side-effects still happen — tracing is additive. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] + # AGX1-373: tool messages arrive via streaming_task_message_context. + # Tracing is still additive — both messages are delivered AND hooks fire. + assert messages.created == [] + assert len(streaming.contexts) == 2 + assert isinstance(streaming.contexts[0].initial_content, ToolRequestContent) + assert isinstance(streaming.contexts[1].initial_content, ToolResponseContent) # And both lifecycle hooks fired exactly once with the right payload. assert handler.starts == [ { @@ -680,8 +771,12 @@ async def test_handler_records_each_tool_in_multi_tool_run( async def test_omitting_handler_is_a_no_op_for_existing_behavior( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Regression: passing no tracing handler preserves the pre-tracing behavior.""" - _, messages = fake_adk + """Regression: passing no tracing handler preserves streaming behavior. + + AGX1-373: tool messages arrive via streaming_task_message_context + regardless of whether tracing_handler is passed. + """ + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -696,11 +791,11 @@ async def test_omitting_handler_is_a_no_op_for_existing_behavior( ), ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - # Exact same shape as before tracing existed. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] + # AGX1-373: tool messages via streaming_task_message_context. + assert messages.created == [] + assert len(streaming.contexts) == 2 + content_types = [type(ctx.initial_content).__name__ for ctx in streaming.contexts] + assert content_types == ["ToolRequestContent", "ToolResponseContent"] class TestPydanticAITracingHandlerDeterministicIds: @@ -867,3 +962,101 @@ async def boom() -> AsyncIterator[Any]: await stream_pydantic_ai_events(boom(), TASK_ID) assert streaming.contexts[0].closed is True + + +# --------------------------------------------------------------------------- +# Characterization test: lock the wire-level delivery shape for a representative +# pydantic-ai run (text + tool call + tool response + more text). +# +# Step 1 (CURRENT behavior): written against the original implementation. +# - Text/reasoning use adk.streaming.streaming_task_message_context. +# - Tool messages use adk.messages.create (FakeMessagesModule.created list). +# - Final text is the last text segment. +# +# Step 2 (POST-reimplementation on UnifiedEmitter / auto_send): +# The assertions in TestCharacterizeWireShapeNew (below) lock the new shape. +# Tool messages no longer go through adk.messages.create; they arrive via +# streaming_task_message_context open+close pairs (Start+Done envelope). +# This is the AGX1-373 accepted envelope change: logical content is identical. +# --------------------------------------------------------------------------- + + +class TestCharacterizeWireShape: + """Characterization tests: lock the wire-level delivery shape after reimplementation. + + Uses FakeStreamingModule + FakeMessagesModule (the existing fake pair). + + AGX1-373 shape (post-reimplementation on UnifiedEmitter / auto_send): + - Text/reasoning: streaming_task_message_context (open + deltas + close) + - Tool messages: streaming_task_message_context (open+close, no deltas) + - adk.messages.create is NOT called. + - Final text == last text segment only. + + This class was first written to characterize the OLD shape (adk.messages.create + for tool messages) and was updated post-reimplementation to reflect the new + delivery channel. The logical content is identical; only the channel changed. + """ + + async def test_text_tool_text_new_wire_shape( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Representative run: text -> tool call -> tool response -> more text. + + Post-AGX1-373 delivery shape: + - Four streaming contexts: text, tool_request, tool_response, text. + - adk.messages.create NOT called. + - Final text == "It's sunny." (last segment only, matching the + multi-step convention). + """ + from pydantic_ai.messages import ToolReturnPart + + streaming, messages = fake_adk + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Looking up...")), + PartEndEvent(index=0, part=TextPart(content="Looking up...")), + PartStartEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1"), + ), + PartEndEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args="{}", tool_call_id="c1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Sunny", tool_call_id="c1"), + ), + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="It's sunny.")), + PartEndEvent(index=0, part=TextPart(content="It's sunny.")), + ] + + final = await stream_pydantic_ai_events(_aiter(events), TASK_ID) + + assert final == "It's sunny.", "multi-step: only the last text segment is returned" + + # AGX1-373: all 4 messages arrive via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 4 + assert all(ctx.closed for ctx in streaming.contexts) + + content_types = [type(ctx.initial_content).__name__ for ctx in streaming.contexts] + assert content_types == [ + "TextContent", + "ToolRequestContent", + "ToolResponseContent", + "TextContent", + ] + + text_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, TextContent)] + tool_req_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolRequestContent)] + tool_resp_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolResponseContent)] + + assert _text_deltas(text_ctxs[0]) == ["Looking up..."] + assert _text_deltas(text_ctxs[1]) == ["It's sunny."] + assert tool_req_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_req_ctxs[0].initial_content.name == "get_weather" + assert tool_req_ctxs[0].updates == [] + assert tool_resp_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_resp_ctxs[0].initial_content.content == "Sunny" + assert tool_resp_ctxs[0].updates == [] diff --git a/tests/lib/adk/test_pydantic_ai_sync.py b/tests/lib/adk/test_pydantic_ai_sync.py index 36d06200e..080bc5be8 100644 --- a/tests/lib/adk/test_pydantic_ai_sync.py +++ b/tests/lib/adk/test_pydantic_ai_sync.py @@ -3,9 +3,11 @@ from __future__ import annotations import json +import asyncio from typing import Any, AsyncIterator import pytest +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent from pydantic_ai.messages import ( TextPart, PartEndEvent, @@ -481,3 +483,75 @@ async def test_author_is_agent(self, events: list[Any]): content = getattr(e, "content", None) if content is not None and hasattr(content, "author"): assert content.author == "agent" + + +class TestOnResultCallback: + """on_result callback: captures the terminal AgentRunResultEvent without + altering streaming output.""" + + def _make_result_event(self, output: Any = "hello") -> AgentRunResultEvent: + result = AgentRunResult(output=output, _output_tool_name=None) + return AgentRunResultEvent(result=result) + + async def test_callback_invoked_once_with_result_event(self): + """on_result is called exactly once, with the AgentRunResultEvent.""" + captured: list[AgentRunResultEvent] = [] + + def on_result(event: AgentRunResultEvent) -> None: + captured.append(event) + + result_event = self._make_result_event("the answer") + events = [result_event] + await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=on_result)) + + assert len(captured) == 1 + assert captured[0] is result_event + assert captured[0].result.output == "the answer" + + async def test_streaming_output_unchanged_with_callback(self): + """Yielded StreamTaskMessage* sequence is identical whether on_result is set or not.""" + result_event = self._make_result_event() + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hi")), + PartEndEvent(index=0, part=TextPart(content="hi")), + result_event, + ] + + captured: list[AgentRunResultEvent] = [] + out_with = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=captured.append)) + out_without = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events))) + + assert len(out_with) == len(out_without) + for a, b in zip(out_with, out_without): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + assert len(captured) == 1 + + async def test_no_callback_no_error(self): + """AgentRunResultEvent is silently ignored when on_result is None.""" + result_event = self._make_result_event() + events = [result_event] + out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_async_callback_is_awaited(self): + """An async on_result callable is properly awaited. + + The callback suspends (``await asyncio.sleep(0)``) before recording its + side effect, so ``awaited`` is only populated if the converter actually + awaits the returned coroutine — distinguishing "awaited" from + "called-but-not-awaited." + """ + awaited: list[AgentRunResultEvent] = [] + + async def on_result_async(event: AgentRunResultEvent) -> None: + await asyncio.sleep(0) + awaited.append(event) + + result_event = self._make_result_event("async_output") + events = [result_event] + await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=on_result_async)) + + assert len(awaited) == 1 + assert awaited[0].result.output == "async_output" diff --git a/tests/lib/adk/test_pydantic_ai_sync_unified.py b/tests/lib/adk/test_pydantic_ai_sync_unified.py new file mode 100644 index 000000000..f920418de --- /dev/null +++ b/tests/lib/adk/test_pydantic_ai_sync_unified.py @@ -0,0 +1,209 @@ +"""Tests for the unified sync (HTTP ACP) path: PydanticAITurn + UnifiedEmitter. + +Exercises the path documented in _pydantic_ai_sync.py under "Recommended: unified surface": +- events forwarded by yield_turn equal PydanticAITurn(stream).events (passthrough) +- with a trace context + fake tracing backend, tool spans are derived (start_span / end_span called) +- with a trace context + fake tracing backend, reasoning spans are derived +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent +from pydantic_ai.usage import RunUsage +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + ThinkingPart, + ToolCallPart, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, + ThinkingPartDelta, + ToolCallPartDelta, +) + +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +class _FakeSpan: + def __init__(self, name: str): + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, str | None, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span): + self.ended.append((span.name, span.output)) + + +def _make_result_event(usage: RunUsage | None = None) -> AgentRunResultEvent: + result = AgentRunResult(output="done", _output_tool_name=None) + if usage is not None: + result._state.usage = usage + return AgentRunResultEvent(result=result) + + +class TestUnifiedSyncPathPassthrough: + """The events forwarded by yield_turn are identical to PydanticAITurn.events.""" + + async def test_text_stream_passthrough(self): + raw_events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hello")), + PartEndEvent(index=0, part=TextPart(content="hello")), + ] + + turn_a = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + direct = await _collect(turn_a.events) + + turn_b = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = await _collect(emitter.yield_turn(turn_b)) + + assert len(via_emitter) == len(direct) + for a, b in zip(via_emitter, direct): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + async def test_tool_call_stream_passthrough(self): + raw_events = [ + PartStartEvent(index=0, part=ToolCallPart(tool_name="Bash", args=None, tool_call_id="c1")), + PartDeltaEvent(index=0, delta=ToolCallPartDelta(args_delta='{"cmd":"ls"}')), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c1"), + ), + ] + + turn_a = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + direct = await _collect(turn_a.events) + + turn_b = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = await _collect(emitter.yield_turn(turn_b)) + + assert len(via_emitter) == len(direct) + for a, b in zip(via_emitter, direct): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + +class TestUnifiedSyncPathSpanDerivation: + """With trace context + fake tracing, spans are derived from the stream.""" + + async def test_tool_span_opened_and_closed(self): + """A tool call produces start_span + end_span on the fake tracing backend.""" + from pydantic_ai.messages import ToolReturnPart, FunctionToolResultEvent + + tool_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="call_1"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="call_1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="Bash", content="files", tool_call_id="call_1"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(tool_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracing=fake) + + events = await _collect(emitter.yield_turn(turn)) + + assert len(events) >= 2, "at least Start(tool) + Done + Full(response)" + assert len(fake.started) == 1, "one tool span opened" + assert len(fake.ended) == 1, "one tool span closed" + span_name, parent_id, span_input = fake.started[0] + assert span_name == "Bash" + assert parent_id == "p" + closed_name, closed_output = fake.ended[0] + assert closed_name == "Bash" + + async def test_reasoning_span_opened_and_closed(self): + """A thinking/reasoning block produces start_span + end_span.""" + reasoning_events = [ + PartStartEvent(index=0, part=ThinkingPart(content="")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta="let me think")), + PartEndEvent(index=0, part=ThinkingPart(content="let me think")), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(reasoning_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert len(fake.started) == 1, "one reasoning span opened" + assert len(fake.ended) == 1, "one reasoning span closed" + span_name, parent_id, _ = fake.started[0] + assert span_name == "reasoning" + assert parent_id == "p" + + async def test_no_trace_id_means_no_spans(self): + """When trace_id is None, no spans are derived even with a fake tracing backend.""" + raw_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="c2"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c2"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert fake.started == [], "no spans when trace_id is absent" + assert fake.ended == [] + + async def test_tracer_false_suppresses_spans_even_with_trace_id(self): + """tracer=False disables span derivation regardless of trace_id.""" + raw_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="c3"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c3"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracer=False, tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert fake.started == [] + assert fake.ended == [] diff --git a/tests/lib/adk/test_pydantic_ai_turn.py b/tests/lib/adk/test_pydantic_ai_turn.py new file mode 100644 index 000000000..46bf247a3 --- /dev/null +++ b/tests/lib/adk/test_pydantic_ai_turn.py @@ -0,0 +1,276 @@ +"""Tests for PydanticAITurn and pydantic_ai_usage_to_turn_usage.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent +from pydantic_ai.usage import RunUsage +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, +) + +from agentex.lib.core.harness import HarnessTurn +from agentex.lib.adk._modules._pydantic_ai_turn import ( + PydanticAITurn, + pydantic_ai_usage_to_turn_usage, +) + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +def _make_result_event(output: Any = "done", usage: RunUsage | None = None) -> AgentRunResultEvent: + result = AgentRunResult(output=output, _output_tool_name=None) + if usage is not None: + result._state.usage = usage + return AgentRunResultEvent(result=result) + + +class TestUsageNormalization: + def test_usage_normalization_maps_fields(self): + """Real RunUsage fields map correctly onto TurnUsage.""" + usage = RunUsage( + requests=3, + input_tokens=200, + output_tokens=80, + cache_read_tokens=25, + ) + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + + assert turn_usage.model == "openai:gpt-4o" + assert turn_usage.input_tokens == 200 + assert turn_usage.output_tokens == 80 + assert turn_usage.num_llm_calls == 3 + + def test_total_tokens_is_computed(self): + """RunUsage.total_tokens is a computed property; we surface it correctly.""" + usage = RunUsage(input_tokens=100, output_tokens=50) + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.total_tokens == 150 + + def test_cache_read_tokens_mapped_to_cached_input_tokens(self): + usage = RunUsage(input_tokens=100, output_tokens=50, cache_read_tokens=20) + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.cached_input_tokens == 20 + + def test_none_model(self): + """model=None is preserved.""" + usage = RunUsage() + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model=None) + assert turn_usage.model is None + + def test_all_zero_usage_preserves_real_zeros(self): + """An all-zero RunUsage maps real 0s through (not None). + + RunUsage token fields are ints defaulting to 0. A 0 is a genuine + value (e.g. a cache-hit with 0 output tokens), not "unknown", so it + must survive normalization as 0 rather than being coerced to None. + """ + usage = RunUsage() + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.num_llm_calls == 0 + assert turn_usage.input_tokens == 0 + assert turn_usage.output_tokens == 0 + assert turn_usage.cached_input_tokens == 0 + assert turn_usage.total_tokens == 0 + + def test_missing_field_degrades_to_none(self): + """A usage object MISSING a field maps that field to None (defensive getattr). + + Guards the version-rename guarantee: if pydantic-ai renames a field, + the absent attribute degrades to None rather than raising. + """ + + class StubUsage: + requests = 2 + input_tokens = 100 + # no output_tokens / cache_read_tokens / total_tokens attributes + + turn_usage = pydantic_ai_usage_to_turn_usage(StubUsage(), model="openai:gpt-4o") + assert turn_usage.num_llm_calls == 2 + assert turn_usage.input_tokens == 100 + assert turn_usage.output_tokens is None + assert turn_usage.cached_input_tokens is None + assert turn_usage.total_tokens is None + + +class TestPydanticAITurn: + async def test_turn_satisfies_harness_turn_protocol(self): + """PydanticAITurn is structurally compatible with HarnessTurn.""" + turn = PydanticAITurn(_aiter([]), model="openai:gpt-4o") + assert isinstance(turn, HarnessTurn) + + async def test_usage_before_exhaustion_returns_default(self): + """usage() before iterating events returns default TurnUsage (model set, tokens None).""" + result_event = _make_result_event(usage=RunUsage(requests=1, input_tokens=100, output_tokens=40)) + events = [result_event] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + + # Do NOT exhaust events — check usage pre-run + pre_usage = turn.usage() + assert pre_usage.model == "openai:gpt-4o" + assert pre_usage.input_tokens is None + assert pre_usage.output_tokens is None + assert pre_usage.num_llm_calls is None + + async def test_turn_events_and_usage(self): + """Driving events to exhaustion populates usage from the terminal event.""" + known_usage = RunUsage( + requests=2, + input_tokens=300, + output_tokens=120, + cache_read_tokens=30, + ) + result_event = _make_result_event(usage=known_usage) + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hi")), + PartEndEvent(index=0, part=TextPart(content="hi")), + result_event, + ] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + + collected = await _collect(turn.events) + + # Events match bare converter output (Start + Delta + Done = 3 events) + assert len(collected) == 3 + + # Usage is populated after exhaustion + usage = turn.usage() + assert usage.model == "openai:gpt-4o" + assert usage.input_tokens == 300 + assert usage.output_tokens == 120 + assert usage.cached_input_tokens == 30 + assert usage.num_llm_calls == 2 + assert usage.total_tokens == 420 + + async def test_events_match_bare_converter(self): + """Yielded events are identical to bare convert_pydantic_ai_to_agentex_events output.""" + from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + + text_events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Hello")), + PartEndEvent(index=0, part=TextPart(content="Hello")), + ] + + turn = PydanticAITurn(_aiter(text_events), model="openai:gpt-4o") + turn_out = await _collect(turn.events) + + bare_out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(text_events))) + + assert len(turn_out) == len(bare_out) + for a, b in zip(turn_out, bare_out): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + async def test_usage_captured_via_real_usage_accessor(self): + """Drive the turn through the REAL ``result.usage`` property accessor. + + The production code reads ``getattr(run_result, "usage", None)``, which + on this pydantic-ai version resolves the ``_DeprecatedCallableRunUsage`` + property (NOT ``_state.usage`` directly). This asserts that the real + accessor path the converter uses captures the run usage. Constructing + the event without our test's ``_state`` shortcut: we set ``_state.usage`` + only because that is the sole supported way to seed an + ``AgentRunResult``, but we then assert capture happens through the + public ``.usage`` attribute access (verified below). + """ + known_usage = RunUsage(requests=4, input_tokens=512, output_tokens=64) + result = AgentRunResult(output="done", _output_tool_name=None) + result._state.usage = known_usage + result_event = AgentRunResultEvent(result=result) + + # Sanity: the value is reachable via the real public accessor the + # production code uses (not just via the private _state). The + # _DeprecatedCallableRunUsage property wraps the value, so compare by + # equality rather than identity. + accessed = getattr(result_event.result, "usage", None) + assert accessed is not None + assert accessed.input_tokens == 512 + assert accessed.requests == 4 + + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartEndEvent(index=0, part=TextPart(content="")), + result_event, + ] + turn = PydanticAITurn(_aiter(events), model="anthropic:claude-3-5-sonnet") + await _collect(turn.events) + + usage = turn.usage() + assert usage.model == "anthropic:claude-3-5-sonnet" + assert usage.input_tokens == 512 + assert usage.output_tokens == 64 + assert usage.num_llm_calls == 4 + + async def test_no_usage_event_leaves_default_usage(self): + """If the stream has no AgentRunResultEvent, usage() returns the default (tokens None).""" + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartEndEvent(index=0, part=TextPart(content="")), + ] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + await _collect(turn.events) + + usage = turn.usage() + assert usage.model == "openai:gpt-4o" + assert usage.input_tokens is None + assert usage.num_llm_calls is None + + +class TestToolRequestStreaming: + """PydanticAITurn.events equals the bare converter output unconditionally. + + The foundation auto_send delivers Start+ToolRequestDelta+Done natively + (AGX1-377), so no coalescing is needed on either channel. + """ + + async def test_events_match_bare_converter_for_streamed_tool_call(self): + """PydanticAITurn yields a ToolRequestDelta for a streamed-args tool call + — i.e. it is byte-for-byte the bare converter output, preserving + argument-token streaming on the sync/yield channel.""" + from pydantic_ai.messages import ToolCallPart, ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + from agentex.types.task_message_update import StreamTaskMessageDelta + from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + + tool_events = [ + PartStartEvent(index=0, part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1")), + PartDeltaEvent(index=0, delta=ToolCallPartDelta(args_delta='{"city":"Paris"}')), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="c1"), + ), + ] + + turn = PydanticAITurn(_aiter(tool_events), model="openai:gpt-4o") + turn_out = await _collect(turn.events) + + bare_out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(tool_events))) + + # Turn is identical to the bare converter. + assert len(turn_out) == len(bare_out) + for a, b in zip(turn_out, bare_out): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + # The arg-streaming delta is present. + deltas = [ + e for e in turn_out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ToolRequestDelta) + ] + assert len(deltas) == 1, "streamed tool-call args must surface as a ToolRequestDelta" + assert isinstance(deltas[0].delta, ToolRequestDelta) + assert deltas[0].delta.arguments_delta == '{"city":"Paris"}' diff --git a/tests/lib/adk/test_tracing_module.py b/tests/lib/adk/test_tracing_module.py index 52d5d3f82..58d5d4a85 100644 --- a/tests/lib/adk/test_tracing_module.py +++ b/tests/lib/adk/test_tracing_module.py @@ -1,7 +1,10 @@ from __future__ import annotations from datetime import datetime, timezone -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from temporalio.exceptions import ActivityError import agentex.lib.adk._modules.tracing as _tracing_mod from agentex.types.span import Span @@ -26,6 +29,24 @@ def _make_module() -> tuple[AsyncMock, TracingModule]: return mock_service, module +def _make_activity_error() -> ActivityError: + return ActivityError( + "activity timed out", + scheduled_event_id=1, + started_event_id=2, + identity="worker-1", + activity_type="start-span", + activity_id="activity-1", + retry_state=None, + ) + + +def _make_metric_meter() -> MagicMock: + mock_meter = MagicMock() + mock_meter.create_counter.return_value = MagicMock() + return mock_meter + + class TestStartSpan: async def test_start_span_with_task_id(self): mock_service, module = _make_module() @@ -87,6 +108,128 @@ async def test_end_span_preserves_task_id(self): mock_service.end_span.assert_called_once_with(trace_id="trace-123", span=span) +class TestTracingModuleTemporalPath: + async def test_start_span_in_workflow_returns_none_when_activity_fails(self): + mock_service, module = _make_module() + mock_meter = _make_metric_meter() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers, \ + patch.object(_tracing_mod.workflow, "logger") as mock_logger, \ + patch.object(_tracing_mod.workflow, "metric_meter", return_value=mock_meter): + mock_helpers.execute_activity = AsyncMock(side_effect=_make_activity_error()) + result = await module.start_span(trace_id="trace-123", name="test-span") + + assert result is None + mock_logger.warning.assert_called_once() + mock_meter.create_counter.assert_called_once_with( + _tracing_mod.TEMPORAL_SPAN_ACTIVITY_DROPPED_METRIC, + description="Temporal tracing span activities dropped after fail-open", + unit="1", + ) + mock_meter.create_counter.return_value.add.assert_called_once_with( + 1, {"event_type": "start"} + ) + mock_helpers.execute_activity.assert_called_once() + mock_service.start_span.assert_not_called() + + async def test_end_span_in_workflow_returns_span_when_activity_fails(self): + mock_service, module = _make_module() + span = _make_span() + mock_meter = _make_metric_meter() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers, \ + patch.object(_tracing_mod.workflow, "logger") as mock_logger, \ + patch.object(_tracing_mod.workflow, "metric_meter", return_value=mock_meter): + mock_helpers.execute_activity = AsyncMock(side_effect=_make_activity_error()) + result = await module.end_span(trace_id="trace-123", span=span) + + assert result == span + mock_logger.warning.assert_called_once() + mock_meter.create_counter.assert_called_once_with( + _tracing_mod.TEMPORAL_SPAN_ACTIVITY_DROPPED_METRIC, + description="Temporal tracing span activities dropped after fail-open", + unit="1", + ) + mock_meter.create_counter.return_value.add.assert_called_once_with( + 1, {"event_type": "end"} + ) + mock_helpers.execute_activity.assert_called_once() + mock_service.end_span.assert_not_called() + + async def test_context_manager_skips_end_when_temporal_start_fails(self): + mock_service, module = _make_module() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers, \ + patch.object(_tracing_mod.workflow, "logger"): + mock_helpers.execute_activity = AsyncMock(side_effect=_make_activity_error()) + async with module.span(trace_id="trace-123", name="test-span") as span: + assert span is None + + mock_helpers.execute_activity.assert_called_once() + mock_service.start_span.assert_not_called() + mock_service.end_span.assert_not_called() + + async def test_start_span_in_workflow_propagates_unexpected_errors(self): + mock_service, module = _make_module() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers: + mock_helpers.execute_activity = AsyncMock(side_effect=RuntimeError("bad response shape")) + try: + await module.start_span(trace_id="trace-123", name="test-span") + except RuntimeError as exc: + assert str(exc) == "bad response shape" + else: + raise AssertionError("Expected unexpected errors to propagate") + + mock_helpers.execute_activity.assert_called_once() + mock_service.start_span.assert_not_called() + + async def test_start_span_in_workflow_propagates_cancellation(self): + mock_service, module = _make_module() + activity_error = _make_activity_error() + mock_meter = _make_metric_meter() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers, \ + patch.object(_tracing_mod, "is_cancelled_exception", return_value=True), \ + patch.object(_tracing_mod.workflow, "logger") as mock_logger, \ + patch.object(_tracing_mod.workflow, "metric_meter", return_value=mock_meter): + mock_helpers.execute_activity = AsyncMock(side_effect=activity_error) + + with pytest.raises(ActivityError): + await module.start_span(trace_id="trace-123", name="test-span") + + mock_logger.warning.assert_not_called() + mock_meter.create_counter.assert_not_called() + mock_helpers.execute_activity.assert_called_once() + mock_service.start_span.assert_not_called() + + async def test_end_span_in_workflow_propagates_cancellation(self): + mock_service, module = _make_module() + span = _make_span() + activity_error = _make_activity_error() + mock_meter = _make_metric_meter() + + with patch.object(_tracing_mod, "in_temporal_workflow", return_value=True), \ + patch.object(_tracing_mod, "ActivityHelpers") as mock_helpers, \ + patch.object(_tracing_mod, "is_cancelled_exception", return_value=True), \ + patch.object(_tracing_mod.workflow, "logger") as mock_logger, \ + patch.object(_tracing_mod.workflow, "metric_meter", return_value=mock_meter): + mock_helpers.execute_activity = AsyncMock(side_effect=activity_error) + + with pytest.raises(ActivityError): + await module.end_span(trace_id="trace-123", span=span) + + mock_logger.warning.assert_not_called() + mock_meter.create_counter.assert_not_called() + mock_helpers.execute_activity.assert_called_once() + mock_service.end_span.assert_not_called() + + class TestSpanContextManager: async def test_span_context_manager_forwards_task_id(self): mock_service, module = _make_module() diff --git a/tests/lib/core/harness/__init__.py b/tests/lib/core/harness/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/_fakes.py b/tests/lib/core/harness/_fakes.py new file mode 100644 index 000000000..f9fd34a45 --- /dev/null +++ b/tests/lib/core/harness/_fakes.py @@ -0,0 +1,63 @@ +"""Shared test doubles for the unified harness test suites. + +A single superset implementation of the in-memory tracing backend used across +the harness tests. Three recording shapes were previously duplicated: + +- Shape-1 (richest): ``started`` = ``(name, parent_id, input)`` 3-tuples, + ``ended`` = ``(name, output)`` 2-tuples, plus an ``ended_spans`` list of the + closed ``FakeSpan`` objects (which carry ``.name``, ``.output``, ``.data``). +- Shape-2: ``started`` = ``(name, parent_id)`` 2-tuples, ``ended`` = + ``(name, output)``. +- Shape-3: ``started`` = bare names, ``ended`` = bare outputs. + +``FakeTracing`` records the richest (shape-1) form and exposes read-only +convenience properties (``started_names``, ``started_pairs``, +``ended_outputs``) so shape-2 and shape-3 assertions stay clean. +""" + +from __future__ import annotations + +from typing import Any + + +class FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + self.data: Any = None + + +class FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + self.ended_spans: list[FakeSpan] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> FakeSpan: + self.started.append((name, parent_id, input)) + return FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: FakeSpan) -> None: + self.ended.append((span.name, span.output)) + self.ended_spans.append(span) + + @property + def started_names(self) -> list[str]: + return [name for (name, _parent, _input) in self.started] + + @property + def started_pairs(self) -> list[tuple[str, Any]]: + return [(name, parent) for (name, parent, _input) in self.started] + + @property + def ended_outputs(self) -> list[Any]: + return [output for (_name, output) in self.ended] diff --git a/tests/lib/core/harness/conformance/__init__.py b/tests/lib/core/harness/conformance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lib/core/harness/conformance/conftest.py b/tests/lib/core/harness/conformance/conftest.py new file mode 100644 index 000000000..e4da7f1e2 --- /dev/null +++ b/tests/lib/core/harness/conformance/conftest.py @@ -0,0 +1,21 @@ +"""Conformance-suite test setup. + +Eagerly import every per-harness conformance module so each one's module-level +``register(...)`` calls run before any test executes. This makes +``all_fixtures()`` complete and independent of pytest's collection/import order +(the runner documents that cross-module registration order is not guaranteed), +so the cross-harness ``test_span_derivation_is_deterministic`` guard in +``test_conformance.py`` covers the full fixture set even when this directory is +run in isolation. +""" + +from __future__ import annotations + +# Importing these for their registration side effects only. +from . import ( + test_codex_conformance, # noqa: F401 + test_openai_conformance, # noqa: F401 + test_langgraph_conformance, # noqa: F401 + test_claude_code_conformance, # noqa: F401 + test_pydantic_ai_conformance, # noqa: F401 +) diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py new file mode 100644 index 000000000..02a07f726 --- /dev/null +++ b/tests/lib/core/harness/conformance/runner.py @@ -0,0 +1,507 @@ +"""Shared conformance engine: every harness tap registers fixtures here. + +A fixture is (name, list[StreamTaskMessage]). The runner asserts two things: + +1. **Cross-channel logical equivalence**: yield_events and auto_send produce the + same *logical* sequence of delivered message contents. "Logical" means we + normalise away the streaming-envelope difference: + - yield channel delivers StreamTaskMessageFull(ToolResponseContent) verbatim. + - auto_send channel delivers the same tool-response by opening a streaming + context with the full content and closing it immediately (Start+Done on the + wire), not a Full event. + Both reduce to the same LogicalDelivery(type, identity, payload) tuple; the + conformance test compares those normalised sequences. + + `payload` carries the content that callers actually consume: + - text: initial_content.content prepended, then accumulated delta string + - reasoning: initial_content.summary joined, then accumulated delta string + - tool_request: the arguments dict (JSON-sorted), from Start content + - tool_response: the content value (str) + This catches a channel that delivers the right structural shape but corrupts, + drops, or omits initial_content (including reasoning summary) or payload. + +2. **Span signal equivalence**: each channel is driven with its own recording + tracer that captures every SpanSignal it actually receives in handle(); the + two channels' recorded signal lists must be identical. Comparing what each + channel genuinely emitted (rather than re-deriving from the events) catches a + regression where a channel skips deriver.observe() for some event type. + +Registry shared-state hazard: `_REGISTRY` is process-global. Every `test_*.py` +module that calls `register()` at import time contributes to it, so a module +that parametrizes over `all_fixtures()` will see fixtures registered by ANY +other conformance module imported earlier in the same pytest process (collection +order is not guaranteed). To stay deterministic, each future harness conformance +module should register and parametrize over its OWN fixtures (e.g. keep a +module-local list it both registers and parametrizes), rather than relying on +cross-module global accumulation via `all_fixtures()`. + +Design decision — Full-message handling in auto_send +---------------------------------------------------- +auto_send posts a StreamTaskMessageFull (tool_request or tool_response) by +opening a streaming context with the full content and closing it immediately, +rather than calling adk.messages.create. This open+close approach is retained +because: + - StreamingTaskMessageContext.close() persists initial_content when no deltas + have been streamed, so the message IS correctly persisted. + - It mirrors the pattern already used by the real langgraph streaming helper + (now in _langgraph_turn.py), keeping behavioural parity. + - Switching to adk.messages.create would require an additional injectable + dependency, adding surface area for no observable benefit. +The conformance test treats this as an ACCEPTABLE envelope difference: at the +logical-content level, Full(ToolResponseContent) from yield and +Start(content)+Done from auto_send are equivalent. The recorded span signals are +identical because both adapters drive the same SpanDeriver.observe() call +sequence and forward every signal to their tracer. + +auto_send DELIVERS streamed tool-request messages (Start+Done): both channels +produce a LogicalDelivery for a streamed tool_request, and the cross-channel +assertion verifies it is delivered on both. +""" + +from __future__ import annotations + +import json +from typing import Any, NamedTuple, override +from dataclasses import dataclass + +from agentex.types.text_delta import TextDelta +from agentex.types.task_message import TaskMessage +from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.reasoning_content_delta import ReasoningContentDelta +from agentex.lib.core.harness.span_derivation import SpanDeriver + +from .._fakes import FakeTracing + + +@dataclass +class Fixture: + name: str + events: list[StreamTaskMessage] + + +_REGISTRY: list[Fixture] = [] + + +def register(fixture: Fixture) -> None: + _REGISTRY.append(fixture) + + +def all_fixtures() -> list[Fixture]: + return list(_REGISTRY) + + +def run_pure_async(coro: Any) -> Any: + """Drive a *pure* (I/O-free) coroutine to completion without an event loop. + + Conformance fixtures are built at import time so they can parametrize the + tests below. The fixture-building coroutines only iterate in-memory events + and never suspend on a real future, so we step them by hand instead of + ``asyncio.run()``. ``asyncio.run()`` at import raises ``RuntimeError`` when a + loop is already running (programmatic pytest, a Jupyter kernel, or a + session-scoped asyncio loop); this driver is unaffected by ambient loop + state. It raises if the coroutine ever suspends on real I/O. + """ + try: + coro.send(None) + except StopIteration as stop: + return stop.value + coro.close() + raise RuntimeError("conformance fixture build unexpectedly suspended on real I/O") + + +def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]: + d = SpanDeriver() + out: list[SpanSignal] = [] + for e in events: + out.extend(d.observe(e)) + out.extend(d.flush()) + return out + + +# --------------------------------------------------------------------------- +# Logical delivery normalisation +# --------------------------------------------------------------------------- + + +class LogicalDelivery(NamedTuple): + """A single logically-delivered message, channel-agnostic. + + `content_type` is the .type of the content (e.g. "text", "reasoning", + "tool_request", "tool_response"). `identity` is a frozenset of key=value + pairs that uniquely identify the content (e.g. tool_call_id for tool + messages, or index for text/reasoning). `payload` is a stable string + representation of the content callers actually consume: + - text: initial_content.content prepended to accumulated delta strings + - reasoning: initial_content.summary joined, prepended to accumulated + reasoning-content delta strings + - tool_request: JSON-sorted arguments from Start content + - tool_response: str(content) from Full event + """ + + content_type: str + identity: frozenset[tuple[str, Any]] + payload: str = "" + + +def _yield_logical_deliveries(events: list[StreamTaskMessage]) -> list[LogicalDelivery]: + """Extract logical deliveries from the yield channel's event list. + + The yield channel forwards events verbatim. A logical delivery is: + - A Full event (tool_request / tool_response): content delivered as-is. + - A Start + ... + Done sequence for text/reasoning/tool_request content. + + The `payload` field captures the content callers consume: + - text: initial_content.content (from Start) prepended to accumulated deltas + - reasoning: initial_content.summary joined (from Start) prepended to + accumulated reasoning-content deltas (this catches a channel that drops + the summary) + - tool_request: JSON-sorted arguments from the Start content (delivered on + both channels) + - tool_response: str(content) from Full event + """ + from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.tool_request_content import ToolRequestContent + + deliveries: list[LogicalDelivery] = [] + # Track which indices had a Start so we can pair with Done + started: dict[int, Any] = {} # index -> initial content + # Accumulate delta text per index (seed with initial_content text if present) + accumulated: dict[int, list[str]] = {} # index -> list of delta strings + + for event in events: + if isinstance(event, StreamTaskMessageStart): + if event.index is not None: + started[event.index] = event.content + # Seed accumulator with initial_content so a channel that drops + # initial_content but delivers deltas correctly will fail. + seed: list[str] = [] + if isinstance(event.content, TextContent) and event.content.content: + seed = [event.content.content] + elif isinstance(event.content, ReasoningContent) and event.content.summary: + seed = list(event.content.summary) + accumulated[event.index] = seed + elif isinstance(event, StreamTaskMessageDelta): + if event.index is not None and event.delta is not None: + if isinstance(event.delta, TextDelta) and event.delta.text_delta: + accumulated.setdefault(event.index, []).append(event.delta.text_delta) + elif isinstance(event.delta, ReasoningContentDelta) and event.delta.content_delta: + accumulated.setdefault(event.index, []).append(event.delta.content_delta) + elif isinstance(event, StreamTaskMessageDone): + if event.index is not None and event.index in started: + content = started.pop(event.index) + deltas = accumulated.pop(event.index, []) + ctype = getattr(content, "type", None) or "" + if ctype in ("text", "reasoning"): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset({("index", event.index)}), + payload="".join(deltas), + ) + ) + elif ctype == "tool_request" and isinstance(content, ToolRequestContent): + # auto_send delivers streamed tool-request messages. Emit a + # delivery here so the cross-channel assertion verifies it is + # present on both channels. + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + elif isinstance(event, StreamTaskMessageFull): + content = event.content + ctype = getattr(content, "type", None) or "" + if ctype == "tool_response": + from agentex.types.tool_response_content import ToolResponseContent + + if isinstance(content, ToolResponseContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=str(content.content), + ) + ) + elif ctype == "tool_request": + from agentex.types.tool_request_content import ToolRequestContent + + if isinstance(content, ToolRequestContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + + return deliveries + + +# --------------------------------------------------------------------------- +# Fake streaming backend for auto_send conformance runner +# --------------------------------------------------------------------------- + + +class _FakeCtx: + """Mirrors StreamingTaskMessageContext: __aenter__ opens, close() closes.""" + + def __init__(self, sink: list[Any], content_type: str, initial_content: Any) -> None: + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage( + id="msg-conformance", + task_id="conformance-task", + content=initial_content, + ) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.content_type, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update: Any) -> Any: + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Fake streaming backend; records every context lifecycle event.""" + + def __init__(self) -> None: + self.sink: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.sink.append(("ctx", ctype, initial_content)) + return _FakeCtx(self.sink, ctype, initial_content) + + +class _RecordingTracer(SpanTracer): + """SpanTracer that records every SpanSignal it actually receives. + + Each delivery channel calls `tracer.handle(signal)` for every signal it + derives from the stream, so `received_signals` captures what the channel + genuinely emitted — not a re-derivation. Comparing the two channels' + recorded lists catches regressions where a channel skips + `deriver.observe(event)` for some event type. + """ + + def __init__(self, tracing: Any) -> None: + super().__init__( + trace_id="conformance-trace", + parent_span_id="conformance-parent", + tracing=tracing, + ) + self.received_signals: list[SpanSignal] = [] + + @override + async def handle(self, signal: SpanSignal) -> None: + self.received_signals.append(signal) + await super().handle(signal) + + +async def _gen(events: list[StreamTaskMessage]): # type: ignore[return] + for e in events: + yield e + + +def _auto_send_logical_deliveries(sink: list[Any]) -> list[LogicalDelivery]: + """Extract logical deliveries from the auto_send fake streaming sink. + + Each context lifecycle in the sink looks like: + ("ctx", ctype, content) -- context created + ("open", ctype, content) -- context __aenter__ + [("update", delta), ...] -- optional deltas (StreamTaskMessageDelta) + ("close", ctype) -- context closed + + A logical delivery corresponds to each open+close pair. For text/reasoning + we identify by sequential position and build the payload by prepending the + initial_content text (TextContent.content) or summary (ReasoningContent.summary) + to accumulated deltas. This matches _yield_logical_deliveries so a channel + that drops initial_content or reasoning summary fails the comparison. + For tool messages we use tool_call_id + name and capture arguments/content. + """ + from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.tool_request_content import ToolRequestContent + from agentex.types.tool_response_content import ToolResponseContent + + deliveries: list[LogicalDelivery] = [] + open_idx = 0 + while open_idx < len(sink): + entry = sink[open_idx] + if entry[0] == "ctx": + ctype: str = entry[1] + content: Any = entry[2] + found_open = False + delta_parts: list[str] = [] + # Seed delta_parts with initial_content so payload comparison + # catches a channel that drops initial_content but delivers deltas. + if isinstance(content, TextContent) and content.content: + delta_parts = [content.content] + elif isinstance(content, ReasoningContent) and content.summary: + delta_parts = list(content.summary) + for j in range(open_idx + 1, len(sink)): + if sink[j][0] == "open" and sink[j][1] == ctype and not found_open: + found_open = True + elif found_open and sink[j][0] == "update": + # Accumulate delta content from StreamTaskMessageDelta + update = sink[j][1] + if isinstance(update, StreamTaskMessageDelta) and update.delta is not None: + if isinstance(update.delta, TextDelta) and update.delta.text_delta: + delta_parts.append(update.delta.text_delta) + elif isinstance(update.delta, ReasoningContentDelta) and update.delta.content_delta: + delta_parts.append(update.delta.content_delta) + elif sink[j][0] == "close" and sink[j][1] == ctype and found_open: + # Matched open+close: emit logical delivery with payload + if ctype in ("text", "reasoning"): + count = sum(1 for k in range(open_idx) if sink[k][0] == "ctx" and sink[k][1] == ctype) + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset({("seq", count)}), + payload="".join(delta_parts), + ) + ) + elif ctype == "tool_response": + if isinstance(content, ToolResponseContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=str(content.content), + ) + ) + elif ctype == "tool_request": + if isinstance(content, ToolRequestContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + open_idx = j + 1 + break + else: + open_idx += 1 + else: + open_idx += 1 + + return deliveries + + +def _yield_text_reasoning_seq(deliveries: list[LogicalDelivery]) -> list[LogicalDelivery]: + """Re-key text/reasoning deliveries from index-based to seq-based identity. + + The yield channel uses event.index as identity; auto_send uses a sequential + counter. To compare across channels, normalise both to sequential position + within each content type. + """ + result: list[LogicalDelivery] = [] + counts: dict[str, int] = {} + for d in deliveries: + if d.content_type in ("text", "reasoning"): + seq = counts.get(d.content_type, 0) + counts[d.content_type] = seq + 1 + result.append( + LogicalDelivery( + content_type=d.content_type, + identity=frozenset({("seq", seq)}), + payload=d.payload, + ) + ) + else: + result.append(d) + return result + + +async def run_cross_channel_conformance( + fixture: Fixture, +) -> tuple[list[LogicalDelivery], list[LogicalDelivery], list[SpanSignal], list[SpanSignal]]: + """Run both channels over a fixture; return (yield_deliveries, auto_deliveries, + yield_spans, auto_spans). + + The caller asserts yield_deliveries == auto_deliveries and + yield_spans == auto_spans. The span signals are the ones each channel's + tracer ACTUALLY recorded while delivering (not a re-derivation), so a + regression where a channel skips deriver.observe() for some event type is + caught. + """ + from agentex.lib.core.harness.auto_send import auto_send + from agentex.lib.core.harness.yield_delivery import yield_events + + # --- yield channel --- + tracer_yield = _RecordingTracer(tracing=FakeTracing()) + yield_out = [e async for e in yield_events(_gen(fixture.events), tracer=tracer_yield)] + + # Span signals the yield channel actually emitted to its tracer + yield_spans = tracer_yield.received_signals + + # Logical deliveries from yield output + yield_deliveries = _yield_text_reasoning_seq(_yield_logical_deliveries(yield_out)) + + # --- auto_send channel --- + tracer_auto = _RecordingTracer(tracing=FakeTracing()) + fake_streaming = _FakeStreaming() + await auto_send( + _gen(fixture.events), + task_id="conformance-task", + tracer=tracer_auto, + streaming=fake_streaming, + ) + + # Span signals the auto_send channel actually emitted to its tracer + auto_spans = tracer_auto.received_signals + + # Logical deliveries from what the streaming backend received + auto_deliveries = _auto_send_logical_deliveries(fake_streaming.sink) + + return yield_deliveries, auto_deliveries, yield_spans, auto_spans diff --git a/tests/lib/core/harness/conformance/test_claude_code_conformance.py b/tests/lib/core/harness/conformance/test_claude_code_conformance.py new file mode 100644 index 000000000..010bc530b --- /dev/null +++ b/tests/lib/core/harness/conformance/test_claude_code_conformance.py @@ -0,0 +1,192 @@ +"""Cross-channel conformance tests for the claude-code parser tap. + +Each fixture is a representative sequence of claude-code stream-json +envelopes, converted into canonical ``StreamTaskMessage*`` events via +``ClaudeCodeTurn``, then registered into the shared conformance runner. + +The conformance runner asserts two guarantees per fixture: + +1. **Logical-delivery equivalence**: ``yield_events`` and ``auto_send`` + produce the same logically-delivered message contents. + +2. **Span signal equivalence**: both channels emit the same ``SpanSignal`` + sequence to their ``SpanTracer``. + +Fixtures +-------- +text-only: single ``assistant`` text block +tool-call-result: ``tool_use`` block followed by ``tool_result`` +thinking-block: ``thinking`` block with full text +multi-step: text + tool_use + tool_result + text (two model turns) + +Note +---- +Relative imports are used throughout (runner.py and these fixtures live in the +same package). The per-module ``_FIXTURES`` list is both registered globally +(via ``register()``) and parametrized locally so this module's tests are +self-contained regardless of global registry ordering (see runner.py docstring). +""" + +from __future__ import annotations + +import pytest + +from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events + +from .runner import ( + Fixture, + register, + run_pure_async, + run_cross_channel_conformance, +) + +# --------------------------------------------------------------------------- +# Convert claude-code envelopes to StreamTaskMessage* events +# --------------------------------------------------------------------------- + + +async def _envelopes_to_events(envelopes: list[dict]) -> list: + """Drive convert_claude_code_to_agentex_events and collect all events.""" + + async def _aiter(items): # type: ignore[return] + for item in items: + yield item + + return [e async for e in convert_claude_code_to_agentex_events(_aiter(envelopes))] + + +# --------------------------------------------------------------------------- +# Fixture definitions (raw claude-code envelope sequences) +# --------------------------------------------------------------------------- + +_TEXT_ENVELOPES = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "The answer is 42."}]}, + } +] + +_TOOL_ENVELOPES = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_read", + "name": "Read", + "input": {"path": "/workspace/README.md"}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_read", + "content": "# My Project\n\nA great project.", + } + ] + }, + }, +] + +_THINKING_ENVELOPES = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "thinking", "thinking": "Let me think about this carefully.\nStep 1: check the facts."}, + {"type": "text", "text": "Here is my answer."}, + ] + }, + } +] + +_MULTI_STEP_ENVELOPES = [ + # Turn 1: text + tool call + { + "type": "assistant", + "message": { + "content": [ + {"type": "text", "text": "Let me look that up."}, + { + "type": "tool_use", + "id": "call_bash", + "name": "Bash", + "input": {"command": "cat /etc/hostname"}, + }, + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_bash", + "content": "myhost", + } + ] + }, + }, + # Turn 2: final text after tool result + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "The hostname is myhost."}]}, + }, +] + + +# --------------------------------------------------------------------------- +# Build fixtures from envelopes at module load time +# --------------------------------------------------------------------------- + + +async def _build_fixture(name: str, envelopes: list[dict]) -> Fixture: + events = await _envelopes_to_events(envelopes) + return Fixture(name=name, events=events) + + +# Fixtures must exist before pytest collects (they parametrize the test below), +# so they are built at import time. The conversion only iterates in-memory +# envelopes — it never suspends on a real future — so we drive the coroutines to +# completion with the shared loop-free ``run_pure_async`` driver instead of +# asyncio.run(), which raises RuntimeError at import when an event loop is +# already running (programmatic pytest, a Jupyter kernel, or session-scoped +# asyncio loops). +_FIXTURES: list[Fixture] = [ + run_pure_async(_build_fixture("claude-code-text-only", _TEXT_ENVELOPES)), + run_pure_async(_build_fixture("claude-code-tool-call-result", _TOOL_ENVELOPES)), + run_pure_async(_build_fixture("claude-code-thinking-block", _THINKING_ENVELOPES)), + run_pure_async(_build_fixture("claude-code-multi-step", _MULTI_STEP_ENVELOPES)), +] + +# Register into the shared registry so all_fixtures() can enumerate them +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance assertions +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """yield_events and auto_send must produce equivalent logical deliveries + and identical span signals for every claude-code fixture. + """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) diff --git a/tests/lib/core/harness/conformance/test_codex_conformance.py b/tests/lib/core/harness/conformance/test_codex_conformance.py new file mode 100644 index 000000000..d51a73584 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_codex_conformance.py @@ -0,0 +1,215 @@ +"""Conformance fixtures for the codex harness tap. + +Each fixture is derived from a ``CodexTurn`` and registered into the +cross-channel conformance runner so that span derivation is validated +alongside all other harness taps. + +Following the per-module registry pattern from runner.py: this module keeps +its own local list of fixtures, both registers them AND parametrizes over +them, to guarantee determinism regardless of pytest collection order. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import StreamTaskMessage +from agentex.lib.adk._modules._codex_sync import convert_codex_to_agentex_events + +from .runner import Fixture, register, run_pure_async + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(events: list[Any]) -> list[StreamTaskMessage]: + return [msg async for msg in convert_codex_to_agentex_events(_aiter(events))] + + +def _build(events: list[Any]) -> list[StreamTaskMessage]: + # Loop-free driver: this runs at import time, where asyncio.run() would raise + # under an already-running loop (programmatic pytest, notebooks). + return run_pure_async(_collect(events)) + + +# --------------------------------------------------------------------------- +# Fixture 1: plain text response +# --------------------------------------------------------------------------- + +_CODEX_TEXT = Fixture( + name="codex-text", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-abc"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.updated", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello, world"}, + }, + { + "type": "item.completed", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello, world!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + }, + ] + ), +) +register(_CODEX_TEXT) + +# --------------------------------------------------------------------------- +# Fixture 2: tool call (command_execution) +# --------------------------------------------------------------------------- + +_CODEX_TOOL = Fixture( + name="codex-tool-command", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-cmd"}, + { + "type": "item.started", + "item": { + "id": "tool1", + "type": "command_execution", + "command": "ls /workspace", + }, + }, + { + "type": "item.completed", + "item": { + "id": "tool1", + "type": "command_execution", + "command": "ls /workspace", + "aggregated_output": "file1.txt\nfile2.py", + "exit_code": 0, + }, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}, + }, + ] + ), +) +register(_CODEX_TOOL) + +# --------------------------------------------------------------------------- +# Fixture 3: reasoning block +# --------------------------------------------------------------------------- + +_CODEX_REASONING = Fixture( + name="codex-reasoning", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-reason"}, + { + "type": "item.started", + "item": {"id": "r1", "type": "reasoning", "text": ""}, + }, + { + "type": "item.updated", + "item": {"id": "r1", "type": "reasoning", "text": "Step 1: analyze the problem"}, + }, + { + "type": "item.completed", + "item": { + "id": "r1", + "type": "reasoning", + "text": "Step 1: analyze the problem\nStep 2: solve it", + }, + }, + { + "type": "item.started", + "item": {"id": "msg2", "type": "agent_message", "text": ""}, + }, + { + "type": "item.completed", + "item": {"id": "msg2", "type": "agent_message", "text": "The answer is 42."}, + }, + { + "type": "turn.completed", + "usage": { + "input_tokens": 30, + "output_tokens": 20, + "reasoning_tokens": 50, + "total_tokens": 100, + }, + }, + ] + ), +) +register(_CODEX_REASONING) + +# --------------------------------------------------------------------------- +# Fixture 4: multi-step (mcp_tool_call + follow-up text) +# --------------------------------------------------------------------------- + +_CODEX_MULTI = Fixture( + name="codex-multi-step", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-multi"}, + { + "type": "item.started", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "filesystem", + "tool": "read_file", + "arguments": {"path": "/workspace/README.md"}, + }, + }, + { + "type": "item.completed", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "filesystem", + "tool": "read_file", + "arguments": {"path": "/workspace/README.md"}, + "result": {"content": "# My Project"}, + }, + }, + { + "type": "item.started", + "item": {"id": "msg3", "type": "agent_message", "text": "The README says:"}, + }, + { + "type": "item.completed", + "item": { + "id": "msg3", + "type": "agent_message", + "text": "The README says: # My Project", + }, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 50, "output_tokens": 30, "total_tokens": 80}, + }, + ] + ), +) +register(_CODEX_MULTI) + + +# --------------------------------------------------------------------------- +# Local parametrized tests (cross-channel conformance) +# --------------------------------------------------------------------------- + +_LOCAL_FIXTURES = [_CODEX_TEXT, _CODEX_TOOL, _CODEX_REASONING, _CODEX_MULTI] + + +@pytest.mark.parametrize("fixture", _LOCAL_FIXTURES, ids=lambda f: f.name) +def test_codex_events_are_non_empty(fixture: Fixture) -> None: + """Every codex fixture yields at least one StreamTaskMessage*.""" + assert len(fixture.events) > 0 diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py new file mode 100644 index 000000000..7c79f9397 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -0,0 +1,299 @@ +"""Cross-channel conformance tests: yield_events vs auto_send. + +What is asserted +---------------- +For each fixture the conformance runner drives BOTH delivery channels and +verifies two guarantees: + +1. **Logical-delivery equivalence**: the sequence of logically-delivered + messages is identical across channels. "Logical" normalises away the + streaming-envelope difference: + - yield channel delivers StreamTaskMessageFull(ToolResponseContent) as-is. + - auto_send delivers the same tool-response by opening a streaming context + with the full content and closing it immediately. + Both collapse to LogicalDelivery(content_type, identity, payload) tuples + that compare equal. The payload includes initial_content (TextContent.content + and ReasoningContent.summary) so a channel that drops initial content fails. + +2. **Span signal equivalence**: both channels feed the same pure SpanDeriver + over the same event sequence, so the derived span signals must be identical. + +What is NOT asserted +-------------------- +Raw wire-level event shapes are NOT compared (that would fail by design: the +Full vs Start+Done envelope difference is a documented, acceptable choice in +auto_send — see runner.py for the rationale). + +auto_send delivers streamed tool-request messages: both channels produce a +delivery for streamed tool_request, verified by the "streamed-tool-request" +fixture. +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import ( + Fixture, + register, + derive_all, + all_fixtures, + run_cross_channel_conformance, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +_FIXTURES: list[Fixture] = [ + # fixture 1: single tool call — tool_request delivered via Full (classic path) + # plus a streamed tool_response via Full. Both channels should deliver both. + Fixture( + name="builtin-single-tool", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ], + ), + # fixture 2: streaming text — exercises the text start/delta/done path. + # Uses non-empty initial_content so the payload comparison catches a channel + # that drops StreamTaskMessageStart.content (Greptile id 3438655533, P1). + Fixture( + name="streaming-text", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content="Init"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hello"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta=" world"), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ), + # fixture 3: reasoning block — exercises reasoning span open/close + delivery. + # ReasoningContent.summary is included in the payload so a channel that drops + # the reasoning-summary fails (Greptile id 3438655533, P1). + Fixture( + name="reasoning-block", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["Thinking..."], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="step 1", + ), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ), + # fixture 4: streamed tool_request — tool_request delivered via Start+Done + # (no Full). Both channels must produce a LogicalDelivery for this fixture. + Fixture( + name="streamed-tool-request", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="tr-1", + name="Read", + arguments={"path": "/tmp/foo"}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="tr-1", + name="Read", + content="file contents", + ), + ), + ], + ), + # fixture 5: parallel tool calls + a tool that errors (AGX1-373 review, + # danielmillerp). The earlier fixtures only exercise one tool at a time, so + # equivalence is proven over trivially-orderable streams. This stresses the + # representative case: two tool spans open SIMULTANEOUSLY (p-ls opens via the + # streamed Start+Done path, p-read opens via Full while p-ls is still open), + # then close in a different order than they opened, and one of them returns + # an error. It guards against the two channels agreeing with each other while + # both mishandling interleaved/parallel spans or a failing tool. + # + # The failing tool sets ToolResponseContent.is_error=True (AGX1-371), which + # the span deriver threads onto the closed tool span's CloseSpan.is_error. + # Both channels feed the same deriver, so the recorded span signals — error + # status included — must match. + Fixture( + name="parallel-tools-with-error", + events=[ + # p-ls: streamed tool_request (opens its span at Done). + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="p-ls", + name="Bash", + arguments={"command": "ls /nope"}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + # p-read: Full tool_request opens a second span while p-ls is open. + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="p-read", + name="Read", + arguments={"path": "/etc/hosts"}, + ), + ), + # p-ls errors and closes first (close order != open order). + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="p-ls", + name="Bash", + content="Error: ls: /nope: No such file or directory", + is_error=True, + ), + ), + # p-read succeeds and closes second. + StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="p-read", + name="Read", + content="127.0.0.1 localhost", + ), + ), + ], + ), +] + +# Register all fixtures for backward-compatible use via all_fixtures() +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for every fixture. + + This is the real cross-channel guarantee: the two delivery adapters + agree on WHAT was delivered (logical content) and HOW spans were derived, + even though their streaming-envelope shapes differ (Full vs Start+Done for + tool messages). + + The span signals are the ones each channel's tracer ACTUALLY recorded while + delivering, not a re-derivation, so a regression where one channel skips + deriver.observe() for some event type is caught here. + """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) + + +# --------------------------------------------------------------------------- +# Backward-compatible determinism test (kept for regression coverage) +# --------------------------------------------------------------------------- + + +def test_span_derivation_is_deterministic() -> None: + """Span derivation over the same event list is idempotent, for EVERY + registered fixture across all harnesses. + + ``all_fixtures()`` is read at run time (not at collection/parametrize time) + so it sees fixtures registered by every conformance module, regardless of + import/collection order. The per-harness conformance modules are imported + eagerly via ``conftest.py`` in this directory, so this test covers the full + cross-harness fixture set even when run in isolation. (Parametrizing on + ``all_fixtures()`` at import time would freeze the set to whatever happened + to be registered before this module was collected.) + + Retained as a lightweight regression guard. The primary cross-channel + guarantee is asserted in test_cross_channel_equivalence above. + """ + fixtures = all_fixtures() + assert len(fixtures) > len(_FIXTURES), ( + "expected per-harness fixtures to be registered in addition to the " + f"{len(_FIXTURES)} generic ones; got {len(fixtures)} total — a conformance " + "module's fixtures are not being registered (check conftest imports)" + ) + for fixture in fixtures: + assert derive_all(fixture.events) == derive_all(fixture.events), ( + f"[{fixture.name}] span derivation is not deterministic" + ) diff --git a/tests/lib/core/harness/conformance/test_langgraph_conformance.py b/tests/lib/core/harness/conformance/test_langgraph_conformance.py new file mode 100644 index 000000000..a8d43aef6 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_langgraph_conformance.py @@ -0,0 +1,218 @@ +"""Cross-channel conformance fixtures for LangGraph harness tap. + +Each fixture is built as a canonical sequence of ``StreamTaskMessage*`` events +that matches what ``convert_langgraph_to_agentex_events`` (via ``LangGraphTurn``) +emits for the given scenario. The fixtures are registered with the shared +conformance runner and exercised by both the cross-channel equivalence test +(yield_events vs auto_send) and the backward-compatible span-derivation test. + +LangGraph-specific note +----------------------- +LangGraph emits tool *requests* as ``StreamTaskMessageFull`` events (from the +"updates" stream), NOT as Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events by opening a streaming context with the full content and +closing it immediately, so both channels deliver the same logical payload. +No ``coalesce_tool_requests`` option is needed. +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import Fixture, register, run_cross_channel_conformance + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY = Fixture( + name="langgraph-text-only", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hello from LangGraph!"), + ), + StreamTaskMessageDone(type="done", index=0), + ], +) + +_SINGLE_TOOL = Fixture( + name="langgraph-single-tool", + events=[ + # LangGraph tool request is a Full event (from "updates" stream) + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_1", + name="get_weather", + arguments={"city": "Paris"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_1", + name="get_weather", + content="Sunny, 72F", + ), + ), + StreamTaskMessageStart( + type="start", + index=2, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=2, + delta=TextDelta(type="text", text_delta="The weather in Paris is sunny, 72F."), + ), + StreamTaskMessageDone(type="done", index=2), + ], +) + +_REASONING = Fixture( + name="langgraph-reasoning", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="Thinking about this...", + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="The answer is 42."), + ), + StreamTaskMessageDone(type="done", index=1), + ], +) + +_MULTI_STEP = Fixture( + name="langgraph-multi-step", + events=[ + # Turn 1: streaming text + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Let me search for that."), + ), + StreamTaskMessageDone(type="done", index=0), + # Tool request (Full — from "updates" stream) + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_2", + name="search", + arguments={"query": "langgraph"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_2", + name="search", + content="LangGraph is a framework for...", + ), + ), + # Turn 2: final streaming text + StreamTaskMessageStart( + type="start", + index=3, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=3, + delta=TextDelta(type="text", text_delta="Based on my research, LangGraph is..."), + ), + StreamTaskMessageDone(type="done", index=3), + ], +) + +_LANGGRAPH_FIXTURES = [_TEXT_ONLY, _SINGLE_TOOL, _REASONING, _MULTI_STEP] + +for _fixture in _LANGGRAPH_FIXTURES: + register(_fixture) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _LANGGRAPH_FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for each LangGraph fixture. + + See runner.py for the full contract. The key LangGraph difference: tool + requests arrive as Full events rather than Start+Delta+Done, so auto_send + handles them by opening a streaming context with the full content and + closing it immediately — both channels produce the same LogicalDelivery. + """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) diff --git a/tests/lib/core/harness/conformance/test_openai_conformance.py b/tests/lib/core/harness/conformance/test_openai_conformance.py new file mode 100644 index 000000000..e8630ca7f --- /dev/null +++ b/tests/lib/core/harness/conformance/test_openai_conformance.py @@ -0,0 +1,206 @@ +"""OpenAI conformance fixtures for the shared harness span-derivation engine. + +The cross-channel guarantee is that yield-delivery and auto_send observe the +SAME canonical StreamTaskMessage* stream, so span derivation and logical +delivery over that stream must be equivalent regardless of channel. These +fixtures express the canonical sequences an OpenAI turn produces (text, +tool-call, reasoning, and a combined multi-step turn) and assert that property +via run_cross_channel_conformance. + +Registry hazard (see conformance/runner.py): _REGISTRY is process-global and +collection order across modules is not guaranteed. To stay deterministic this +module keeps its OWN fixture list and parametrizes over THAT list, rather than +over all_fixtures(). It still calls register() so the cross-module conformance +suite can see these fixtures too. +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import Fixture, register, run_cross_channel_conformance + +_OPENAI_FIXTURES: list[Fixture] = [] + + +def _add(fixture: Fixture) -> None: + """Register both module-locally (for parametrization) and globally.""" + _OPENAI_FIXTURES.append(fixture) + register(fixture) + + +# Text-only turn: start -> deltas -> done. +# Uses non-empty initial_content so payload comparison catches a channel that +# drops StreamTaskMessageStart.content. +_add( + Fixture( + name="openai-text-only", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content="Init"), + ), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")), + StreamTaskMessageDone(type="done", index=0), + ], + ) +) + +# Tool-call turn: Full(ToolRequestContent) for the call + Full(ToolResponseContent) +# for the result, matched by tool_call_id. Mirrors the OpenAI converter's tool path. +_add( + Fixture( + name="openai-tool-call", + events=[ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_1", + name="get_weather", + arguments={"city": "SF"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_1", + name="get_weather", + content="72F", + ), + ), + ], + ) +) + +# Reasoning turn: start(ReasoningContent) -> content deltas -> done. +# ReasoningContent.summary is seeded in the payload so a channel that drops the +# summary fails the cross-channel comparison. +_add( + Fixture( + name="openai-reasoning", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["Thinking..."], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="step 1", + ), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ) +) + +# Multi-step turn: reasoning, then a tool round, then the final answer text. +_add( + Fixture( + name="openai-multi-step", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["plan"], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="elaboration", + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_2", + name="search", + arguments={"q": "x"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_2", + name="search", + content="result", + ), + ), + StreamTaskMessageStart( + type="start", + index=3, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta(type="delta", index=3, delta=TextDelta(type="text", text_delta="done")), + StreamTaskMessageDone(type="done", index=3), + ], + ) +) + + +@pytest.mark.parametrize("fixture", _OPENAI_FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_openai_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for every OpenAI fixture. + + This is the cross-channel guarantee: the two delivery adapters agree on + WHAT was delivered (logical content) and HOW spans were derived, even + though their streaming-envelope shapes differ (Full vs Start+Done for tool + messages). + + The span signals are the ones each channel's tracer ACTUALLY recorded while + delivering, not a re-derivation, so a regression where one channel skips + deriver.observe() for some event type is caught here. + """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) diff --git a/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py b/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py new file mode 100644 index 000000000..5d9952334 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py @@ -0,0 +1,187 @@ +"""Cross-channel conformance fixtures derived from real pydantic-ai event sequences. + +Each fixture is built by running a pydantic_ai event stream through PydanticAITurn +and collecting the canonical StreamTaskMessage* output. These canonical event lists are +then registered with the conformance runner and exercised by the cross-channel test +(yield_events vs auto_send). + +Streamed tool requests +---------------------- +The pydantic-ai stream emits a tool REQUEST as Start + ToolRequestDelta + Done (not a +Full event). Both the conformance runner and auto_send deliver the +Start+Delta+Done(tool_request) shape, so the cross-channel test asserts full +delivery-equivalence for streamed tool requests. The fixtures below retain the +ToolRequestDelta events as the streamed tool-request inputs. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + ThinkingPart, + ToolCallPart, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, + ToolReturnPart, + ThinkingPartDelta, + ToolCallPartDelta, + FunctionToolResultEvent, +) + +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +from .runner import ( + Fixture, + register, + run_pure_async, + run_cross_channel_conformance, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _canonical(pydantic_events: list[Any]) -> list[Any]: + """Run pydantic_ai events through PydanticAITurn and collect the output. + + The output equals the bare convert_pydantic_ai_to_agentex_events output. + """ + turn = PydanticAITurn(_aiter(pydantic_events), model=None) + return [e async for e in turn.events] + + +def _build_fixtures() -> list[Fixture]: + """Build all pydantic-ai conformance fixtures synchronously at import time. + + Uses the loop-free ``run_pure_async`` driver rather than ``asyncio.run()``, + which would raise under an already-running loop (programmatic pytest, + notebooks) since this runs during module import. + """ + + # ------------------------------------------------------------------ # + # 1. Text-only run: simple streaming text response. + # ------------------------------------------------------------------ # + text_only_pydantic = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Hello, ")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="world!")), + PartEndEvent(index=0, part=TextPart(content="Hello, world!")), + ] + + # ------------------------------------------------------------------ # + # 2. Single tool call + tool response. + # The canonical stream emits Start+ToolRequestDelta+Done for the request + # and Full(ToolResponseContent) for the response. Both are asserted + # delivery-equivalent cross-channel (see the module docstring). + # ------------------------------------------------------------------ # + tool_call_pydantic = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="call_01"), + ), + PartDeltaEvent( + index=0, + delta=ToolCallPartDelta(args_delta='{"city":"Paris"}', tool_call_id="call_01"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="call_01"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Sunny, 22C", tool_call_id="call_01"), + ), + ] + + # ------------------------------------------------------------------ # + # 3. Reasoning/thinking block: produces ReasoningContent Start+Delta+Done. + # ------------------------------------------------------------------ # + reasoning_pydantic = [ + PartStartEvent(index=0, part=ThinkingPart(content="")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta="First, let me think...")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta=" Then conclude.")), + PartEndEvent(index=0, part=ThinkingPart(content="First, let me think... Then conclude.")), + ] + + # ------------------------------------------------------------------ # + # 4. Multi-step run: text -> tool call + response -> text. + # Pydantic AI restarts part indices at 0 for each model response; the + # converter assigns globally-monotonic indices to Agentex messages. + # ------------------------------------------------------------------ # + multi_step_pydantic = [ + # First model turn: text then tool call + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Let me check the weather.")), + PartEndEvent(index=0, part=TextPart(content="Let me check the weather.")), + PartStartEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="call_ms1"), + ), + PartDeltaEvent( + index=1, + delta=ToolCallPartDelta(args_delta='{"city":"London"}', tool_call_id="call_ms1"), + ), + PartEndEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args='{"city":"London"}', tool_call_id="call_ms1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Cloudy, 15C", tool_call_id="call_ms1"), + ), + # Second model turn: text response (pydantic restarts index at 0) + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="It's cloudy and 15C in London.")), + PartEndEvent(index=0, part=TextPart(content="It's cloudy and 15C in London.")), + ] + + text_only_events = run_pure_async(_canonical(text_only_pydantic)) + tool_call_events = run_pure_async(_canonical(tool_call_pydantic)) + reasoning_events = run_pure_async(_canonical(reasoning_pydantic)) + multi_step_events = run_pure_async(_canonical(multi_step_pydantic)) + + return [ + Fixture(name="pydantic-ai-text-only", events=text_only_events), + Fixture(name="pydantic-ai-single-tool-call", events=tool_call_events), + Fixture(name="pydantic-ai-reasoning-block", events=reasoning_events), + Fixture(name="pydantic-ai-multi-step", events=multi_step_events), + ] + + +_FIXTURES: list[Fixture] = _build_fixtures() + +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for each pydantic-ai fixture. + + See runner.py for the full contract, including streamed-tool-request + delivery equivalence. + """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) diff --git a/tests/lib/core/harness/test_auto_send.py b/tests/lib/core/harness/test_auto_send.py new file mode 100644 index 000000000..764dae8b3 --- /dev/null +++ b/tests/lib/core/harness/test_auto_send.py @@ -0,0 +1,479 @@ +"""Tests for auto_send delivery adapter. + +The fake mirrors the real StreamingTaskMessageContext API exactly: +- streaming_task_message_context(...) returns a context object (synchronously) +- open the context via __aenter__ (returns self after creating the task message) +- stream deltas via ctx.stream_update(StreamTaskMessageDelta(...)) +- close via ctx.close() (NOT __aexit__) + +This mirrors _langgraph_async.py lines 62-78 and 100-127. +""" + +from datetime import datetime + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_delta import TextDelta +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.core.harness.auto_send import auto_send +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + +from ._fakes import FakeTracing + + +class _FakeCtx: + """Mirrors StreamingTaskMessageContext: __aenter__ opens (returns self with task_message set), + close() closes. stream_update records the call. + + task_message is a real TaskMessage instance so that auto_send can use it + as parent_task_message in StreamTaskMessageDelta without Pydantic validation errors. + """ + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + # Real TaskMessage so StreamTaskMessageDelta(parent_task_message=...) passes validation + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + # __aexit__ delegates to close in the real impl; keep for safety + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Mirrors StreamingService: streaming_task_message_context returns a context object.""" + + def __init__(self): + self.sink = [] + self.recorded_created_at: list[datetime | None] = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + self.recorded_created_at.append(created_at) + return _FakeCtx(self.sink, ctype, initial_content) + + +async def _gen(events): + for e in events: + yield e + + +# --------------------------------------------------------------------------- +# Test 1: text streaming — open, stream deltas, close; return accumulated text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_text_and_returns_final_text(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hel"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="lo"), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "Hello" + + kinds = [s[0] for s in streaming.sink] + # A context was created for the text content + assert kinds[0] == "ctx" + # It was opened and closed + assert "open" in kinds + assert "close" in kinds + # Exactly two updates were streamed (one per delta) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + +# --------------------------------------------------------------------------- +# Test 2: tool_request Full + tool_response Full — each posts one full message +# (open context with the content, no deltas, close immediately) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_posts_full_tool_messages(): + streaming = _FakeStreaming() + events = [ + # Two Full events post two messages (open+close immediately, no deltas). + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="file.py", + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + # Each Full event opens and closes exactly one context. + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + content_types = [s[1] for s in ctx_events] + assert content_types == ["tool_request", "tool_response"] + + # Each context is opened and closed + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 2 + assert len(closes) == 2 + + # No stream_update calls (full messages have no deltas) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 0 + + +# --------------------------------------------------------------------------- +# Test 3: tracing — spans are derived and handed to the tracer +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_derives_tool_spans_via_tracer(): + fake_tracing = FakeTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake_tracing) + streaming = _FakeStreaming() + + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="ok", + ), + ), + ] + + result = await auto_send(_gen(events), task_id="task1", tracer=tracer, streaming=streaming) + + assert result.final_text == "" + assert fake_tracing.started_names == ["Bash"] + assert fake_tracing.ended_outputs == ["ok"] + + +# --------------------------------------------------------------------------- +# Test 4: text followed by a tool Full — text context is closed before Full +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_closes_text_context_before_full_message(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hi"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c2", + name="read_file", + arguments={}, + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Hi" + + # Verify ordering: text ctx opens, updates, closes; then tool_request ctx opens, closes + event_sequence = [(s[0], s[1]) for s in streaming.sink] + text_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "text")) + text_close_idx = next(i for i, s in enumerate(event_sequence) if s == ("close", "text")) + tool_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "tool_request")) + assert text_open_idx < text_close_idx < tool_open_idx + + +# --------------------------------------------------------------------------- +# Test 5: midstream error — propagates AND the open context is closed (finally) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_open_context_closed_on_midstream_error(): + streaming = _FakeStreaming() + + async def _exploding_gen(): + yield StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ) + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + await auto_send(_exploding_gen(), task_id="task1", tracer=None, streaming=streaming) + + # The text context that was opened mid-stream was closed by the finally block. + assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] + assert ("close", "text") in [(s[0], s[1]) for s in streaming.sink] + + +# --------------------------------------------------------------------------- +# Test 6: streamed tool_request delivered +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_tool_request(): + """A Start(ToolRequestContent) MUST open a streaming context.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_tool", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta( + type="tool_request", + tool_call_id="c_tool", + name="Bash", + arguments_delta='{"cmd": "ls"}', + ), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 1 + assert ctx_events[0][1] == "tool_request" + + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 1 + assert len(closes) == 1 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 1 + + +# --------------------------------------------------------------------------- +# Test 7: interleaved indexes route correctly +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_interleaved_indexes_route_correctly(): + """Deltas must be routed to the correct index-keyed context.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="A"), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="B"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + + opens = [s for s in streaming.sink if s[0] == "open"] + assert len(opens) == 2 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + update_deltas = [s[1].delta for s in streaming.sink if s[0] == "update"] + text_deltas = [d.text_delta for d in update_deltas if isinstance(d, TextDelta)] + assert set(text_deltas) == {"A", "B"} + + +# --------------------------------------------------------------------------- +# Test 8: final_text returns last text segment for multi-step +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_final_text_last_segment(): + """final_text must be the LAST text segment, not accumulated across all turns.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="First"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="Second"), + ), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Second" + + +# --------------------------------------------------------------------------- +# Test 9: Full(TextContent) contributes to final_text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_full_text_content_sets_final_text(): + """A Full(TextContent) must contribute its text to final_text.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=TextContent(type="text", author="agent", content="hello"), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "hello" + + +# --------------------------------------------------------------------------- +# Test 10: created_at is forwarded to streaming context +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_created_at_forwarded(): + """created_at must be forwarded to every streaming_task_message_context call.""" + streaming = _FakeStreaming() + dt = datetime(2025, 1, 15, 12, 0, 0) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_ts", + name="Bash", + arguments={}, + ), + ), + ] + await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming, created_at=dt) + + assert all(ts == dt for ts in streaming.recorded_created_at) diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py new file mode 100644 index 000000000..3f70660ec --- /dev/null +++ b/tests/lib/core/harness/test_emitter.py @@ -0,0 +1,142 @@ +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + +from ._fakes import FakeTracing + + +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake (see test_auto_send.py).""" + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +class _Turn: + def __init__(self, events_list, usage): + self._events_list = events_list + self._usage = usage + + @property + async def events(self): + for e in self._events_list: + yield e + + def usage(self): + return self._usage + + +@pytest.mark.asyncio +async def test_emitter_yield_mode_passes_through(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, TurnUsage(model="m")) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +@pytest.mark.asyncio +async def test_emitter_tracing_default_on_when_trace_id_present(): + # Inject a fake tracing backend so the test env doesn't need temporalio. + # This exercises the default-on path (tracer=None) when trace_id is truthy. + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracing=FakeTracing()) + assert emitter.tracer is not None + + +@pytest.mark.asyncio +async def test_emitter_tracing_overridable_off(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) + assert emitter.tracer is None + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_returns_usage(): + usage = TurnUsage(model="m", input_tokens=5) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hello")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, usage) + fake = _FakeStreaming() + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=fake) + result = await emitter.auto_send_turn(turn) + assert result.usage == usage + assert result.final_text == "Hello" + + +class _ContractTurn: + """A turn that honors the single-pass contract: usage() is the empty default + UNTIL `events` is exhausted, then the real usage (this is how real harness + turns behave — they populate usage while the stream is consumed).""" + + def __init__(self, events_list, real_usage): + self._events_list = events_list + self._real_usage = real_usage + self._exhausted = False + + @property + async def events(self): + for e in self._events_list: + yield e + self._exhausted = True + + def usage(self): + return self._real_usage if self._exhausted else TurnUsage(model="m") + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_reads_usage_after_exhaustion(): + # Regression: auto_send_turn must read turn.usage() AFTER consuming the + # stream, not eagerly when building the auto_send call (which would capture + # the empty default and lose real token usage on the auto_send path). + real_usage = TurnUsage(model="m", input_tokens=11, output_tokens=22, total_tokens=33, num_llm_calls=2) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _ContractTurn(events, real_usage) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=_FakeStreaming()) + result = await emitter.auto_send_turn(turn) + assert result.usage == real_usage + assert result.usage.input_tokens == 11 and result.usage.total_tokens == 33 diff --git a/tests/lib/core/harness/test_harness_langgraph_async.py b/tests/lib/core/harness/test_harness_langgraph_async.py new file mode 100644 index 000000000..39bf5bc66 --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_async.py @@ -0,0 +1,298 @@ +"""Integration test: async (Redis-streaming) channel with a LangGraph agent. + +Exercises the unified harness surface (UnifiedEmitter.auto_send_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The async handler pushes the correct sequence of messages to the fake streaming + backend: Full(ToolRequest) + Full(ToolResponse) + text Start/Delta/Done. +- final_text accumulates all text (not just last segment — AGX1-377 unified behavior). +- Tool messages go through streaming_task_message_context (not messages.create). +- With a SpanTracer, no tool spans are produced (AGX1-377: Full events are not + handled by SpanDeriver today). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Redis streaming (requires a running Redis instance). +- The ACP on_task_event_send / on_task_create / on_task_cancel lifecycle. +- Real LLM calls or real LangGraph graph execution. +- The full FastACP async request lifecycle. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_temporal.py +for the other two channels. +""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnResult +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend (replaces adk.streaming; no Redis required) +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: + ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span(self, *, trace_id: str, name: str, **kw: Any) -> _FakeSpan: + self.started.append((name, kw.get("parent_id"))) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_auto_send_turn( + stream_events: list[tuple[str, Any]], + trace_id: str | None = None, +) -> tuple[TurnResult, _FakeStreaming, _FakeTracing | None]: + fake_streaming = _FakeStreaming() + fake_tracing = _FakeTracing() if trace_id else None + + tracer: SpanTracer | bool = False + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + turn = LangGraphTurn(_make_stream(stream_events), model=None) + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestAsyncAutoSendChannel: + async def test_text_only_streams_text_and_returns_final(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + assert result.final_text == "Hello from LangGraph!" + text_ctxs = [c for c in fake_streaming.contexts if c.ctype == "text"] + assert len(text_ctxs) == 1 + assert text_ctxs[0].closed is True + + async def test_tool_call_posted_via_streaming_context(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # Tool request via streaming_task_message_context (Full event) + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + assert len(tool_req_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.tool_call_id == "call_1" + assert tool_req_ctxs[0].closed is True + assert tool_req_ctxs[0].deltas == [], "Full messages have no deltas" + + async def test_tool_response_posted_via_streaming_context(self): + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + + _, fake_streaming, _ = await _run_auto_send_turn(events) + + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_resp_ctxs) == 1 + assert tool_resp_ctxs[0].initial_content.content == "Sunny, 72F" + assert tool_resp_ctxs[0].closed is True + + async def test_multi_step_final_text_is_last_segment(self): + """Unified surface: final_text uses last-segment semantics. + + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + This matches the behaviour documented in auto_send.py and mirrors + stream_pydantic_ai_events. + """ + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "s", "args": {}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="s") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # Last segment only — first text segment is NOT in final_text + assert result.final_text == "Found it!" + + # Two text streaming contexts still opened (both streamed to Redis) + text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + + async def test_empty_stream_returns_empty_final_text(self): + result, fake_streaming, _ = await _run_auto_send_turn([]) + assert result.final_text == "" + assert fake_streaming.contexts == [] + + async def test_turn_usage_populated_after_events_consumed(self): + """LangGraphTurn.usage() is populated via the on_final_ai_message callback + during event iteration. TurnResult.usage is a snapshot from before events run + (emitter.auto_send_turn evaluates turn.usage() eagerly); the authoritative + post-iteration usage is on turn.usage() directly.""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="hi", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter( + task_id="task1", trace_id=None, parent_span_id=None, tracer=False, streaming=fake_streaming + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events (request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). + """ + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, _, fake_tracing = await _run_auto_send_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" diff --git a/tests/lib/core/harness/test_harness_langgraph_sync.py b/tests/lib/core/harness/test_harness_langgraph_sync.py new file mode 100644 index 000000000..9f67dd2b6 --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_sync.py @@ -0,0 +1,229 @@ +"""Integration test: sync (HTTP-yield) channel with a LangGraph agent. + +Exercises the unified harness surface (UnifiedEmitter.yield_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The sync handler correctly yields StreamTaskMessage* events in order: + Full(ToolRequest) then Full(ToolResponse) then text Start+Delta+Done. +- With trace_id + fake tracing, the SpanDeriver fires for text events. +- LangGraph emits tool calls as Full events (not Start+Done); the SpanDeriver + opens a tool span on Full(ToolRequestContent) and closes it on the matching + Full(ToolResponseContent) (see test_tracer_produces_tool_spans_for_full_events). +- Final text is accumulated via yield mode. + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual HTTP streaming over the ACP sync endpoint. +- Real LLM calls or real LangGraph graph execution. +- The full FastACP request/response lifecycle. + +See also: test_harness_langgraph_async.py and test_harness_langgraph_temporal.py +for the other two channels. +""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, *, trace_id: str, name: str, input: Any = None, parent_id: Any = None, **kw: Any + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_yield_turn( + stream_events: list[tuple[str, Any]], trace_id: str | None = None +) -> tuple[list[Any], _FakeTracing | None]: + fake_tracing = _FakeTracing() if trace_id else None + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer if tracer is not None else False, + ) + turn = LangGraphTurn(_make_stream(stream_events), model=None) + out = [e async for e in emitter.yield_turn(turn)] + return out, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestSyncYieldChannel: + async def test_text_only_stream_yields_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + out, _ = await _run_yield_turn(events) + + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_tool_call_yields_full_events(self): + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + out, _ = await _run_yield_turn(events) + + full_events = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(full_events) == 2 + + contents = [e.content for e in full_events] + assert any(isinstance(c, ToolRequestContent) for c in contents) + assert any(isinstance(c, ToolResponseContent) for c in contents) + + async def test_multi_step_yields_events_in_order(self): + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "search", "args": {"q": "test"}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + out, _ = await _run_yield_turn(events) + + # Should have multiple start events (one per text segment) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) >= 2 + # And two Full events (tool req + tool resp) + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(fulls) == 2 + + async def test_empty_stream_yields_nothing(self): + out, _ = await _run_yield_turn([]) + assert out == [] + + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events (request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). + """ + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, fake_tracing = await _run_yield_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" + + async def test_usage_captured_after_yield(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + _ = [e async for e in emitter.yield_turn(turn)] + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 diff --git a/tests/lib/core/harness/test_harness_langgraph_temporal.py b/tests/lib/core/harness/test_harness_langgraph_temporal.py new file mode 100644 index 000000000..1a094a33c --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_temporal.py @@ -0,0 +1,233 @@ +"""Integration test: Temporal channel with a LangGraph agent. + +The Temporal LangGraph agent pattern uses ``emit_langgraph_messages`` (from +``_langgraph_messages.py``) inside a Temporal activity. That module is not +yet unified onto the harness surface (it has its own Redis-streaming code). + +This test file verifies the LangGraph Temporal agent's streaming behavior using +the same fake streaming infrastructure as test_harness_langgraph_async.py. The +key difference from the non-temporal async path is that in Temporal, each agent +turn runs inside a Temporal activity that has already been handed the task_id +and a pre-wired streaming client — so the ``UnifiedEmitter.auto_send_turn`` +path is identical. The graph activities and workflow scaffolding are not tested +here; that requires a running Temporal cluster. + +What is tested +-------------- +- stream_langgraph_events (the public async API used by temporal agent acp.py via + the workflow activity) produces the same result via the unified surface. +- Usage from AIMessage.usage_metadata is captured in TurnResult.usage. +- The auto_send_turn path for a temporal-style call (same as async). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Temporal workflow execution (requires a running Temporal cluster). +- The Temporal activity retry/compensation logic. +- LangGraph checkpoint storage via TemporalCheckpointer. +- emit_langgraph_messages (the Temporal-specific streaming helper). +- Real LLM calls or real LangGraph graph execution. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_async.py. +""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: + ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestTemporalAutoSendChannel: + async def test_stream_langgraph_events_plain_text(self, monkeypatch): + """stream_langgraph_events (used by temporal agents via the acp.py activity) returns + the accumulated final text.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + chunk = AIMessageChunk(content="Hello Temporal!") + ai_msg = AIMessage(content="Hello Temporal!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + assert final == "Hello Temporal!" + + async def test_stream_langgraph_events_tool_call(self, monkeypatch): + from langchain_core.messages import AIMessage, ToolMessage + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + tc = {"id": "c1", "name": "search", "args": {"q": "test"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk_final = AIMessage(content="Here are the results.") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("updates", {"agent": {"messages": [chunk_final]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + + # Check tool request and response posted to fake streaming + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_req_ctxs) == 1 + assert len(tool_resp_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.name == "search" + + async def test_langgraph_turn_auto_send_via_unified_emitter(self): + """Direct UnifiedEmitter.auto_send_turn path used by temporal agent workflow + activities. Uses a fake streaming backend (no Redis).""" + from langchain_core.messages import AIMessage, AIMessageChunk + + fake_streaming = _FakeStreaming() + chunk = AIMessageChunk(content="Temporal answer!") + ai_msg = AIMessage(content="Temporal answer!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + turn = LangGraphTurn(_make_stream(events), model=None) + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert result.final_text == "Temporal answer!" + text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 1 + + async def test_usage_captured_via_turn_after_events_consumed(self): + """Usage from AIMessage.usage_metadata is captured via the on_final_ai_message + callback during event iteration. The authoritative usage is on turn.usage() + after events are consumed (emitter.auto_send_turn evaluates turn.usage() + eagerly before iteration, so TurnResult.usage is a pre-iteration snapshot).""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 20, "output_tokens": 10, "total_tokens": 30} + ai_msg = AIMessage(content="answer", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 20 + assert usage.output_tokens == 10 + assert usage.total_tokens == 30 + + async def test_empty_stream_returns_empty_string(self, monkeypatch): + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + final = await stream_langgraph_events(_make_stream([]), "task-1") + assert final == "" + assert fake_streaming.contexts == [] diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_async.py b/tests/lib/core/harness/test_harness_pydantic_ai_async.py new file mode 100644 index 000000000..8bda7d020 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_async.py @@ -0,0 +1,361 @@ +"""Integration test: async (Redis-streaming) channel with a pydantic-ai agent. + +Exercises the unified harness surface (UnifiedEmitter.auto_send_turn + PydanticAITurn) +with a minimal pydantic-ai agent backed by TestModel so the test runs fully +offline (no API keys, no Redis, no Agentex server). + +Agent description +----------------- +Same single-tool agent as the sync test: ``get_weather(city: str) -> str`` +returning "sunny and 72F". TestModel is configured to call the tool once then +produce a fixed text reply. + +The async path uses the bare PydanticAITurn (no coalescing): the foundation +auto_send delivers streamed tool-request Start+ToolRequestDelta+Done messages +natively (AGX1-377 fix), so no coalescing wrapper is needed. + +What is tested +-------------- +- The async handler pushes the correct sequence of messages to the fake streaming + backend: tool_request + tool_response + text (in that order). +- final_text equals the TestModel custom output. +- With a SpanTracer, tool spans are derived and forwarded to the fake tracing + backend (streamed tool-request delivery now triggers span derivation on the + async path). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Redis streaming (requires a running Redis instance). +- The ACP on_task_event_send / on_task_create / on_task_cancel lifecycle. +- Multi-turn history persistence via adk.state. +- Real LLM calls or production model behaviour. +- The full FastACP async request lifecycle. + +See also: test_harness_pydantic_ai_sync.py (span derivation with sync path) and +test_harness_pydantic_ai_temporal.py (temporal activity path). +""" + +from __future__ import annotations + +from typing import Any + +import pytest +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel + +from agentex.types.task_message import TaskMessage +from agentex.lib.core.harness.types import TurnResult +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Minimal agent under test +# --------------------------------------------------------------------------- + + +def _make_agent() -> Agent: + """Build a pydantic-ai agent with one weather tool and a TestModel.""" + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + agent: Agent = Agent(model) + + @agent.tool_plain + def get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return agent + + +# --------------------------------------------------------------------------- +# Fake streaming backend (replaces adk.streaming; no Redis required) +# --------------------------------------------------------------------------- + + +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake.""" + + def __init__(self, sink: list[Any], ctype: str, initial_content: Any) -> None: + self.sink = sink + self.ctype = ctype + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.ctype, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.ctype)) + + async def stream_update(self, update: Any) -> Any: + self.sink.append(("delta", self.ctype, update)) + return update + + +class _FakeStreaming: + """Fake streaming backend; records every context lifecycle event.""" + + def __init__(self) -> None: + self.sink: list[Any] = [] + self.messages_opened: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.messages_opened.append(initial_content) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, str | None]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _run_auto_send_turn( + agent: Agent, + user_msg: str = "What is the weather in Paris?", + trace_id: str | None = None, + parent_span_id: str | None = None, + fake_tracing: _FakeTracing | None = None, +) -> tuple[TurnResult, _FakeStreaming]: + """Drive the async (auto_send) path and return the TurnResult + fake streaming state.""" + fake_streaming = _FakeStreaming() + + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn( + stream, + model="test", + ) + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=tracer if tracer is not None else False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + return result, fake_streaming + + +# --------------------------------------------------------------------------- +# Tests: message order and content +# --------------------------------------------------------------------------- + + +class TestAsyncAutoSendMessageOrder: + """auto_send pushes messages to the streaming backend in canonical order.""" + + async def test_tool_request_pushed_first(self) -> None: + """tool_request is the first message type pushed to the streaming backend.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in message_types + assert message_types.index("tool_request") < message_types.index("tool_response"), ( + "tool_request must be pushed before tool_response" + ) + + async def test_tool_response_pushed_after_tool_request(self) -> None: + """tool_response appears after tool_request in the pushed messages.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_response" in message_types + + async def test_text_pushed_last(self) -> None: + """Text content is the last type pushed (after tool round-trip).""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert message_types[-1] == "text", f"Expected last message type=text, got {message_types}" + + async def test_exactly_three_messages(self) -> None: + """Exactly three message contexts are opened: tool_request, tool_response, text.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + assert len(fake_streaming.messages_opened) == 3, ( + f"Expected 3 messages (tool_request + tool_response + text), " + f"got {len(fake_streaming.messages_opened)}: " + f"{[getattr(m, 'type', None) for m in fake_streaming.messages_opened]}" + ) + + +class TestAsyncAutoSendContentVerification: + """The content pushed to the streaming backend is correct.""" + + async def test_tool_request_content(self) -> None: + """The pushed tool_request is a ToolRequestContent for get_weather.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_reqs = [m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)] + assert len(tool_reqs) == 1, "Expected exactly one ToolRequestContent" + assert tool_reqs[0].name == "get_weather" + + async def test_tool_response_content(self) -> None: + """The pushed tool_response is a ToolResponseContent containing the weather result.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_resps = [m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)] + assert len(tool_resps) == 1, "Expected exactly one ToolResponseContent" + assert isinstance(tool_resps[0].content, str) + assert "72F" in tool_resps[0].content + assert tool_resps[0].name == "get_weather" + + async def test_tool_call_ids_match(self) -> None: + """tool_request and tool_response have the same tool_call_id.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_req = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)) + tool_resp = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)) + assert tool_req.tool_call_id == tool_resp.tool_call_id, ( + "tool_request and tool_response must share the same tool_call_id" + ) + + +class TestAsyncAutoSendFinalText: + """auto_send_turn returns the accumulated text from the last text part.""" + + async def test_final_text_matches_model_output(self) -> None: + """TurnResult.final_text equals the TestModel custom_output_text.""" + agent = _make_agent() + result, _ = await _run_auto_send_turn(agent) + assert result.final_text == "The weather in Paris is sunny and 72F." + + async def test_turn_result_has_usage(self) -> None: + """TurnResult carries a TurnUsage object (may have None tokens from TestModel).""" + agent = _make_agent() + result, _ = await _run_auto_send_turn(agent) + assert result.usage is not None + + async def test_context_lifecycle_open_then_close(self) -> None: + """Every message context is opened then closed (no leak).""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + opens = [e for e in fake_streaming.sink if e[0] == "open"] + closes = [e for e in fake_streaming.sink if e[0] == "close"] + assert len(opens) == len(closes) == 3, "Each of the 3 messages must have exactly one open and one close" + + +class TestAsyncAutoSendSpanDerivation: + """Span derivation on the async path now works for streamed tool requests. + + The foundation auto_send delivers Start+ToolRequestDelta+Done natively + (AGX1-377 fix). The SpanDeriver opens a tool span on Done(tool_request), + so the async path now derives spans just like the sync path. + """ + + async def test_tool_span_derived_on_async_path(self) -> None: + """With the bare PydanticAITurn (no coalescing), a tool span is derived + on the async/auto_send path when auto_send delivers the streamed + Start+ToolRequestDelta+Done sequence.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent", + task_id="task1", + tracing=fake_tracing, + ) + fake_streaming = _FakeStreaming() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent", + tracer=tracer, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + + assert len(fake_tracing.started) == 1, ( + "Expected one tool span to be started for the get_weather call." + ) + assert fake_tracing.started[0][0] == "get_weather" + assert len(fake_tracing.ended) == 1 + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_async_handler_pushes_messages_for_various_inputs(user_msg: str) -> None: + """auto_send pushes at least tool_request + tool_response + text for any input.""" + agent = _make_agent() + result, fake_streaming = await _run_auto_send_turn(agent, user_msg=user_msg) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in message_types + assert "tool_response" in message_types + assert "text" in message_types + assert isinstance(result.final_text, str) + assert len(result.final_text) > 0 diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_sync.py b/tests/lib/core/harness/test_harness_pydantic_ai_sync.py new file mode 100644 index 000000000..1557d0dd1 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_sync.py @@ -0,0 +1,388 @@ +"""Integration test: sync (HTTP-yield) channel with a pydantic-ai agent. + +Exercises the unified harness surface (UnifiedEmitter.yield_turn + PydanticAITurn) +with a minimal pydantic-ai agent backed by TestModel so the test runs fully +offline (no API keys, no live infrastructure). + +Agent description +----------------- +A single-tool agent with ``get_weather(city: str) -> str`` that always returns +"sunny and 72F". TestModel is configured to call that tool once then produce +a fixed text reply, giving a deterministic event sequence. + +What is tested +-------------- +- The sync handler correctly yields StreamTaskMessage* events in order: + tool_request (Start+Done) then tool_response (Full) then text (Start+Delta+Done). +- Final accumulated text equals the TestModel custom output. +- With a trace_id + fake tracing, a tool span is opened (OpenSpan) and + closed (CloseSpan) — proving the SpanDeriver is wired on the yield path. + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual HTTP streaming over the ACP sync endpoint (requires a running + Agentex server + deployed agent). +- Real LLM calls or production model behaviour. +- The full FastACP request/response lifecycle. + +See also: tests/lib/core/harness/test_harness_pydantic_ai_async.py and +test_harness_pydantic_ai_temporal.py for the other two channels. +""" + +from __future__ import annotations + +from typing import Any, override + +import pytest +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel + +from agentex.types.text_delta import TextDelta +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Minimal agent under test +# --------------------------------------------------------------------------- + + +def _make_agent() -> Agent: + """Build a pydantic-ai agent with one weather tool and a TestModel. + + TestModel is instantiated with call_tools=['get_weather'] so it always + invokes the tool once, then emits custom_output_text as the reply. + """ + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + agent: Agent = Agent(model) + + @agent.tool_plain + def get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return agent + + +# --------------------------------------------------------------------------- +# Fake tracing backend (no network calls) +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, str | None]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _run_yield_turn( + agent: Agent, + user_msg: str = "What is the weather in Paris?", + trace_id: str | None = None, + parent_span_id: str | None = None, + fake_tracing: _FakeTracing | None = None, +) -> list[Any]: + """Drive the sync (yield) path and collect all yielded events.""" + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id="task1", + tracing=fake_tracing, + ) + + events: list[Any] = [] + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=tracer if tracer is not None else False, + ) + events = [ev async for ev in emitter.yield_turn(turn)] + return events + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestSyncYieldEventOrder: + """The yield channel forwards events in canonical order.""" + + async def test_tool_request_precedes_tool_response(self) -> None: + """tool_request events appear before the tool_response Full event.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + content_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, (StreamTaskMessageStart, StreamTaskMessageFull)) + ] + assert "tool_request" in content_types + assert "tool_response" in content_types + tool_req_idx = content_types.index("tool_request") + tool_resp_idx = content_types.index("tool_response") + assert tool_req_idx < tool_resp_idx, "tool_request must appear before tool_response in the event stream" + + async def test_text_appears_after_tool_response(self) -> None: + """Text content (Start/Done) comes after the tool_response Full event.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + full_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, StreamTaskMessageFull) + ] + start_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, StreamTaskMessageStart) + ] + + assert "tool_response" in full_types + assert "text" in start_types + + tool_resp_pos = next( + i + for i, ev in enumerate(events) + if isinstance(ev, StreamTaskMessageFull) + and getattr(getattr(ev, "content", None), "type", None) == "tool_response" + ) + text_start_pos = next( + i + for i, ev in enumerate(events) + if isinstance(ev, StreamTaskMessageStart) and getattr(getattr(ev, "content", None), "type", None) == "text" + ) + assert tool_resp_pos < text_start_pos + + async def test_tool_response_carries_weather_result(self) -> None: + """The ToolResponseContent contains the get_weather return value.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + full_events = [ + ev + for ev in events + if isinstance(ev, StreamTaskMessageFull) and isinstance(getattr(ev, "content", None), ToolResponseContent) + ] + assert len(full_events) >= 1, "Expected at least one tool_response Full event" + tool_response = full_events[0].content + assert isinstance(tool_response, ToolResponseContent) + assert isinstance(tool_response.content, str) + assert "72F" in tool_response.content + assert tool_response.name == "get_weather" + + async def test_accumulated_text_matches_model_output(self) -> None: + """Accumulated text deltas equal the TestModel custom_output_text.""" + from agentex.types.task_message_update import StreamTaskMessageDelta + + agent = _make_agent() + events = await _run_yield_turn(agent) + + accumulated = "".join( + ev.delta.text_delta + for ev in events + if isinstance(ev, StreamTaskMessageDelta) and isinstance(ev.delta, TextDelta) and ev.delta.text_delta + ) + assert accumulated == "The weather in Paris is sunny and 72F." + + async def test_every_start_has_matching_done(self) -> None: + """Every StreamTaskMessageStart has a corresponding StreamTaskMessageDone.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + starts = {ev.index for ev in events if isinstance(ev, StreamTaskMessageStart)} + dones = {ev.index for ev in events if isinstance(ev, StreamTaskMessageDone)} + assert starts == dones, f"Unmatched Start/Done indices: starts={starts} dones={dones}" + + +class TestSyncYieldSpanDerivation: + """SpanDeriver is wired on the yield path; tool spans are opened/closed.""" + + async def test_tool_span_opened_and_closed(self) -> None: + """One tool span is opened and closed per tool call.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent-span", + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=tracer, + ) + await emitter.yield_turn(turn).__anext__.__self__ if False else None + [_ async for _ in emitter.yield_turn(turn)] + + assert len(fake_tracing.started) == 1, "Expected exactly one tool span opened" + assert len(fake_tracing.ended) == 1, "Expected exactly one tool span closed" + span_name, parent_id = fake_tracing.started[0] + assert span_name == "get_weather" + assert parent_id == "parent-span" + + async def test_tool_span_output_is_tool_result(self) -> None: + """The closed tool span's output equals the tool's return value.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent-span", + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=tracer, + ) + [_ async for _ in emitter.yield_turn(turn)] + + name, output = fake_tracing.ended[0] + assert name == "get_weather" + assert output is not None + assert "72F" in str(output) + + async def test_no_trace_id_means_no_spans(self) -> None: + """With trace_id=None, no spans are derived (emitter disables tracing).""" + agent = _make_agent() + fake_tracing = _FakeTracing() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id=None, + parent_span_id=None, + tracing=fake_tracing, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert fake_tracing.started == [] + assert fake_tracing.ended == [] + + async def test_tracer_false_suppresses_spans(self) -> None: + """tracer=False disables span derivation regardless of trace_id.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=False, + tracing=fake_tracing, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert fake_tracing.started == [] + assert fake_tracing.ended == [] + + async def test_span_signal_types(self) -> None: + """The signals received by the tracer are OpenSpan then CloseSpan.""" + from agentex.lib.core.harness.tracer import SpanTracer as RealTracer + + received_signals: list[Any] = [] + + class _RecordingTracer(RealTracer): + @override + async def handle(self, signal: Any) -> None: + received_signals.append(signal) + await super().handle(signal) + + fake_tracing = _FakeTracing() + tracer = _RecordingTracer( + trace_id="trace1", + parent_span_id="parent", + task_id="task1", + tracing=fake_tracing, + ) + + agent = _make_agent() + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent", + tracer=tracer, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert len(received_signals) == 2 + assert isinstance(received_signals[0], OpenSpan) + assert isinstance(received_signals[1], CloseSpan) + assert received_signals[0].name == "get_weather" + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_sync_handler_produces_events_for_various_inputs(user_msg: str) -> None: + """Yield path produces at least a tool_response Full for any user message.""" + agent = _make_agent() + events = await _run_yield_turn(agent, user_msg=user_msg) + + full_event_types = [ + getattr(getattr(ev, "content", None), "type", None) for ev in events if isinstance(ev, StreamTaskMessageFull) + ] + assert "tool_response" in full_event_types diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py b/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py new file mode 100644 index 000000000..0ead8e832 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py @@ -0,0 +1,370 @@ +"""Integration test: Temporal-backed pydantic-ai agent, offline. + +Exercises the core of the Temporal pydantic-ai harness path — the +event_stream_handler activity — with a TemporalAgent backed by TestModel so the +test runs fully offline (no Temporal server, no Redis, no API keys). + +Architecture overview +--------------------- +In a real Temporal deployment the pydantic-ai Temporal harness runs like this: + + HTTP POST /task/event/send + -> @workflow.signal on At110PydanticAiWorkflow + -> temporal_agent.run(user_message, deps=TaskDeps(...)) + internally schedules: + 1. request_activity (LLM HTTP call — recorded by Temporal) + 2. call_tool_activity (for each tool call — also recorded) + 3. event_stream_handler_activity (streams events to Redis) + +The third activity is what we test here: it receives a +``RunContext[TaskDeps]`` and an ``AsyncIterable[AgentStreamEvent]`` from +pydantic-ai, calls ``stream_pydantic_ai_events`` (which internally constructs +a ``UnifiedEmitter`` + ``PydanticAITurn`` and calls ``auto_send_turn``), and +pushes the resulting messages to Redis. + +What we test +----------- +Since ``TemporalAgent.run_stream_events`` works offline with TestModel (it does +not schedule Temporal activities — it runs in-process), we can: + +1. Build a TemporalAgent with TestModel. +2. Call ``run_stream_events`` on it directly, just as the event_stream_handler + would see the event iterable. +3. Feed that stream into ``stream_pydantic_ai_events`` backed by a fake streaming + backend, and assert the canonical message sequence. + +This covers the full inner harness chain that the Temporal workflow exercises, +minus the Temporal scheduling/durability layer itself. + +What is NOT covered without live infrastructure +----------------------------------------------- +- Temporal scheduling (the workflow.signal -> activity dispatch chain). +- Temporal durability guarantees and replay behaviour. +- Redis streaming (requires a running Redis instance). +- Multi-turn history (pydantic-ai message_history round-tripping via Temporal + workflow state). +- Real LLM calls or production model behaviour. +- The full temporal_agent.run(...) path, which schedules activities and cannot + run without a connected Temporal client. + +To test with live infrastructure: spin up Temporal + Redis + the ACP server + +the Temporal worker, then use the AsyncAgentex client to create a task, send a +message, and poll for messages — exactly as the existing examples/tutorials/ +10_async/10_temporal/110_pydantic_ai/tests/test_agent.py does. +""" + +from __future__ import annotations + +from typing import Any + +import pytest +from pydantic import BaseModel +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel +from pydantic_ai.durable_exec.temporal import TemporalAgent + +from agentex.types.task_message import TaskMessage +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Agent under test (mirrors examples/tutorials/10_async/10_temporal/110_pydantic_ai) +# --------------------------------------------------------------------------- + + +class TaskDeps(BaseModel): + """Per-run dependencies injected via RunContext.deps.""" + + task_id: str + parent_span_id: str | None = None + + +def _make_temporal_agent() -> TemporalAgent[TaskDeps, str]: + """Build a TemporalAgent with TestModel and one weather tool. + + The underlying pydantic-ai Agent is constructed with TaskDeps as the + deps_type, mirroring the real temporal tutorial agent. TestModel makes + the run deterministic and offline. + """ + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + base: Agent[TaskDeps, str] = Agent(model, deps_type=TaskDeps) + + @base.tool_plain + def get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return TemporalAgent(base, name="test_temporal_agent") + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink: list[Any], ctype: str, initial_content: Any) -> None: + self.sink = sink + self.ctype = ctype + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.ctype, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.ctype)) + + async def stream_update(self, update: Any) -> Any: + self.sink.append(("delta", self.ctype, update)) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.sink: list[Any] = [] + self.messages_opened: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.messages_opened.append(initial_content) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers: the event_stream_handler pattern tested offline +# --------------------------------------------------------------------------- + + +async def _run_event_stream_handler( + temporal_agent: TemporalAgent[TaskDeps, str], + user_msg: str = "What is the weather in Paris?", + task_id: str = "task1", +) -> _FakeStreaming: + """Simulate the event_stream_handler activity offline. + + In production the event_stream_handler receives the event stream from + pydantic-ai's model activity and calls stream_pydantic_ai_events. + Here we obtain the stream directly from run_stream_events (which works + offline with TestModel) and forward it to stream_pydantic_ai_events backed + by a fake streaming backend. + + This is equivalent to: + async def event_handler(ctx: RunContext[TaskDeps], events: AsyncIterable[AgentStreamEvent]) -> None: + await stream_pydantic_ai_events(events, ctx.deps.task_id) + but without requiring a running Temporal server. + """ + fake_streaming = _FakeStreaming() + + async with temporal_agent.run_stream_events(user_msg) as stream: + await _fake_stream_pydantic_ai_events(stream, task_id, fake_streaming) + + return fake_streaming + + +async def _fake_stream_pydantic_ai_events( + stream: Any, + task_id: str, + fake_streaming: _FakeStreaming, +) -> str: + """Like stream_pydantic_ai_events but uses an injected fake streaming backend. + + Mirrors the exact chain that stream_pydantic_ai_events uses internally: + PydanticAITurn(stream) + + UnifiedEmitter.auto_send_turn(turn) + but with the fake backend injected so no Redis is needed. + """ + turn = PydanticAITurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result.final_text + + +# --------------------------------------------------------------------------- +# Tests: TemporalAgent + event_stream_handler pattern +# --------------------------------------------------------------------------- + + +class TestTemporalEventStreamHandlerMessageOrder: + """The event_stream_handler pushes messages in canonical order.""" + + async def test_tool_request_before_tool_response(self) -> None: + """tool_request is pushed before tool_response.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in types + assert "tool_response" in types + assert types.index("tool_request") < types.index("tool_response") + + async def test_text_is_last(self) -> None: + """Text content is pushed last (after the tool round-trip).""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert types[-1] == "text" + + async def test_exactly_three_messages(self) -> None: + """Exactly tool_request + tool_response + text are pushed.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + assert len(fake_streaming.messages_opened) == 3, ( + f"Expected 3 messages, got {len(fake_streaming.messages_opened)}: " + f"{[getattr(m, 'type', None) for m in fake_streaming.messages_opened]}" + ) + + +class TestTemporalEventStreamHandlerContent: + """Content verification for the messages pushed by the event_stream_handler.""" + + async def test_tool_request_is_get_weather(self) -> None: + """The pushed tool_request is for the get_weather function.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_reqs = [m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)] + assert len(tool_reqs) == 1 + assert tool_reqs[0].name == "get_weather" + + async def test_tool_response_contains_weather_result(self) -> None: + """The pushed tool_response contains the get_weather return value.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_resps = [m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)] + assert len(tool_resps) == 1 + assert isinstance(tool_resps[0].content, str) + assert "72F" in tool_resps[0].content + assert tool_resps[0].name == "get_weather" + + async def test_tool_call_ids_match(self) -> None: + """tool_request and tool_response share the same tool_call_id.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_req = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)) + tool_resp = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)) + assert tool_req.tool_call_id == tool_resp.tool_call_id + + +class TestTemporalFinalText: + """stream_pydantic_ai_events returns the correct final text.""" + + async def test_final_text_matches_model_output(self) -> None: + """The returned final text equals the TestModel custom_output_text.""" + temporal_agent = _make_temporal_agent() + fake_streaming = _FakeStreaming() + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + final = await _fake_stream_pydantic_ai_events(stream, "task1", fake_streaming) + + assert final == "The weather in Paris is sunny and 72F." + + async def test_context_lifecycle_complete(self) -> None: + """Every opened streaming context is also closed.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + opens = [e for e in fake_streaming.sink if e[0] == "open"] + closes = [e for e in fake_streaming.sink if e[0] == "close"] + assert len(opens) == len(closes), "Every opened context must be closed" + + +class TestTemporalAgentStreamEventsOffline: + """TemporalAgent.run_stream_events produces the expected raw pydantic-ai events. + + This verifies that the TemporalAgent wrapper does not suppress event stream + delivery when used with TestModel, so the event_stream_handler pattern is + meaningful offline. + """ + + async def test_run_stream_events_yields_tool_call_and_text(self) -> None: + """TemporalAgent.run_stream_events with TestModel yields tool + text events.""" + + temporal_agent = _make_temporal_agent() + collected: list[Any] = [] + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + async for ev in stream: + collected.append(ev) + + event_types = {type(ev).__name__ for ev in collected} + assert "FunctionToolResultEvent" in event_types, "Expected FunctionToolResultEvent proving tool call ran" + assert "PartDeltaEvent" in event_types or "PartEndEvent" in event_types, ( + "Expected text part events in the stream" + ) + + async def test_run_stream_events_contains_tool_result(self) -> None: + """The raw event stream contains a FunctionToolResultEvent with the tool output.""" + from pydantic_ai.messages import FunctionToolResultEvent + + temporal_agent = _make_temporal_agent() + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + events = [ev async for ev in stream] + + tool_results = [ev for ev in events if isinstance(ev, FunctionToolResultEvent)] + assert len(tool_results) >= 1 + assert isinstance(tool_results[0].part.content, str) + assert "72F" in tool_results[0].part.content + + +class TestTemporalLiveInfraNote: + """Placeholder tests documenting what requires live Temporal infrastructure. + + These tests are skipped by design. They document the gap between what the + offline tests cover and what a full integration test would exercise. + """ + + @pytest.mark.skip( + reason=( + "Requires live Temporal server + Redis + ACP server + worker. " + "See examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py " + "for the live integration test that exercises this path end-to-end." + ) + ) + async def test_temporal_workflow_full_round_trip(self) -> None: + """Full Temporal workflow: create_task -> send_event -> poll_messages.""" + pass # Covered by the live tutorial test + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_temporal_handler_pushes_messages_for_various_inputs(user_msg: str) -> None: + """event_stream_handler pushes tool_request + tool_response + text for any input.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent, user_msg=user_msg) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in types + assert "tool_response" in types + assert "text" in types diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py new file mode 100644 index 000000000..51e2ede2c --- /dev/null +++ b/tests/lib/core/harness/test_span_derivation.py @@ -0,0 +1,286 @@ +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +def _signals(deriver, events): + out = [] + for e in events: + out.extend(deriver.observe(e)) + out.extend(deriver.flush()) + return out + + +def _tool_req(idx, tcid, name, args): + return StreamTaskMessageStart( + type="start", + index=idx, + content=ToolRequestContent(type="tool_request", author="agent", tool_call_id=tcid, name=name, arguments=args), + ) + + +def test_text_only_yields_no_spans(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=None), + StreamTaskMessageDone(type="done", index=0), + ] + assert _signals(d, events) == [] + + +def test_single_tool_opens_on_done_closes_on_response(): + d = SpanDeriver() + events = [ + _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="call_1", name="Bash", content="files" + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}), + CloseSpan(key="call_1", output="files", is_complete=True), + ] + # No status reported -> CloseSpan carries is_error=None. + assert sigs[1].is_error is None + + +def test_tool_response_is_error_propagates_to_close_span(): + """ToolResponseContent.is_error flows onto the CloseSpan so a derived tool + span can be marked as a failure (AGX1-371).""" + d = SpanDeriver() + events = [ + _tool_req(0, "call_err", "Bash", {"cmd": "false"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_err", + name="Bash", + content="boom", + is_error=True, + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_err", kind="tool", name="Bash", input={"cmd": "false"}), + CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True), + ] + + +def test_reasoning_opens_on_start_closes_on_done(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", index=0, content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[]) + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="reasoning:0", kind="reasoning", name="reasoning", input={}) + assert sigs[1] == CloseSpan(key="reasoning:0", output=None, is_complete=True) + + +def test_parallel_tools_pair_by_tool_call_id(): + d = SpanDeriver() + events = [ + _tool_req(0, "a", "T1", {}), + _tool_req(1, "b", "T2", {}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="b", name="T2", content="rb" + ), + ), + StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="a", name="T1", content="ra" + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + closes = [s for s in sigs if isinstance(s, CloseSpan)] + assert {o.key for o in opens} == {"a", "b"} + assert [c.key for c in closes] == ["b", "a"] + assert all(c.is_complete for c in closes) + + +def test_streamed_args_accumulate_into_open_input(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='{"cmd":'), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='"ls"}'), + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="c", kind="tool", name="Bash", input={"cmd": "ls"}) + + +def test_unclosed_tool_closed_incomplete_on_flush(): + d = SpanDeriver() + events = [ + _tool_req(0, "x", "Bash", {}), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) + assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) + + +def test_none_index_is_skipped(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=None, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="n", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=None), + ] + assert _signals(d, events) == [] + + +def test_orphan_tool_response_ignored(): + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="z", name="Bash", content="r" + ), + ), + ] + assert _signals(d, events) == [] + + +def test_full_tool_request_opens_span(): + """Full(ToolRequestContent) must open a tool span (for LangGraph-style harnesses).""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_x", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="call_x", kind="tool", name="Bash", input={"cmd": "ls"}) + assert sigs[1] == CloseSpan(key="call_x", output=None, is_complete=False) + + +def test_full_tool_request_and_response_paired(): + """Full(ToolRequestContent) + Full(ToolResponseContent) produces a complete span pair.""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_y", + name="Grep", + arguments={}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_y", + name="Grep", + content="result", + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_y", kind="tool", name="Grep", input={}), + CloseSpan(key="call_y", output="result", is_complete=True), + ] + + +def test_full_tool_request_does_not_double_open(): + """A Full(ToolRequestContent) for an already-open tool_call_id is a no-op.""" + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + assert len(opens) == 1 + assert opens[0].key == "call_z" diff --git a/tests/lib/core/harness/test_tracer.py b/tests/lib/core/harness/test_tracer.py new file mode 100644 index 000000000..b3d9002c4 --- /dev/null +++ b/tests/lib/core/harness/test_tracer.py @@ -0,0 +1,73 @@ +from typing import override + +import pytest + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer + +from ._fakes import FakeTracing + + +@pytest.mark.asyncio +async def test_open_then_close_starts_and_ends_span(): + fake = FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.started == [("Bash", "p1", {"cmd": "ls"})] + assert fake.ended == [("Bash", "files")] + + +@pytest.mark.asyncio +async def test_close_records_is_error_on_span_data(): + """A CloseSpan carrying is_error records the status on span.data (AGX1-371).""" + fake = FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_err", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True)) + assert fake.ended_spans[0].data == {"is_error": True} + + +@pytest.mark.asyncio +async def test_close_without_status_leaves_span_data_untouched(): + """is_error=None (no status reported) must not write to span.data.""" + fake = FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.ended_spans[0].data is None + + +@pytest.mark.asyncio +async def test_no_trace_id_is_noop(): + fake = FakeTracing() + tracer = SpanTracer(trace_id="", parent_span_id=None, tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert fake.started == [] and fake.ended == [] + + +@pytest.mark.asyncio +async def test_tracing_failure_is_swallowed(): + class _Boom(FakeTracing): + @override + async def start_span(self, **kw): + raise RuntimeError("backend down") + + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=_Boom()) + # Must not raise. + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert tracer._open == {} + + +@pytest.mark.asyncio +async def test_duplicate_open_replaces_silently(): + fake = FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="A")) + await tracer.handle(OpenSpan(key="k", kind="tool", name="B")) + await tracer.handle(CloseSpan(key="k")) + # Both opens started spans, but only the second ("B") is closed. + assert [name for name, _, _ in fake.started] == ["A", "B"] + assert fake.ended == [("B", None)] diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py new file mode 100644 index 000000000..68bc89ce2 --- /dev/null +++ b/tests/lib/core/harness/test_types.py @@ -0,0 +1,53 @@ +from typing import AsyncIterator + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) + + +def test_open_close_span_construct(): + o = OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}) + c = CloseSpan(key="call_1", output="files", is_complete=True) + assert o.key == c.key == "call_1" + assert o.kind == "tool" + assert c.is_complete is True + + +def test_turn_usage_defaults_are_none(): + u = TurnUsage(model="claude-opus-4-6") + assert u.model == "claude-opus-4-6" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +def test_turn_result_wraps_usage(): + r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) + assert r.final_text == "hi" + assert r.usage.model == "m" + + +def test_close_span_defaults(): + c = CloseSpan(key="x") + assert c.output is None + assert c.is_complete is True + + +def test_harness_turn_runtime_check(): + class _Turn: + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + async def _gen() -> AsyncIterator[StreamTaskMessage]: + if False: + yield # pragma: no cover + + return _gen() + + def usage(self) -> TurnUsage: + return TurnUsage(model="m") + + assert isinstance(_Turn(), HarnessTurn) is True diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py new file mode 100644 index 000000000..ef3861a16 --- /dev/null +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -0,0 +1,77 @@ +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.yield_delivery import yield_events + +from ._fakes import FakeTracing + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_passes_events_through_and_traces(): + fake = FakeTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ] + out = [e async for e in yield_events(_gen(events), tracer=tracer)] + assert out == events # passthrough unchanged + assert fake.started_names == ["Bash"] # span derived + opened + assert fake.ended_outputs == ["ok"] # span closed with response + + +@pytest.mark.asyncio +async def test_yield_without_tracer_is_pure_passthrough(): + events = [ + StreamTaskMessageDone(type="done", index=0), + ] + out = [e async for e in yield_events(_gen(events), tracer=None)] + assert out == events + + +@pytest.mark.asyncio +async def test_flush_runs_on_early_close(): + fake = FakeTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + # response intentionally never arrives + ] + gen = yield_events(_gen(events), tracer=tracer) + first = await gen.__anext__() # Start + second = await gen.__anext__() # Done -> tool span opens here + await gen.aclose() # triggers the finally -> flush() + assert fake.started_names == ["Bash"] + assert fake.ended_outputs == [None] # flush closed the unpaired span (incomplete, no output) diff --git a/tests/lib/core/tracing/processors/test_agentex_tracing_processor.py b/tests/lib/core/tracing/processors/test_agentex_tracing_processor.py index ec1ed5e88..84f37b495 100644 --- a/tests/lib/core/tracing/processors/test_agentex_tracing_processor.py +++ b/tests/lib/core/tracing/processors/test_agentex_tracing_processor.py @@ -2,7 +2,8 @@ import asyncio import weakref -from unittest.mock import MagicMock, patch +from datetime import datetime, timezone +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -24,11 +25,163 @@ MODULE = "agentex.lib.core.tracing.processors.agentex_tracing_processor" +SKIP_ENV = "AGENTEX_TRACING_SKIP_AGENTEX_SPAN_START" + + def _make_config() -> MagicMock: """Empty config — AgentexTracingProcessorConfig is unused by __init__.""" return MagicMock() +def _make_span(): + from agentex.types.span import Span + + now = datetime.now(timezone.utc) + return Span( + id="span-1", + trace_id="trace-1", + name="test-span", + start_time=now, + end_time=now, + input={"in": 1}, + output={"out": 2}, + ) + + +class TestAgentexSyncSkipSpanStart: + """The Agentex backend writes create-on-start + update-on-end by default. + End-only ingest (default) skips the start write and makes the END a single + create — verify the start is a no-op and end does an INSERT, not an UPDATE. + """ + + def test_start_skipped_and_end_creates_by_default(self, monkeypatch): + monkeypatch.delenv(SKIP_ENV, raising=False) # default ON + with patch(f"{MODULE}.Agentex") as MockAgentex: + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexSyncTracingProcessor, + ) + + processor = AgentexSyncTracingProcessor(_make_config()) + client = MockAgentex.return_value + span = _make_span() + + processor.on_span_start(span) + client.spans.create.assert_not_called() # start skipped + client.spans.update.assert_not_called() + + processor.on_span_end(span) + client.spans.create.assert_called_once() # single INSERT on end + client.spans.update.assert_not_called() # never a 404-prone UPDATE + + def test_start_creates_and_end_updates_when_skip_disabled(self, monkeypatch): + monkeypatch.setenv(SKIP_ENV, "0") + with patch(f"{MODULE}.Agentex") as MockAgentex: + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexSyncTracingProcessor, + ) + + processor = AgentexSyncTracingProcessor(_make_config()) + client = MockAgentex.return_value + span = _make_span() + + processor.on_span_start(span) + client.spans.create.assert_called_once() # start write restored + + processor.on_span_end(span) + client.spans.update.assert_called_once() # end is the UPDATE + + def test_skip_decision_captured_at_init_not_per_call(self, monkeypatch): + """The two halves of a span MUST use the same skip decision. A flag + toggled after construction must not split it (start-skip + end-update + would 404). The decision is captured once at init. + """ + monkeypatch.delenv(SKIP_ENV, raising=False) # construct with skip ON + with patch(f"{MODULE}.Agentex") as MockAgentex: + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexSyncTracingProcessor, + ) + + processor = AgentexSyncTracingProcessor(_make_config()) + client = MockAgentex.return_value + span = _make_span() + + processor.on_span_start(span) # skipped (cached ON) + monkeypatch.setenv(SKIP_ENV, "0") # toggle mid-span — must be ignored + processor.on_span_end(span) + + client.spans.create.assert_called_once() # still end-only INSERT + client.spans.update.assert_not_called() # NOT a 404-prone UPDATE + + +class TestAgentexAsyncSkipSpanStart: + async def test_start_skipped_and_end_creates_by_default(self, monkeypatch): + monkeypatch.delenv(SKIP_ENV, raising=False) # default ON + with patch(f"{MODULE}.create_async_agentex_client") as mock_factory: + client = MagicMock() + client.spans.create = AsyncMock() + client.spans.update = AsyncMock() + mock_factory.return_value = client + + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexAsyncTracingProcessor, + ) + + processor = AgentexAsyncTracingProcessor(_make_config()) + span = _make_span() + + await processor.on_span_start(span) + client.spans.create.assert_not_called() # start skipped + client.spans.update.assert_not_called() + + await processor.on_span_end(span) + client.spans.create.assert_awaited_once() # single INSERT on end + client.spans.update.assert_not_called() + + async def test_start_creates_and_end_updates_when_skip_disabled(self, monkeypatch): + monkeypatch.setenv(SKIP_ENV, "0") + with patch(f"{MODULE}.create_async_agentex_client") as mock_factory: + client = MagicMock() + client.spans.create = AsyncMock() + client.spans.update = AsyncMock() + mock_factory.return_value = client + + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexAsyncTracingProcessor, + ) + + processor = AgentexAsyncTracingProcessor(_make_config()) + span = _make_span() + + await processor.on_span_start(span) + client.spans.create.assert_awaited_once() # start write restored + + await processor.on_span_end(span) + client.spans.update.assert_awaited_once() # end is the UPDATE + + async def test_skip_decision_captured_at_init_not_per_call(self, monkeypatch): + """A flag toggled after construction must not split a span's lifecycle.""" + monkeypatch.delenv(SKIP_ENV, raising=False) # construct with skip ON + with patch(f"{MODULE}.create_async_agentex_client") as mock_factory: + client = MagicMock() + client.spans.create = AsyncMock() + client.spans.update = AsyncMock() + mock_factory.return_value = client + + from agentex.lib.core.tracing.processors.agentex_tracing_processor import ( + AgentexAsyncTracingProcessor, + ) + + processor = AgentexAsyncTracingProcessor(_make_config()) + span = _make_span() + + await processor.on_span_start(span) # skipped (cached ON) + monkeypatch.setenv(SKIP_ENV, "0") # toggle mid-span — must be ignored + await processor.on_span_end(span) + + client.spans.create.assert_awaited_once() # still end-only INSERT + client.spans.update.assert_not_called() # NOT a 404-prone UPDATE + + class TestAgentexAsyncTracingProcessor: """Coverage for the per-event-loop client cache. The SGP processor has matching tests; mirror them here so a regression in the Agentex side diff --git a/tests/lib/test_state_machine.py b/tests/lib/test_state_machine.py new file mode 100644 index 000000000..ce32ba9f0 --- /dev/null +++ b/tests/lib/test_state_machine.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import override +from unittest.mock import AsyncMock, patch + +from agentex.lib.sdk.state_machine import State, StateMachine, StateWorkflow +from agentex.lib.utils.model_utils import BaseModel + + +class ExampleData(BaseModel): + value: int = 0 + + +class InitialWorkflow(StateWorkflow): + transitions = ["next"] + + @override + async def execute(self, state_machine, state_machine_data=None): + return "next" + + +class NextWorkflow(StateWorkflow): + transitions = ["initial"] + + @override + async def execute(self, state_machine, state_machine_data=None): + return "initial" + + +class ExampleStateMachine(StateMachine[ExampleData]): + @override + async def terminal_condition(self): + return False + + +def _make_state_machine() -> ExampleStateMachine: + return ExampleStateMachine( + initial_state="initial", + states=[ + State(name="initial", workflow=InitialWorkflow()), + State(name="next", workflow=NextWorkflow()), + ], + task_id="task-123", + state_machine_data=ExampleData(value=1), + trace_transitions=True, + ) + + +async def test_reset_to_initial_state_skips_end_span_when_start_span_fails_open(): + state_machine = _make_state_machine() + await state_machine.transition("next") + + with patch( + "agentex.lib.sdk.state_machine.state_machine.adk.tracing.start_span", + new=AsyncMock(return_value=None), + ) as start_span, patch( + "agentex.lib.sdk.state_machine.state_machine.adk.tracing.end_span", + new=AsyncMock(), + ) as end_span: + await state_machine.reset_to_initial_state() + + assert state_machine.get_current_state() == "initial" + start_span.assert_awaited_once_with( + trace_id="task-123", + name="state_transition_reset", + input={"input_state": "next"}, + ) + end_span.assert_not_awaited() diff --git a/tests/lib/test_webhooks.py b/tests/lib/test_webhooks.py new file mode 100644 index 000000000..e42fac9dd --- /dev/null +++ b/tests/lib/test_webhooks.py @@ -0,0 +1,267 @@ +"""Unit tests for the SDK webhook helper (agentex.lib.sdk.utils.webhooks).""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest + +from agentex.lib import adk +from agentex.lib.sdk.utils.webhooks import ( + WebhookError, + session_key, + handle_webhook, + render_generic, + shape_github_pr, + resolve_remote_params, +) + + +def _pr_payload(**pr_overrides) -> dict: + pr = { + "number": 42, + "title": "Add retry to uploader", + "body": "Adds backoff on 503.", + "html_url": "https://example.com/acme/widgets/pull/42", + } + pr.update(pr_overrides) + return { + "action": "opened", + "repository": {"full_name": "acme/widgets"}, + "sender": {"login": "octocat"}, + "pull_request": pr, + } + + +class TestSessionKey: + def test_stable_and_folds_same_conversation(self): + a = session_key("agent-1", "github_pr", "acme/widgets#42") + b = session_key("agent-1", "github_pr", "acme/widgets#42") + assert a == b and a.startswith("wh-github_pr-") + + def test_differs_by_peer(self): + assert session_key("a", "github_pr", "r#1") != session_key("a", "github_pr", "r#2") + + +class TestShaping: + def test_render_generic_prefers_text_field(self): + assert render_generic({"text": "hello"}) == "hello" + + def test_render_generic_falls_back_to_json(self): + assert "zen" in render_generic({"zen": "be awesome"}) + + def test_render_generic_matches_keys_case_insensitively(self): + assert render_generic({"Message": "hi there"}) == "hi there" + + def test_render_generic_supports_broadened_keys(self): + assert render_generic({"description": "do the thing"}) == "do the thing" + + def test_github_pr_shape(self): + text, peer, sender = shape_github_pr(_pr_payload()) + assert "Pull request acme/widgets#42: Add retry to uploader" in text + assert "Action: opened" in text + assert "Adds backoff on 503." in text + assert peer == "acme/widgets#42" + assert sender == "octocat" + + def test_github_pr_includes_diff(self): + body = _pr_payload() + body["pull_request"]["diff"] = "diff --git a/x b/x\n+line" + text, _, _ = shape_github_pr(body) + assert "Diff:" in text and "+line" in text + + def test_non_pr_payload_falls_back_to_generic(self): + text, peer, _ = shape_github_pr({"zen": "be awesome", "hook_id": 1}) + assert "Pull request" not in text + assert "be awesome" in text + assert peer is None + + +class TestResolveRemoteParams: + async def test_envelope_with_params_and_metadata(self): + async def fetch(_url): + return {"params": {"system_prompt": "x", "model": "m"}, "task_metadata": {"cfg": "1"}} + + params, md = await resolve_remote_params("https://h/resolve", fetch=fetch) + assert params == {"system_prompt": "x", "model": "m"} + assert md == {"cfg": "1"} + + async def test_bare_object_is_params_minus_task_metadata(self): + async def fetch(_url): + return {"system_prompt": "x", "task_metadata": {"cfg": "1"}} + + params, md = await resolve_remote_params("https://h/resolve", fetch=fetch) + assert params == {"system_prompt": "x"} # task_metadata stripped from params + assert md == {"cfg": "1"} + + async def test_non_object_raises(self): + async def fetch(_url): + return ["nope"] + + with pytest.raises(WebhookError): + await resolve_remote_params("https://h/resolve", fetch=fetch) + + +def _agent_msg(text: str): + return SimpleNamespace(content=SimpleNamespace(author="agent", type="text", content=text)) + + +class TestHandleWebhook: + @pytest.fixture(autouse=True) + def _mock_adk(self, monkeypatch): + self.created = {} + self.sent = {} + self.stamped = {} + self.created_task_metadata = {} + + async def create_task(*, name, agent_name, params=None, request=None, **_): + self.created = {"name": name, "agent_name": agent_name, "params": params, "request": request} + return SimpleNamespace(id="task-1", task_metadata=self.created_task_metadata) + + async def send_message(*, task_id, agent_name, content, **_): + self.sent = {"task_id": task_id, "content": content} + return [_agent_msg("Looks good — ship it.")] + + async def update_task(*, task_id, task_metadata=None, **_): + self.stamped = {"task_id": task_id, "task_metadata": task_metadata} + return SimpleNamespace(id=task_id) + + send_event = AsyncMock() + monkeypatch.setattr(adk.acp, "create_task", create_task) + monkeypatch.setattr(adk.acp, "send_message", send_message) + monkeypatch.setattr(adk.acp, "send_event", send_event) + monkeypatch.setattr(adk.tasks, "update", update_task) + self.send_event = send_event + yield + + async def test_sync_github_pr_with_config_by_id(self): + async def fake_resolve(_url): + return {"params": {"system_prompt": "review"}, "task_metadata": {"agent_config_id": "cfg-9"}} + + result = await handle_webhook( + agent_name="golden-agent", + payload=_pr_payload(), + acp_type="sync", + shaper="github_pr", + params_source="https://h/v5/agent_configs/cfg-9/resolve", + fetch=fake_resolve, + ) + + assert result.reply == "Looks good — ship it." + assert self.created["params"] == {"system_prompt": "review"} + # metadata is returned on the result (SDK task/create can't carry it) + md = result.task_metadata + assert md["channel"] == "github_pr" + assert md["peer_id"] == "acme/widgets#42" + assert md["agent_config_id"] == "cfg-9" + # task folded on a stable session key + assert self.created["name"].startswith("wh-github_pr-") + # metadata is also stamped onto the task (best-effort) so it's labeled in the UI + assert self.stamped["task_id"] == "task-1" + assert self.stamped["task_metadata"]["peer_id"] == "acme/widgets#42" + assert self.stamped["task_metadata"]["agent_config_id"] == "cfg-9" + + async def test_inline_params_no_fetch(self): + result = await handle_webhook( + agent_name="a", + payload={"text": "hi"}, + acp_type="sync", + params={"system_prompt": "inline"}, + ) + assert result.reply == "Looks good — ship it." + assert self.created["params"] == {"system_prompt": "inline"} + + async def test_source_metadata_cannot_override_canonical(self): + async def fake_resolve(_url): + return {"params": {}, "task_metadata": {"channel": "spoofed"}} + + result = await handle_webhook( + agent_name="a", + payload=_pr_payload(), + shaper="github_pr", + params_source="https://h/resolve", + fetch=fake_resolve, + ) + assert result.task_metadata["channel"] == "github_pr" + + async def test_task_metadata_preserves_existing_keys_on_reused_task(self): + self.created_task_metadata = { + "labels": ["customer-facing"], + "agent_config_id": "old-cfg", + "channel": "old-channel", + } + + async def fake_resolve(_url): + return {"params": {}, "task_metadata": {"agent_config_id": "cfg-9"}} + + await handle_webhook( + agent_name="a", + payload=_pr_payload(), + shaper="github_pr", + params_source="https://h/resolve", + fetch=fake_resolve, + ) + + stamped_metadata = self.stamped["task_metadata"] + assert stamped_metadata["labels"] == ["customer-facing"] + assert stamped_metadata["agent_config_id"] == "cfg-9" + assert stamped_metadata["channel"] == "github_pr" + + async def test_async_without_wait_sends_event_and_returns_no_reply(self): + result = await handle_webhook(agent_name="a", payload={"text": "go"}, acp_type="async", wait=False) + assert result.reply is None + self.send_event.assert_awaited_once() + + +class TestAwaitReplyIgnoresStalePriorReply: + async def test_returns_only_new_agent_text_on_reused_task(self, monkeypatch): + from agentex.lib.sdk.utils.webhooks import _await_reply + + old = _agent_msg("OLD reply") + old.id = "m1" + new = _agent_msg("NEW reply") + new.id = "m2" + calls = {"n": 0} + + async def fake_list(*, task_id, **_): + calls["n"] += 1 + return [old] if calls["n"] < 2 else [old, new] # new appears on 2nd poll + + async def no_sleep(_seconds): + return None + + monkeypatch.setattr(adk.messages, "list", fake_list) + monkeypatch.setattr("asyncio.sleep", no_sleep) + + # baseline = the pre-existing old message; only m2 (NEW) should be returned + reply = await _await_reply("task-1", {"m1"}, interval_s=0.0, quiescence_s=0.0) + assert reply == "NEW reply" + + async def test_returns_idless_agent_text_after_snapshot(self, monkeypatch): + from agentex.lib.sdk.utils.webhooks import _await_reply + + old = _agent_msg("OLD reply") + old.id = None + new = _agent_msg("NEW reply") + new.id = None + calls = {"n": 0} + + async def fake_list(*, task_id, **_): + calls["n"] += 1 + return [old] if calls["n"] < 2 else [old, new] + + async def no_sleep(_seconds): + return None + + monkeypatch.setattr(adk.messages, "list", fake_list) + monkeypatch.setattr("asyncio.sleep", no_sleep) + + reply = await _await_reply( + "task-1", + set(), + seen_count=1, + interval_s=0.0, + quiescence_s=0.0, + ) + assert reply == "NEW reply"