dapr
diff --git a/‎examples/workflow/async_activities.py‎
Lines changed: 104 additions & 0 deletions b/‎examples/workflow/async_activities.py‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎ext/dapr-ext-workflow/AGENTS.md‎
Lines changed: 20 additions & 1 deletion b/‎ext/dapr-ext-workflow/AGENTS.md‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎ext/dapr-ext-workflow/benchmarks/RESULTS.md‎
Lines changed: 132 additions & 0 deletions b/‎ext/dapr-ext-workflow/benchmarks/RESULTS.md‎
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+# Copyright 2026 The Dapr Authors
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Async activities running alongside sync ones in a single workflow.
+
+Starts three async activities that do an HTTP request, then a sync activity that
+sums up the results. Shows that sync and async activities work side by side.
+
+Run with:
+
+    dapr run --app-id async-activities --app-protocol grpc --dapr-grpc-port 50001 \\
+        -- python async_activities.py
+"""
+
+from __future__ import annotations
+
+from time import sleep
+
+import dapr.ext.workflow as wf
+import httpx
+from pydantic import BaseModel
+
+wfr = wf.WorkflowRuntime()
+
+
+class FetchRequest(BaseModel):
+    url: str
+    timeout_seconds: float = 5.0
+
+
+class FetchResult(BaseModel):
+    url: str
+    status_code: int
+    body_length: int
+
+
+@wfr.workflow(name='parallel_fetch_workflow')
+def parallel_fetch_workflow(ctx: wf.DaprWorkflowContext, urls: list[str]):
+    fetch_tasks = [
+        ctx.call_activity(fetch_url, input=FetchRequest(url=url).model_dump()) for url in urls
+    ]
+    results = yield wf.when_all(fetch_tasks)
+    summary = yield ctx.call_activity(summarize_fetches, input=results)
+    return summary
+
+
+@wfr.activity(name='fetch_url')
+async def fetch_url(ctx: wf.WorkflowActivityContext, request: FetchRequest) -> dict:
+    """Async activity: fetch a URL with httpx. Multiple instances run concurrently."""
+    async with httpx.AsyncClient(timeout=request.timeout_seconds) as client:
+        response = await client.get(request.url)
+    result = FetchResult(
+        url=request.url,
+        status_code=response.status_code,
+        body_length=len(response.content),
+    )
+    print(
+        f'[async] fetched {result.url} -> {result.status_code} ({result.body_length}B)', flush=True
+    )
+    return result.model_dump()
+
+
+@wfr.activity(name='summarize_fetches')
+def summarize_fetches(ctx: wf.WorkflowActivityContext, results: list[dict]) -> str:
+    """Sync activity: runs in the sync-fallback thread pool. Unchanged from before."""
+    total_bytes = sum(r['body_length'] for r in results)
+    summary = f'fetched {len(results)} URLs, total {total_bytes} bytes'
+    print(f'[sync] {summary}', flush=True)
+    return summary
+
+
+def main() -> None:
+    urls = [
+        'https://httpbin.org/uuid',
+        'https://httpbin.org/get',
+        'https://httpbin.org/headers',
+    ]
+
+    wfr.start()
+    sleep(5)  # wait for workflow runtime to start
+
+    wf_client = wf.DaprWorkflowClient()
+    instance_id = wf_client.schedule_new_workflow(workflow=parallel_fetch_workflow, input=urls)
+    print(f'Workflow started. Instance ID: {instance_id}')
+
+    state = wf_client.wait_for_workflow_completion(instance_id, timeout_in_seconds=60)
+    assert state is not None
+    print(f'Workflow completed! Status: {state.runtime_status.name}')
+    print(f'Workflow result: {state.serialized_output.strip(chr(34))}')
+
+    wfr.shutdown()
+
+
+if __name__ == '__main__':
+    main()
@@ -105,6 +105,24 @@ The entry point for registration and lifecycle:
 
 Internally wraps user functions: workflow functions get a `DaprWorkflowContext`, activity functions get a `WorkflowActivityContext`. Tracks registration state via `_workflow_registered` / `_activity_registered` attributes on functions to prevent double registration.
 
+#### Sync and async activities
+
+Activities can be either `def my_activity(ctx, inp)` or `async def my_activity(ctx, inp)`. At registration, `_make_activity_wrapper` calls `_is_async_callable(fn)` to detect async-ness. That helper unwraps `functools.partial`, `@functools.wraps` chains, and callable-class `__call__` so common decorator patterns route correctly. The wrapper is built `async def` or `def` to match, then stored in the registry.
+
+At dispatch time (the gRPC stream loop in `_durabletask/worker.py`), `inspect.iscoroutinefunction(activity_fn)` on the wrapper selects between two handlers.
+
+- **Async activities** go through `_execute_activity_async`, then `_ActivityExecutor.execute_async`, which awaits `fn(...)` directly on the event loop. No thread pool involvement. The gRPC response is delivered via `loop.run_in_executor(None, stub.CompleteActivityTask, ...)` (asyncio's default executor).
+- **Sync activities** go through `_execute_activity`, dispatched to the thread pool by `_AsyncWorkerManager._run_func`. The activity runs on a worker thread, and the response is delivered from the same thread. The thread pool size is controlled by `maximum_thread_pool_workers`.
+
+Workflow (orchestrator) functions must remain generators (`def` with `yield`). They cannot be `async def` because durabletask's deterministic replay depends on synchronous generator semantics. Only activities support async.
+
+**Decorator ordering gotcha.** Stacking `@wfr.activity` over `@alternate_name(...)` over `async def` works because `@alternate_name` now emits an `async def innerfn` when the wrapped function is async. A user-written decorator that wraps an async function in a sync `def` (without `@functools.wraps` exposing `__wrapped__`) defeats `_is_async_callable`, routes the activity to the sync path, and produces an un-awaited coroutine. Such decorators should use `@functools.wraps(fn)` so the unwrap walks through them.
+
+**`maximum_thread_pool_workers` gotcha.** This knob sizes the sync-activity thread pool only. Async-activity response delivery uses asyncio's default executor (process-wide, lazily sized to `min(32, cpu_count + 4)`), which is not capped by this knob. Strict thread-count bounds for async response delivery require calling `asyncio.get_event_loop().set_default_executor(ThreadPoolExecutor(max_workers=N))` before `wfr.start()`. A future PR may migrate the worker to `grpc.aio` and remove this caveat by sending responses without any thread pool.
+
+**Concurrency sizing and load characterization.** See `docs/concurrency.md` for sizing recommendations (`maximum_concurrent_activity_work_items`, `maximum_thread_pool_workers`), an async-vs-sync decision tree, and the default-executor caveat with a worked example. The `benchmarks/` directory ships `bench_async_activities.py` and the generated `RESULTS.md`; re-run it locally before claiming a perf regression — the report captures the run environment so a reader can tell whether a number applies to their hardware.
+
+
 ### DaprWorkflowClient (`dapr_workflow_client.py`)
 
 Client for workflow lifecycle management:
@@ -163,7 +181,7 @@ Retry configuration for activities and child workflows:
 1. **Registration**: User decorates functions with `@wfr.workflow` / `@wfr.activity`. The runtime wraps them and stores them in the durabletask worker's registry.
 2. **Startup**: `wfr.start()` opens a gRPC stream to the Dapr sidecar. The worker polls for work items.
 3. **Scheduling**: Client calls `schedule_new_workflow(fn, input=...)`. The function's name (or `_dapr_alternate_name`) is sent to the backend.
-4. **Execution**: The durabletask engine dispatches work items. Workflow functions are Python **generators** that `yield` tasks (activity calls, timers, child workflows). The engine records history; on replay, yielded tasks return cached results without re-executing.
+4. **Execution**: The durabletask engine dispatches work items. Workflow functions are Python **generators** that `yield` tasks (activity calls, timers, child workflows). Activity functions are either sync (dispatched to the worker's thread pool) or `async def` (awaited directly on the worker's event loop). The engine records history; on replay, yielded tasks return cached results without re-executing.
 5. **Determinism**: Workflows must be deterministic — no random, no wall-clock time, no I/O. Use `ctx.current_utc_datetime` instead of `datetime.now()`. Use `ctx.is_replaying` to guard side effects like logging.
 6. **Completion**: Client polls via `wait_for_workflow_completion()` or `get_workflow_state()`.
 
@@ -191,6 +209,7 @@ Two example directories exercise workflows:
   - `cross-app1.py`, `cross-app2.py`, `cross-app3.py` — cross-app calls
   - `versioning.py` — workflow versioning with `is_patched()`
   - `simple_aio_client.py` — async client variant
+  - `async_activities.py` — `async def` activities (HTTP fan-out with `httpx.AsyncClient`)
 
 ## Testing
 
 
@@ -0,0 +1,132 @@
+# Async-activity load benchmark results
+
+Generated by `bench_async_activities.py`. Re-run with:
+
+```bash
+uv run python ext/dapr-ext-workflow/benchmarks/bench_async_activities.py
+```
+
+## Run environment
+
+- **Timestamp**: 2026-05-25 20:40:09 UTC
+- **Git commit**: `8f13da0-dirty`
+- **Python**: CPython 3.13.12
+- **OS**: Darwin 25.5.0 (arm64) on Apple M3 Pro (12 logical cores), 36.0 GB
+- **asyncio default executor**: `max_workers=16` (`min(32, cpu_count + 4)`)
+- **CI environment**: no
+
+Numbers are specific to this hardware. Re-run locally to compare. The shape of
+the curves (throughput plateau, p99 inflection, drift) is what to compare
+across machines.
+
+Each scenario drives `TaskHubGrpcWorker._execute_activity_async` through
+`_AsyncWorkerManager` against a mock `CompleteActivityTask` stub. End-to-end
+latency is measured from `submit_activity` to the mock stub seeing the response.
+
+## 1. Concurrency win (issue #897 repro)
+
+100 × 1 s HTTP fetches. Async runs them concurrently on the loop, sync gates
+them through the thread pool.
+
+| Scenario | Wallclock (s) | Tput/s | Peak tasks | Peak RSS Δ (MB) |
+| --- | ---: | ---: | ---: | ---: |
+| Async fan-out | 1.47 | 68.1 | 305 | 86.4 |
+| Sync baseline | 13.34 | 7.5 | 121 | 2.4 |
+
+## 2. Throughput scaling
+
+Async fan-out, 50 ms activity, sem=5000, pool=16. Throughput plateaus around
+N=2500.
+
+| N | Wallclock (s) | Tput/s | p50 ms | p95 ms | p99 ms | Peak tasks | Peak RSS Δ (MB) |
+| ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| 100 | 0.06 | 1542.3 | 62.0 | 64.1 | 64.1 | 105 | 0.0 |
+| 500 | 0.08 | 5931.1 | 78.6 | 79.6 | 79.6 | 505 | 0.4 |
+| 1000 | 0.11 | 8956.5 | 102.9 | 106.2 | 106.3 | 1005 | 2.9 |
+| 2500 | 0.24 | 10532.0 | 218.8 | 225.3 | 225.9 | 2505 | 10.0 |
+| 5000 | 0.57 | 8696.7 | 543.8 | 557.2 | 558.7 | 5005 | 25.2 |
+
+## 3. Semaphore-cap sensitivity
+
+N=2500, 50 ms activity, pool=16. Caps below ~500 starve the loop. Gains
+compress above ~1000.
+
+| Sem | Wallclock (s) | Tput/s | p50 ms | p95 ms | p99 ms |
+| ---: | ---: | ---: | ---: | ---: | ---: |
+| 50 | 2.69 | 928.6 | 1422.7 | 2583.5 | 2687.0 |
+| 100 | 1.42 | 1758.2 | 794.9 | 1360.7 | 1412.0 |
+| 500 | 0.40 | 6229.5 | 279.2 | 387.9 | 392.3 |
+| 1000 | 0.30 | 8322.3 | 235.6 | 286.9 | 290.2 |
+| 5000 | 0.23 | 10720.7 | 215.0 | 222.3 | 222.8 |
+
+## 4. Failure threshold (queue-wait inflection)
+
+Cap=1000, ramp N, 50 ms activity. p99 first exceeds 2× server latency at
+**N=1000** (p99 = 104.7 ms).
+
+| N | Wallclock (s) | Tput/s | p50 ms | p95 ms | p99 ms |
+| ---: | ---: | ---: | ---: | ---: | ---: |
+| 500 | 0.08 | 6264.2 | 70.6 | 77.4 | 77.5 |
+| 1000 | 0.11 | 9145.3 | 94.5 | 104.2 | 104.7 |
+| 2500 | 0.31 | 8086.8 | 243.6 | 294.2 | 298.0 |
+| 5000 | 0.72 | 6983.2 | 584.2 | 691.1 | 700.5 |
+| 10000 | 2.08 | 4813.1 | 1801.7 | 2019.3 | 2046.2 |
+
+## 5. Sidecar response delivery overhead
+
+N=1000, sem=1000, pool=16, 50 ms activity. Mock `CompleteActivityTask` given
+an artificial delay. Async responses go through `loop.run_in_executor(None, ...)`,
+sharing asyncio's default executor (`max_workers=16` here). Delays past ~5 ms
+saturate that pool.
+
+| Delivery | Wallclock (s) | Tput/s | p50 ms | p95 ms | p99 ms |
+| ---: | ---: | ---: | ---: | ---: | ---: |
+| 0 ms | 0.11 | 9497.2 | 98.2 | 101.3 | 101.5 |
+| 1 ms | 0.18 | 5699.8 | 133.0 | 167.7 | 171.0 |
+| 5 ms | 0.48 | 2077.9 | 287.7 | 458.6 | 473.4 |
+| 10 ms | 0.86 | 1162.5 | 494.1 | 820.4 | 843.5 |
+
+## 6. Sustained load
+
+200/s for 120 s, 50 ms activity. Submitted/completed: 24 000 / 24 000.
+Wallclock 120.05 s (effective 199.9/s).
+
+- p50 50.2 ms, p95 50.6 ms, p99 50.8 ms, max 62.8 ms.
+- First-25% p99 50.8 ms, last-25% p99 50.7 ms. No drift.
+- Peak tasks 19, peak queue depth 3, peak RSS Δ 5.8 MB.
+
+## 7. Real HTTP workload
+
+Each activity opens a fresh `httpx.AsyncClient` and GETs an aiohttp endpoint
+sleeping 50 ms. Mirrors `examples/workflow/async_activities.py`. Pool=16 for
+all rows.
+
+| Scenario | N | Sem | Wallclock (s) | Tput/s | p50 ms | p95 ms | p99 ms | Peak tasks | Peak RSS Δ (MB) |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| Async | 100 | 1000 | 0.49 | 205.3 | 485.1 | 485.4 | 485.5 | 305 | 0.0 |
+| Async | 500 | 1000 | 2.06 | 243.2 | 1990.2 | 2052.6 | 2053.0 | 1376 | 308.1 |
+| Async | 1000 | 1000 | 4.28 | 233.4 | 4200.5 | 4274.9 | 4280.5 | 2555 | 398.5 |
+| Async | 2500 | 5000 | 15.16 | 165.0 | 10240.9 | 13260.9 | 15111.6 | 5776 | 1219.1 |
+| Sync | 100 | 1000 | 0.51 | 194.2 | 324.6 | 458.5 | 514.4 | 137 | 0.7 |
+
+## 8. Real HTTP sustained load
+
+Open-loop 100/s for 60 s with real `httpx.AsyncClient`. Submitted/completed:
+6000 / 6000. Wallclock 60.05 s (effective 99.9/s).
+
+- p50 56.1 ms, p95 68.9 ms, p99 76.0 ms, max 145.2 ms.
+- First-25% p99 75.7 ms, last-25% p99 76.2 ms. No drift.
+- Peak tasks 45, peak queue depth 6, peak RSS Δ 0.0 MB.
+
+## 9. OOM safety
+
+10 000 in-flight async activities, 50 ms, sem=1000, pool=8. ~9 000 Tasks
+parked on the semaphore. Peak RSS Δ stays well under the 500 MB budget.
+
+| N | Sem | Wallclock (s) | Tput/s | Peak tasks | Peak RSS Δ (MB) |
+| ---: | ---: | ---: | ---: | ---: | ---: |
+| 10000 | 1000 | 2.03 | 4918.2 | 10005 | 0.0 |
+
+## Operational guidance
+
+See `ext/dapr-ext-workflow/docs/concurrency.md` for sizing recommendations.