microsoft
diff --git a/‎.github/workflows/python-integration-tests.yml‎
Lines changed: 57 additions & 18 deletions b/‎.github/workflows/python-integration-tests.yml‎
Lines changed: 57 additions & 18 deletions
diff --git a/‎.github/workflows/python-merge-tests.yml‎
Lines changed: 57 additions & 18 deletions b/‎.github/workflows/python-merge-tests.yml‎
Lines changed: 57 additions & 18 deletions
diff --git a/‎python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py‎
Lines changed: 3 additions & 5 deletions b/‎python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py‎
Lines changed: 4 additions & 4 deletions b/‎python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py‎
Lines changed: 0 additions & 1 deletion b/‎python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎python/packages/foundry/tests/foundry/test_foundry_agent.py‎
Lines changed: 3 additions & 3 deletions b/‎python/packages/foundry/tests/foundry/test_foundry_agent.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎python/packages/foundry_hosting/tests/test_responses_int.py‎
Lines changed: 3 additions & 7 deletions b/‎python/packages/foundry_hosting/tests/test_responses_int.py‎
Lines changed: 3 additions & 7 deletions
@@ -157,6 +157,8 @@ jobs:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       ANTHROPIC_CHAT_MODEL: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }}
       LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }}
+      OLLAMA_MODEL: qwen2.5:1.5b
+      OLLAMA_EMBEDDING_MODEL: nomic-embed-text
     defaults:
       run:
         working-directory: python
@@ -171,6 +173,43 @@ jobs:
         with:
           python-version: ${{ env.UV_PYTHON }}
           os: ${{ runner.os }}
+      - name: Install Ollama
+        run: curl -fsSL https://ollama.com/install.sh | sh
+        working-directory: .
+      - name: Cache Ollama models
+        uses: actions/cache@v4
+        with:
+          path: ~/.ollama/models
+          key: ollama-models-qwen2.5-1.5b-nomic-embed-text-v1
+      - name: Start Ollama and pull models
+        run: |
+          # Stop any Ollama instance auto-started by the install script
+          pkill ollama || true
+          sleep 2
+          ollama serve &
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+              break
+            fi
+            sleep 1
+          done
+          # Pull models with retry for transient 429 rate limits
+          for model in qwen2.5:1.5b nomic-embed-text; do
+            pulled=false
+            for attempt in 1 2 3; do
+              if ollama pull "$model"; then
+                pulled=true
+                break
+              fi
+              echo "Retry $attempt for $model (waiting 15s)..."
+              sleep 15
+            done
+            if [ "$pulled" != "true" ]; then
+              echo "ERROR: Failed to pull $model after 3 attempts"
+              exit 1
+            fi
+          done
+        working-directory: .
       - name: Start local MCP server
         id: local-mcp
         uses: ./.github/actions/setup-local-mcp-server
@@ -271,7 +310,7 @@ jobs:
           -m integration
           -n logical --dist worksteal
           -x
-          --timeout=360 --session-timeout=900 --timeout_method thread
+          --timeout=480 --session-timeout=900 --timeout_method thread
           --retries 2 --retry-delay 5
           --junitxml=pytest.xml
       - name: Upload test results
@@ -435,9 +474,9 @@ jobs:
           path: ./python/pytest.xml
           if-no-files-found: ignore
 
-  # Flaky test trend report (aggregates per-job JUnit XML results)
-  python-flaky-test-report:
-    name: Flaky Test Report
+  # Integration test trend report (aggregates per-job JUnit XML results)
+  python-integration-test-report:
+    name: Integration Test Report
     if: >
       always() &&
       (contains(join(needs.*.result, ','), 'success') ||
@@ -471,36 +510,36 @@ jobs:
         with:
           pattern: test-results-*
           path: test-results/
-      - name: Restore flaky report history cache
+      - name: Restore report history cache
         uses: actions/cache/restore@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-integration-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-integration-${{ github.run_id }}
           restore-keys: |
-            flaky-report-history-integration-
+            integration-report-history-integration-
       - name: Generate trend report
         run: >
-          uv run python scripts/flaky_report/aggregate.py
+          uv run python scripts/integration_test_report/aggregate.py
           ../test-results/
-          flaky-report-history.json
-          flaky-test-report.md
+          integration-report-history.json
+          integration-test-report.md
       - name: Post to Job Summary
         if: always()
-        run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY
-      - name: Save flaky report history cache
+        run: cat integration-test-report.md >> $GITHUB_STEP_SUMMARY
+      - name: Save report history cache
         if: always()
         uses: actions/cache/save@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-integration-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-integration-${{ github.run_id }}
       - name: Upload unified trend report
         if: always()
         uses: actions/upload-artifact@v7
         with:
-          name: flaky-test-report
+          name: integration-test-report
           path: |
-            python/flaky-test-report.md
-            python/flaky-report-history.json
+            python/integration-test-report.md
+            python/integration-report-history.json
 
   python-integration-tests-check:
     if: always()
 
@@ -278,6 +278,8 @@ jobs:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       ANTHROPIC_CHAT_MODEL: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }}
       LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }}
+      OLLAMA_MODEL: qwen2.5:1.5b
+      OLLAMA_EMBEDDING_MODEL: nomic-embed-text
     defaults:
       run:
         working-directory: python
@@ -289,6 +291,43 @@ jobs:
         with:
           python-version: ${{ env.UV_PYTHON }}
           os: ${{ runner.os }}
+      - name: Install Ollama
+        run: curl -fsSL https://ollama.com/install.sh | sh
+        working-directory: .
+      - name: Cache Ollama models
+        uses: actions/cache@v4
+        with:
+          path: ~/.ollama/models
+          key: ollama-models-qwen2.5-1.5b-nomic-embed-text-v1
+      - name: Start Ollama and pull models
+        run: |
+          # Stop any Ollama instance auto-started by the install script
+          pkill ollama || true
+          sleep 2
+          ollama serve &
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+              break
+            fi
+            sleep 1
+          done
+          # Pull models with retry for transient 429 rate limits
+          for model in qwen2.5:1.5b nomic-embed-text; do
+            pulled=false
+            for attempt in 1 2 3; do
+              if ollama pull "$model"; then
+                pulled=true
+                break
+              fi
+              echo "Retry $attempt for $model (waiting 15s)..."
+              sleep 15
+            done
+            if [ "$pulled" != "true" ]; then
+              echo "ERROR: Failed to pull $model after 3 attempts"
+              exit 1
+            fi
+          done
+        working-directory: .
       - name: Start local MCP server
         id: local-mcp
         uses: ./.github/actions/setup-local-mcp-server
@@ -403,7 +442,7 @@ jobs:
           -m integration
           -n logical --dist worksteal
           -x
-          --timeout=360 --session-timeout=900 --timeout_method thread
+          --timeout=480 --session-timeout=900 --timeout_method thread
           --retries 2 --retry-delay 5
           --junitxml=pytest.xml
         working-directory: ./python
@@ -619,9 +658,9 @@ jobs:
           path: ./python/pytest.xml
           if-no-files-found: ignore
 
-  # Flaky test trend report (aggregates per-job JUnit XML results)
-  python-flaky-test-report:
-    name: Flaky Test Report
+  # Integration test trend report (aggregates per-job JUnit XML results)
+  python-integration-test-report:
+    name: Integration Test Report
     if: >
       always() &&
       (contains(join(needs.*.result, ','), 'success') ||
@@ -652,36 +691,36 @@ jobs:
         with:
           pattern: test-results-*
           path: test-results/
-      - name: Restore flaky report history cache
+      - name: Restore report history cache
         uses: actions/cache/restore@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-merge-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-merge-${{ github.run_id }}
           restore-keys: |
-            flaky-report-history-merge-
+            integration-report-history-merge-
       - name: Generate trend report
         run: >
-          uv run python scripts/flaky_report/aggregate.py
+          uv run python scripts/integration_test_report/aggregate.py
           ../test-results/
-          flaky-report-history.json
-          flaky-test-report.md
+          integration-report-history.json
+          integration-test-report.md
       - name: Post to Job Summary
         if: always()
-        run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY
-      - name: Save flaky report history cache
+        run: cat integration-test-report.md >> $GITHUB_STEP_SUMMARY
+      - name: Save report history cache
         if: always()
         uses: actions/cache/save@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-merge-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-merge-${{ github.run_id }}
       - name: Upload unified trend report
         if: always()
         uses: actions/upload-artifact@v7
         with:
-          name: flaky-test-report
+          name: integration-test-report
           path: |
-            python/flaky-test-report.md
-            python/flaky-report-history.json
+            python/integration-test-report.md
+            python/integration-report-history.json
 
   python-integration-tests-check:
     if: always()
 
@@ -26,7 +26,6 @@
     pytest.mark.integration,
     pytest.mark.sample("03_reliable_streaming"),
     pytest.mark.usefixtures("function_app_for_test"),
-    pytest.mark.skip(reason="Temp disabled to fix test instability - needs investigation into root cause"),
 ]
 
 
@@ -56,12 +55,11 @@ def test_agent_run_and_stream(self) -> None:
         # Wait a moment for the agent to start writing to Redis
         time.sleep(2)
 
-        # Stream response from Redis with shorter timeout
-        # Note: We use text/plain to avoid SSE parsing complexity
+        # Stream response from Redis with longer timeout to account for LLM latency
         stream_response = requests.get(
             f"{self.stream_url}/{thread_id}",
             headers={"Accept": "text/plain"},
-            timeout=30,  # Shorter timeout for test
+            timeout=60,
         )
         assert stream_response.status_code == 200
 
@@ -83,7 +81,7 @@ def test_stream_with_sse_format(self) -> None:
         stream_response = requests.get(
             f"{self.stream_url}/{thread_id}",
             headers={"Accept": "text/event-stream"},
-            timeout=30,  # Shorter timeout
+            timeout=60,
         )
         assert stream_response.status_code == 200
         content_type = stream_response.headers.get("content-type", "")
 
@@ -42,7 +42,7 @@ def _setup(self, base_url: str, sample_helper) -> None:
         self.base_url = base_url
         self.helper = sample_helper
 
-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_document_analysis(self) -> None:
         """Test parallel workflow with a standard document."""
         payload = {
@@ -71,7 +71,7 @@ def test_parallel_workflow_document_analysis(self) -> None:
         assert status["runtimeStatus"] == "Completed"
         assert "output" in status
 
-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_short_document(self) -> None:
         """Test parallel workflow with a short document."""
         payload = {
@@ -91,7 +91,7 @@ def test_parallel_workflow_short_document(self) -> None:
         assert status["runtimeStatus"] == "Completed"
         assert "output" in status
 
-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_technical_document(self) -> None:
         """Test parallel workflow with a technical document."""
         payload = {
@@ -115,7 +115,7 @@ def test_parallel_workflow_technical_document(self) -> None:
         status = self.helper.wait_for_orchestration_with_output(data["statusQueryGetUri"], max_wait=300)
         assert status["runtimeStatus"] == "Completed"
 
-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_workflow_status_endpoint(self) -> None:
         """Test that the workflow status endpoint works correctly."""
         payload = {
 
@@ -52,7 +52,6 @@ def test_agents_registered(self):
         assert email_agent is not None
         assert email_agent.name == EMAIL_AGENT_NAME
 
-    @pytest.mark.skip(reason="Consistently fails due to orchestration timeouts - needs investigation")
     def test_conditional_branching(self):
         """Test that conditional branching works correctly."""
         # Test with obvious spam
 
@@ -634,7 +634,6 @@ def _import_with_missing_azure_monitor(
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_foundry_agent_integration_tests_disabled
-@pytest.mark.skip(reason="Test agent seems to have disappeared from the test environment; needs investigation.")
 async def test_foundry_agent_basic_run() -> None:
     """Smoke-test FoundryAgent against a real configured agent."""
     async with FoundryAgent(credential=AzureCliCredential(), allow_preview=True) as agent:
@@ -648,10 +647,11 @@ async def test_foundry_agent_basic_run() -> None:
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_foundry_agent_integration_tests_disabled
-@pytest.mark.skip(reason="Test agent seems to have disappeared from the test environment; needs investigation.")
 async def test_foundry_agent_custom_client_run() -> None:
     """Smoke-test FoundryAgent against a real configured agent."""
-    async with FoundryAgent(credential=AzureCliCredential(), client_type=RawFoundryAgentChatClient) as agent:
+    async with FoundryAgent(
+        credential=AzureCliCredential(), client_type=RawFoundryAgentChatClient, allow_preview=True
+    ) as agent:
         response = await agent.run("Please respond with exactly: 'This is a response test.'")
 
     assert isinstance(response, AgentResponse)
 
@@ -559,25 +559,21 @@ async def test_tool_call_streaming(self, server_with_tools: ResponsesHostServer)
 class TestOptions:
     """Verify chat options are passed through to the model."""
 
-    @pytest.mark.skip(reason="Flaky in merge queue, blocking unrelated PRs. Tracked in #5553.")
     @pytest.mark.flaky
     @pytest.mark.integration
     @skip_if_foundry_hosting_integration_tests_disabled
     async def test_temperature_and_max_tokens(self, server: ResponsesHostServer) -> None:
-        """Set temperature and max_output_tokens and verify the response succeeds."""
+        """Set max_output_tokens and verify the response succeeds."""
         resp = await _post_json(
             server,
             {
                 "input": "Say hello briefly.",
                 "stream": False,
-                "max_output_tokens": 50,
+                "max_output_tokens": 200,
             },
         )
 
         assert resp.status_code == 200
         body = resp.json()
         assert body["status"] == "completed"
-        output_messages = [o for o in body["output"] if o["type"] == "message"]
-        assert len(output_messages) == 1
-        output_text = output_messages[0]["content"][0]["text"]
-        assert len(output_text) > 0
+        assert len(body["output"]) > 0