From b544cf22f48b5ae3d6a609e1ab983e95ad78b8c3 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 11:30:10 -0400
Subject: [PATCH 01/23] feat: add Anthropic SDK backend + per-model backend
 selection

Adds anthropic_sdk as a third backend adapter driving the native
Anthropic Messages API (/v1/messages) via the official anthropic
Python SDK. Supports streaming, MCP tool calling, and adaptive
thinking with configurable reasoning effort.

Key changes:
- New backend: sdk/anthropic_sdk/backend.py implementing AgentBackend
- Per-model backend selection via model_settings.backend (allows mixed
  backends in a single taskflow, e.g. Anthropic for code_analysis +
  OpenAI for general_tasks)
- Both anthropic and github-copilot-sdk are now regular dependencies
  (not optional) since per-model backend config means any SDK could be
  needed at runtime
- BackendSdk/ApiType Literals extended for anthropic_sdk/messages
- _resolve_task_model() returns per-task backend override
- stream_thinking model_settings option (opt-in, default off)
- README and GRAMMAR.md updated with backend docs; the README's
  selection precedence no longer lists the api.githubcopilot.com
  endpoint auto-default

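Auth: CAPI (the GitHub Copilot API) expects Authorization: Bearer on
/v1/messages, not the x-api-key header the Anthropic SDK sends by
default; the adapter passes the bearer header via default_headers and
gives the SDK a placeholder api_key.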

Thinking: uses adaptive thinking with output_config.effort. CAPI
returns encrypted thinking signatures (the content is not readable);
the stream_thinking flag is in place for if and when thinking content
is exposed.

Tested: basic messages, streaming, multi-turn tool calling via MCP,
mixed-backend taskflows, all reasoning effort levels (low/medium/high/
max), error handling, openai_agents regression.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md                                     |  44 ++-
 doc/GRAMMAR.md                                |  12 +-
 pyproject.toml                                |  11 +-
 src/seclab_taskflow_agent/models.py           |   4 +-
 src/seclab_taskflow_agent/runner.py           |  17 +-
 src/seclab_taskflow_agent/sdk/__init__.py     |   8 +-
 .../sdk/anthropic_sdk/__init__.py             |   4 +
 .../sdk/anthropic_sdk/backend.py              | 284 ++++++++++++++++++
 8 files changed, 345 insertions(+), 39 deletions(-)
 create mode 100644 src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py
 create mode 100644 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py

diff --git a/README.md b/README.md
index 3e44d86d..75c588a7 100644
--- a/README.md
+++ b/README.md
@@ -83,37 +83,49 @@ Per-model `model_settings` can include:
 
 ### Backends
 
-The runner can drive two SDKs behind a common interface:
+The runner can drive three SDKs behind a common interface:
 
 - **`openai_agents`** (default) — the OpenAI Agents Python SDK. Supports
   multi-personality handoffs, both `chat_completions` and `responses`
   `api_type`, `temperature`, `parallel_tool_calls`,
   `exclude_from_context`, and MCP over stdio, SSE, and streamable HTTP.
-- **`copilot_sdk`** (optional, `pip install seclab-taskflow-agent[copilot]`)
-  — the GitHub Copilot Python SDK. Supports streaming, `reasoning_effort`,
-  MCP over stdio/SSE/HTTP, and per-tool permission gating. The SDK
-  selects its own wire protocol per model, so the YAML `api_type` field
-  is not honoured; multi-personality handoffs, `temperature`, and
-  `parallel_tool_calls` are likewise not available. Taskflows that use
-  unsupported fields fail at load time with a `BackendCapabilityError`
-  naming the offending field.
+- **`copilot_sdk`** — the GitHub Copilot Python SDK. Supports streaming,
+  `reasoning_effort`, MCP over stdio/SSE/HTTP, and per-tool permission
+  gating. The SDK selects its own wire protocol per model, so the YAML
+  `api_type` field is not honoured; multi-personality handoffs,
+  `temperature`, and `parallel_tool_calls` are likewise not available.
+  Taskflows that use unsupported fields fail at load time with a
+  `BackendCapabilityError` naming the offending field.
+- **`anthropic_sdk`** — the Anthropic Python SDK, driving the native
+  Messages API (`/v1/messages`). Supports streaming, tool calling via
+  MCP, and adaptive thinking with configurable `reasoning.effort`
+  (`low`, `medium`, `high`, `max`). Handoffs are not supported.
+  Designed for use with CAPI's Anthropic endpoint; auth uses
+  `Authorization: Bearer` (not `x-api-key`).
 
 Selection precedence:
 
-1. `backend:` field in the model config document.
-2. `SECLAB_TASKFLOW_BACKEND` environment variable.
-3. Endpoint auto-default (`api.githubcopilot.com` prefers `copilot_sdk`
-   when the optional dependency is installed).
+1. Per-model `backend:` in `model_settings` (allows mixed backends in a
+   single taskflow).
+2. `backend:` field in the model config document (global default).
+3. `SECLAB_TASKFLOW_BACKEND` environment variable.
 4. `openai_agents`.
 
 ```yaml
 seclab-taskflow-agent:
   version: "1.0"
   filetype: model_config
-backend: copilot_sdk
 models:
-  fast: gpt-5-mini
-  slow: claude-opus-4.6
+  code_analysis: claude-mythos-5
+  general_tasks: gpt-5.4-mini
+model_settings:
+  code_analysis:
+    backend: anthropic_sdk
+    reasoning:
+      effort: high
+  general_tasks:
+    api_type: responses
+    backend: openai_agents
 ```
 
 ### Session Recovery
diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md
index b7e16ee9..67c57497 100644
--- a/doc/GRAMMAR.md
+++ b/doc/GRAMMAR.md
@@ -524,6 +524,7 @@ api_type: chat_completions        # default for all models
 models:
   gpt_default: gpt-4.1
   gpt_responses: gpt-5.1
+  claude_native: claude-mythos-5
 model_settings:
   gpt_default:
     temperature: 0.7
@@ -532,6 +533,10 @@ model_settings:
     endpoint: https://api.githubcopilot.com
     token: CAPI_TOKEN             # env var name containing the API key
     temperature: 0.5
+  claude_native:
+    backend: anthropic_sdk        # use the Anthropic Messages API
+    reasoning:
+      effort: high
 ```
 
 The following keys in `model_settings` are handled by the engine and are not
@@ -539,9 +544,10 @@ passed to the underlying model provider:
 
 | Key | Description | Default |
 |-----|-------------|---------|
-| `api_type` | `"chat_completions"` or `"responses"` | Inherited from top-level `api_type`, or `"chat_completions"` |
+| `api_type` | `"chat_completions"`, `"responses"`, or `"messages"` | Inherited from top-level `api_type`, or `"chat_completions"` |
+| `backend` | SDK adapter: `"openai_agents"`, `"copilot_sdk"`, or `"anthropic_sdk"` | Inherited from top-level `backend`, then the `SECLAB_TASKFLOW_BACKEND` env var, or `"openai_agents"` |
 | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var |
 | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` |
 
-All other keys (e.g. `temperature`, `top_p`) are passed through as model
-parameters to the OpenAI SDK.
+All other keys (e.g. `temperature`, `top_p`, `reasoning`) are passed through as
+model parameters to the selected SDK backend.
diff --git a/pyproject.toml b/pyproject.toml
index 6f805a7f..aa163f75 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ classifiers = [
 dependencies = [
   "aiofiles==24.1.0",
   "annotated-types==0.7.0",
+  "anthropic>=0.50,<1",
   "anyio==4.9.0",
   "attrs==25.3.0",
   "Authlib==1.6.12",
@@ -55,6 +56,7 @@ dependencies = [
   "email-validator==2.3.0",
   "exceptiongroup==1.3.0",
   "fastmcp==3.2.0",
+  "github-copilot-sdk>=0.2.2,<0.3",
   "griffe==1.7.3",
   "h11==0.16.0",
   "httpcore==1.0.9",
@@ -124,15 +126,6 @@ dependencies = [
 [project.scripts]
 seclab-taskflow-agent = "seclab_taskflow_agent.cli:app"
 
-[project.optional-dependencies]
-# Pulls in the GitHub Copilot SDK (public preview) so the copilot_sdk
-# backend can be selected. Requires Python >= 3.11. Pinned to the
-# 0.2.x line because the SDK may ship breaking changes between minor
-# versions while still in preview.
-copilot = [
-  "github-copilot-sdk>=0.2.2,<0.3",
-]
-
 [project.urls]
 Source = "https://github.com/GitHubSecurityLab/seclab-taskflow-agent"
 Issues = "https://github.com/GitHubSecurityLab/seclab-taskflow-agent/issues"
diff --git a/src/seclab_taskflow_agent/models.py b/src/seclab_taskflow_agent/models.py
index eff05ee6..837e4e2c 100644
--- a/src/seclab_taskflow_agent/models.py
+++ b/src/seclab_taskflow_agent/models.py
@@ -31,10 +31,10 @@
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 # Valid API type values for model configuration.
-ApiType = Literal["chat_completions", "responses"]
+ApiType = Literal["chat_completions", "responses", "messages"]
 
 # Valid backend names. Must stay in sync with ``sdk._KNOWN``.
-BackendSdk = Literal["openai_agents", "copilot_sdk"]
+BackendSdk = Literal["openai_agents", "copilot_sdk", "anthropic_sdk"]
 
 
 # ---------------------------------------------------------------------------
diff --git a/src/seclab_taskflow_agent/runner.py b/src/seclab_taskflow_agent/runner.py
index 12d36bd8..b5ed43fd 100644
--- a/src/seclab_taskflow_agent/runner.py
+++ b/src/seclab_taskflow_agent/runner.py
@@ -126,12 +126,12 @@ def _resolve_task_model(
     model_dict: dict[str, str],
     models_params: dict[str, dict[str, Any]],
     default_api_type: str = "chat_completions",
-) -> tuple[str, dict[str, Any], str, str | None, str | None]:
+) -> tuple[str, dict[str, Any], str, str | None, str | None, str | None]:
     """Resolve the final model name, settings, and per-model overrides.
 
     Returns:
-        A tuple of ``(model_id, model_settings, api_type, endpoint, token)``
-        where *endpoint* and *token* are ``None`` when not overridden.
+        A tuple of ``(model_id, model_settings, api_type, endpoint, token, backend)``
+        where *endpoint*, *token*, and *backend* are ``None`` when not overridden.
 
     Raises:
         ValueError: If task-level model_settings is not a dictionary.
@@ -141,6 +141,7 @@ def _resolve_task_model(
     api_type: str = default_api_type
     endpoint: str | None = None
     token: str | None = None
+    backend: str | None = None
 
     if logical_name in model_keys:
         if logical_name in models_params:
@@ -151,6 +152,7 @@ def _resolve_task_model(
     api_type = model_settings.pop("api_type", api_type)
     endpoint = model_settings.pop("endpoint", None)
     token = model_settings.pop("token", None)
+    backend = model_settings.pop("backend", None)
 
     task_model_settings: dict[str, Any] | Any = task.model_settings or {}
     if not isinstance(task_model_settings, dict):
@@ -161,9 +163,10 @@ def _resolve_task_model(
     api_type = task_settings.pop("api_type", api_type)
     endpoint = task_settings.pop("endpoint", endpoint)
     token = task_settings.pop("token", token)
+    backend = task_settings.pop("backend", backend)
 
     model_settings.update(task_settings)
-    return logical_name, model_settings, api_type, endpoint, token
+    return logical_name, model_settings, api_type, endpoint, token, backend
 
 
 async def _build_prompts_to_run(
@@ -600,8 +603,8 @@ async def on_handoff_hook(context: RunContextWrapper[TContext], agent: Agent[TCo
             if task.uses:
                 task = _merge_reusable_task(available_tools, task)
 
-            # Resolve model (name, settings, api_type, optional endpoint/token)
-            model, model_settings, task_api_type, task_endpoint, task_token = _resolve_task_model(
+            # Resolve model (name, settings, api_type, optional endpoint/token/backend)
+            model, model_settings, task_api_type, task_endpoint, task_token, task_backend = _resolve_task_model(
                 task, model_keys, model_dict, models_params, default_api_type=api_type,
             )
 
@@ -697,7 +700,7 @@ async def _deploy(ra: dict, pp: str) -> bool:
                                     api_type=task_api_type,
                                     endpoint=task_endpoint,
                                     token=task_token,
-                                    backend=backend,
+                                    backend=task_backend or backend,
                                     agent_hooks=TaskAgentHooks(on_handoff=on_handoff_hook),
                                 )
 
diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py
index 15086922..972afa90 100644
--- a/src/seclab_taskflow_agent/sdk/__init__.py
+++ b/src/seclab_taskflow_agent/sdk/__init__.py
@@ -33,7 +33,7 @@
 )
 
 _ENV_VAR = "SECLAB_TASKFLOW_BACKEND"
-_KNOWN = ("openai_agents", "copilot_sdk")
+_KNOWN = ("openai_agents", "copilot_sdk", "anthropic_sdk")
 _BACKENDS: dict[str, AgentBackend] = {}
 
 
@@ -46,10 +46,14 @@ def get_backend(name: str) -> AgentBackend:
             from .openai_agents.backend import OpenAIAgentsBackend
 
             _BACKENDS[name] = OpenAIAgentsBackend()
-        else:
+        elif name == "copilot_sdk":
             from .copilot_sdk.backend import CopilotSDKBackend
 
             _BACKENDS[name] = CopilotSDKBackend()
+        else:
+            from .anthropic_sdk.backend import AnthropicSDKBackend
+
+            _BACKENDS[name] = AnthropicSDKBackend()
     return _BACKENDS[name]
 
 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py
new file mode 100644
index 00000000..03ec0700
--- /dev/null
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: GitHub, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Anthropic SDK backend adapter."""
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
new file mode 100644
index 00000000..9d975fc3
--- /dev/null
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -0,0 +1,284 @@
+# SPDX-FileCopyrightText: GitHub, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Anthropic SDK backend adapter.
+
+Drives the Anthropic Messages API (``/v1/messages``) via the official
+``anthropic`` Python SDK. Supports streaming, tool calling via MCP
+servers, and extended thinking.
+
+Auth note: The Anthropic SDK sends ``x-api-key`` by default, but CAPI
+expects ``Authorization: Bearer``. We pass the bearer header via
+``default_headers`` and set ``api_key`` to a placeholder so the SDK
+doesn't complain about a missing key.
+"""
+
+from __future__ import annotations
+
+__all__ = ["AnthropicSDKBackend"]
+
+import json
+import logging
+import os
+from collections.abc import AsyncIterator
+from dataclasses import dataclass, field
+from typing import Any
+
+from ..base import AgentSpec, StreamEvent, TextDelta, ToolEnd
+from ..errors import (
+    BackendBadRequestError,
+    BackendCapabilityError,
+    BackendMaxTurnsError,
+    BackendRateLimitError,
+    BackendTimeoutError,
+    BackendUnexpectedError,
+)
+
+logger = logging.getLogger(__name__)
+
+_VALID_REASONING = ("low", "medium", "high", "max")
+
+
+def _resolve_token(token_env: str | None) -> str:
+    """Resolve the API token from env var name or default AI_API_TOKEN."""
+    if token_env:
+        val = os.getenv(token_env)
+        if val:
+            return val
+    val = os.getenv("AI_API_TOKEN")
+    if val:
+        return val
+    raise BackendBadRequestError(
+        "anthropic_sdk: no API token found (set AI_API_TOKEN or per-model token env)"
+    )
+
+
+def _resolve_endpoint() -> str:
+    """Resolve the API base URL."""
+    return os.getenv("AI_API_ENDPOINT", "https://api.githubcopilot.com")
+
+
+def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]:
+    """Convert MCP tool definitions to Anthropic tool format."""
+    anthropic_tools = []
+    for tool in tools:
+        schema = tool.inputSchema if hasattr(tool, "inputSchema") else {}
+        anthropic_tools.append({
+            "name": tool.name,
+            "description": getattr(tool, "description", tool.name),
+            "input_schema": schema or {"type": "object", "properties": {}},
+        })
+    return anthropic_tools
+
+
+def _call_tool_result_to_text(result: Any) -> str:
+    """Extract text from an MCP CallToolResult."""
+    content = getattr(result, "content", [])
+    parts = []
+    for c in content:
+        text = getattr(c, "text", None)
+        if text:
+            parts.append(text)
+    return "\n".join(parts) if parts else str(result)
+
+
+@dataclass
+class _AnthropicHandle:
+    """Holds the Anthropic client and conversation state."""
+    client: Any
+    system_prompt: str
+    model: str
+    max_tokens: int
+    tools: list[dict[str, Any]]
+    mcp_server_map: dict[str, Any]  # tool_name -> MCP server handle
+    model_settings: dict[str, Any] = field(default_factory=dict)
+    stream_thinking: bool = False
+
+
+class AnthropicSDKBackend:
+    """Adapter that drives the Anthropic Python SDK."""
+
+    name = "anthropic_sdk"
+
+    def validate(self, spec: AgentSpec) -> None:
+        if spec.handoffs or spec.in_handoff_graph:
+            raise BackendCapabilityError(
+                "anthropic_sdk: agent handoffs are not supported"
+            )
+        if not spec.model:
+            raise BackendBadRequestError("anthropic_sdk: model is required")
+
+    async def build(
+        self,
+        spec: AgentSpec,
+        *,
+        run_hooks: Any = None,
+        agent_hooks: Any = None,
+    ) -> _AnthropicHandle:
+        del run_hooks, agent_hooks
+
+        import anthropic
+
+        token = _resolve_token(spec.token_env)
+        endpoint = spec.endpoint or _resolve_endpoint()
+
+        client = anthropic.AsyncAnthropic(
+            api_key="placeholder",
+            base_url=endpoint,
+            default_headers={
+                "Authorization": f"Bearer {token}",
+                "Copilot-Integration-Id": os.getenv(
+                    "COPILOT_INTEGRATION_ID", "vscode-chat"
+                ),
+            },
+        )
+
+        # Collect tools from MCP servers
+        all_tools: list[dict[str, Any]] = []
+        mcp_server_map: dict[str, Any] = {}
+
+        for mcp_spec in spec.mcp_servers:
+            native_server = mcp_spec.params.get("_native")
+            if native_server is None:
+                continue
+            try:
+                mcp_tools = await native_server.list_tools()
+                anthropic_tools = _mcp_tools_to_anthropic(mcp_tools)
+                all_tools.extend(anthropic_tools)
+                for tool in mcp_tools:
+                    mcp_server_map[tool.name] = native_server
+            except Exception:
+                logger.exception("Failed to list tools from MCP server %s", mcp_spec.name)
+
+        # Resolve max_tokens from model_settings or default
+        max_tokens = spec.model_settings.get("max_tokens", 16384)
+        stream_thinking = spec.model_settings.get("stream_thinking", False)
+
+        return _AnthropicHandle(
+            client=client,
+            system_prompt=spec.instructions or "",
+            model=spec.model,
+            max_tokens=max_tokens,
+            tools=all_tools,
+            mcp_server_map=mcp_server_map,
+            model_settings=spec.model_settings,
+            stream_thinking=stream_thinking,
+        )
+
+    async def run_streamed(
+        self,
+        agent: Any,
+        prompt: str,
+        *,
+        max_turns: int,
+    ) -> AsyncIterator[StreamEvent]:
+        handle: _AnthropicHandle = agent
+        messages: list[dict[str, Any]] = [
+            {"role": "user", "content": prompt},
+        ]
+
+        # Build optional params
+        create_kwargs: dict[str, Any] = {}
+        reasoning = handle.model_settings.get("reasoning")
+        if isinstance(reasoning, dict):
+            effort = reasoning.get("effort")
+            if effort:
+                create_kwargs["thinking"] = {"type": "adaptive"}
+                create_kwargs["output_config"] = {"effort": effort}
+
+        import anthropic
+
+        for turn in range(max_turns):
+            try:
+                async with handle.client.messages.stream(
+                    model=handle.model,
+                    max_tokens=handle.max_tokens,
+                    system=handle.system_prompt,
+                    messages=messages,
+                    tools=handle.tools or anthropic.NOT_GIVEN,
+                    **create_kwargs,
+                ) as stream:
+                    async for event in stream:
+                        if hasattr(event, "type"):
+                            if event.type == "content_block_delta":
+                                delta = event.delta
+                                if hasattr(delta, "text"):
+                                    yield TextDelta(text=delta.text)
+                                elif hasattr(delta, "thinking") and handle.stream_thinking:
+                                    yield TextDelta(text=delta.thinking)
+
+                    response = await stream.get_final_message()
+
+            except anthropic.RateLimitError as exc:
+                raise BackendRateLimitError(str(exc)) from exc
+            except anthropic.APITimeoutError as exc:
+                raise BackendTimeoutError(str(exc)) from exc
+            except anthropic.BadRequestError as exc:
+                raise BackendBadRequestError(str(exc)) from exc
+            except anthropic.APIError as exc:
+                raise BackendUnexpectedError(str(exc)) from exc
+
+            if response.stop_reason == "end_turn":
+                return
+            if response.stop_reason != "tool_use":
+                return
+
+            # Process tool calls
+            tool_use_blocks = [
+                b for b in response.content if b.type == "tool_use"
+            ]
+            if not tool_use_blocks:
+                return
+
+            # Add assistant message with all content blocks
+            messages.append({"role": "assistant", "content": response.content})
+
+            # Execute each tool call and collect results
+            tool_results: list[dict[str, Any]] = []
+            for tool_block in tool_use_blocks:
+                tool_name = tool_block.name
+                tool_input = tool_block.input
+
+                server = handle.mcp_server_map.get(tool_name)
+                if server is None:
+                    logger.warning("Tool %s not found in MCP servers", tool_name)
+                    tool_results.append({
+                        "type": "tool_result",
+                        "tool_use_id": tool_block.id,
+                        "content": f"Error: tool '{tool_name}' not found",
+                        "is_error": True,
+                    })
+                    yield ToolEnd(tool_name=tool_name, text=f"Error: tool '{tool_name}' not found")
+                    continue
+
+                try:
+                    result = await server.call_tool(
+                        tool_name,
+                        arguments=tool_input if isinstance(tool_input, dict) else {},
+                    )
+                    result_text = _call_tool_result_to_text(result)
+                    tool_results.append({
+                        "type": "tool_result",
+                        "tool_use_id": tool_block.id,
+                        "content": result_text,
+                    })
+                    yield ToolEnd(tool_name=tool_name, text=result_text)
+                except Exception as exc:
+                    logger.exception("Tool call %s failed", tool_name)
+                    error_text = f"Error calling {tool_name}: {exc}"
+                    tool_results.append({
+                        "type": "tool_result",
+                        "tool_use_id": tool_block.id,
+                        "content": error_text,
+                        "is_error": True,
+                    })
+                    yield ToolEnd(tool_name=tool_name, text=error_text)
+
+            messages.append({"role": "user", "content": tool_results})
+
+        raise BackendMaxTurnsError(f"Exceeded max_turns ({max_turns})")
+
+    async def aclose(self, agent: Any) -> None:
+        handle: _AnthropicHandle = agent
+        if handle is not None and handle.client is not None:
+            await handle.client.close()

From 9ad4c4ff12f069489c1311e1e06e11437224800c Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 11:38:51 -0400
Subject: [PATCH 02/23] fix: address PR review feedback

- Remove unused json import (lint/CodeQL)
- Validate reasoning.effort against allowed values upfront
- Pass through temperature/top_p to Anthropic API
- Add exclude_from_context support (return after the first round of
  tool results instead of feeding them back to the model)
- Thread exclude_from_context into _AnthropicHandle

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../sdk/anthropic_sdk/backend.py              | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 9d975fc3..60dcdadb 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -17,7 +17,6 @@
 
 __all__ = ["AnthropicSDKBackend"]
 
-import json
 import logging
 import os
 from collections.abc import AsyncIterator
@@ -93,6 +92,7 @@ class _AnthropicHandle:
     mcp_server_map: dict[str, Any]  # tool_name -> MCP server handle
     model_settings: dict[str, Any] = field(default_factory=dict)
     stream_thinking: bool = False
+    exclude_from_context: bool = False
 
 
 class AnthropicSDKBackend:
@@ -163,6 +163,7 @@ async def build(
             mcp_server_map=mcp_server_map,
             model_settings=spec.model_settings,
             stream_thinking=stream_thinking,
+            exclude_from_context=spec.exclude_from_context,
         )
 
     async def run_streamed(
@@ -179,10 +180,24 @@ async def run_streamed(
 
         # Build optional params
         create_kwargs: dict[str, Any] = {}
+
+        # Pass through temperature/top_p if set
+        temperature = handle.model_settings.get("temperature")
+        if temperature is not None:
+            create_kwargs["temperature"] = float(temperature)
+        top_p = handle.model_settings.get("top_p")
+        if top_p is not None:
+            create_kwargs["top_p"] = float(top_p)
+
         reasoning = handle.model_settings.get("reasoning")
         if isinstance(reasoning, dict):
             effort = reasoning.get("effort")
             if effort:
+                if effort not in _VALID_REASONING:
+                    raise BackendBadRequestError(
+                        f"anthropic_sdk: invalid reasoning effort {effort!r} "
+                        f"(expected one of {_VALID_REASONING})"
+                    )
                 create_kwargs["thinking"] = {"type": "adaptive"}
                 create_kwargs["output_config"] = {"effort": effort}
 
@@ -274,6 +289,12 @@ async def run_streamed(
                     })
                     yield ToolEnd(tool_name=tool_name, text=error_text)
 
+            # exclude_from_context: stop after tool results are emitted
+            # so they are available to the runner but not fed back into
+            # the model context (matches copilot_sdk behavior).
+            if handle.exclude_from_context:
+                return
+
             messages.append({"role": "user", "content": tool_results})
 
         raise BackendMaxTurnsError(f"Exceeded max_turns ({max_turns})")

From ed4412f0ec7937a4ee10e8f59471cd091513b139 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 11:41:36 -0400
Subject: [PATCH 03/23] fix: handle None tool descriptions in Anthropic tool
 conversion

MCP tools can have description=None; the Anthropic API requires a
valid string. Fall back to tool name when description is None.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 60dcdadb..37c31a49 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -62,9 +62,10 @@ def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]:
     anthropic_tools = []
     for tool in tools:
         schema = tool.inputSchema if hasattr(tool, "inputSchema") else {}
+        description = getattr(tool, "description", None) or tool.name
         anthropic_tools.append({
             "name": tool.name,
-            "description": getattr(tool, "description", tool.name),
+            "description": description,
             "input_schema": schema or {"type": "object", "properties": {}},
         })
     return anthropic_tools

From a8bf3b85af95df54cbb0390e0ab783fc562efc03 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 11:56:11 -0400
Subject: [PATCH 04/23] fix: CI failures, add unit tests, update docs

- Update doc examples to use claude-opus-4.7 and show api_type: messages
- Add tests/test_sdk_anthropic_adapter.py (18 tests covering validate,
  tool conversion, token resolution, tool result parsing)
- Fix test_runner.py: update _resolve_task_model unpacking to 6-tuple
- Fix test_sdk_base.py: update backend resolution tests to match new
  behavior (endpoint no longer auto-selects copilot_sdk)
- Add test for explicit anthropic_sdk backend selection

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md                           |   3 +-
 doc/GRAMMAR.md                      |   5 +-
 tests/test_runner.py                |  14 +--
 tests/test_sdk_anthropic_adapter.py | 174 ++++++++++++++++++++++++++++
 tests/test_sdk_base.py              |  19 ++-
 5 files changed, 195 insertions(+), 20 deletions(-)
 create mode 100644 tests/test_sdk_anthropic_adapter.py

diff --git a/README.md b/README.md
index 75c588a7..21f39d15 100644
--- a/README.md
+++ b/README.md
@@ -116,10 +116,11 @@ seclab-taskflow-agent:
   version: "1.0"
   filetype: model_config
 models:
-  code_analysis: claude-mythos-5
+  code_analysis: claude-opus-4.7
   general_tasks: gpt-5.4-mini
 model_settings:
   code_analysis:
+    api_type: messages
     backend: anthropic_sdk
     reasoning:
       effort: high
diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md
index 67c57497..928bf8b1 100644
--- a/doc/GRAMMAR.md
+++ b/doc/GRAMMAR.md
@@ -524,7 +524,7 @@ api_type: chat_completions        # default for all models
 models:
   gpt_default: gpt-4.1
   gpt_responses: gpt-5.1
-  claude_native: claude-mythos-5
+  claude_native: claude-opus-4.7
 model_settings:
   gpt_default:
     temperature: 0.7
@@ -534,7 +534,8 @@ model_settings:
     token: CAPI_TOKEN             # env var name containing the API key
     temperature: 0.5
   claude_native:
-    backend: anthropic_sdk        # use the Anthropic Messages API
+    api_type: messages            # use the Anthropic Messages API
+    backend: anthropic_sdk
     reasoning:
       effort: high
 ```
diff --git a/tests/test_runner.py b/tests/test_runner.py
index a7713953..c34d803a 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -187,7 +187,7 @@ class TestResolveTaskModel:
 
     def test_logical_name_mapped_to_provider_id(self):
         """A logical model name is resolved to the provider model ID."""
-        model_id, _, _, _, _ = _resolve_task_model(
+        model_id, _, _, _, _, _ = _resolve_task_model(
             TaskDefinition(model="fast"),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
@@ -197,7 +197,7 @@ def test_logical_name_mapped_to_provider_id(self):
 
     def test_model_settings_from_config(self):
         """Settings from models_params are included in the result."""
-        _, settings, _, _, _ = _resolve_task_model(
+        _, settings, _, _, _, _ = _resolve_task_model(
             TaskDefinition(model="fast"),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
@@ -208,7 +208,7 @@ def test_model_settings_from_config(self):
 
     def test_task_level_settings_override_config(self):
         """Task-level model_settings override config-level settings."""
-        _, settings, _, _, _ = _resolve_task_model(
+        _, settings, _, _, _, _ = _resolve_task_model(
             TaskDefinition(model="fast", model_settings={"temperature": 0.2}),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
@@ -219,7 +219,7 @@ def test_task_level_settings_override_config(self):
 
     def test_engine_keys_extracted(self):
         """Engine keys (api_type, endpoint, token) are popped from settings."""
-        _, settings, api_type, endpoint, token = _resolve_task_model(
+        _, settings, api_type, endpoint, token, _ = _resolve_task_model(
             TaskDefinition(model="fast"),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
@@ -244,7 +244,7 @@ def test_default_model_when_empty(self):
         """Empty model string falls back to DEFAULT_MODEL."""
         from seclab_taskflow_agent.agent import DEFAULT_MODEL
 
-        model_id, _, _, _, _ = _resolve_task_model(
+        model_id, _, _, _, _, _ = _resolve_task_model(
             TaskDefinition(model=""),
             model_keys=[],
             model_dict={},
@@ -254,7 +254,7 @@ def test_default_model_when_empty(self):
 
     def test_model_not_in_keys_passes_through(self):
         """A model name not in model_keys passes through as-is."""
-        model_id, _, _, _, _ = _resolve_task_model(
+        model_id, _, _, _, _, _ = _resolve_task_model(
             TaskDefinition(model="claude-3-opus"),
             model_keys=["fast", "smart"],
             model_dict={"fast": "gpt-4o-mini", "smart": "gpt-4o"},
@@ -264,7 +264,7 @@ def test_model_not_in_keys_passes_through(self):
 
     def test_task_engine_keys_override_config(self):
         """Task-level model_settings can override engine keys from config."""
-        _, _, api_type, endpoint, token = _resolve_task_model(
+        _, _, api_type, endpoint, token, _ = _resolve_task_model(
             TaskDefinition(
                 model="fast",
                 model_settings={"api_type": "responses", "endpoint": "https://task.api"},
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
new file mode 100644
index 00000000..951f7a04
--- /dev/null
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -0,0 +1,174 @@
+# SPDX-FileCopyrightText: GitHub, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Tests for the Anthropic SDK adapter."""
+
+from __future__ import annotations
+
+import pytest
+
+from seclab_taskflow_agent.sdk import get_backend
+from seclab_taskflow_agent.sdk.base import AgentSpec, MCPServerSpec, TextDelta, ToolEnd
+from seclab_taskflow_agent.sdk.anthropic_sdk.backend import (
+    AnthropicSDKBackend,
+    _mcp_tools_to_anthropic,
+    _call_tool_result_to_text,
+    _resolve_token,
+    _VALID_REASONING,
+)
+from seclab_taskflow_agent.sdk.errors import (
+    BackendBadRequestError,
+    BackendCapabilityError,
+)
+
+
+def _spec(**overrides) -> AgentSpec:
+    base = {
+        "name": "a",
+        "instructions": "You are a test agent.",
+        "model": "claude-opus-4.7",
+    }
+    base.update(overrides)
+    return AgentSpec(**base)
+
+
+# -- Backend registration --
+
+
+def test_get_backend_returns_anthropic_sdk_instance():
+    backend = get_backend("anthropic_sdk")
+    assert isinstance(backend, AnthropicSDKBackend)
+    assert backend.name == "anthropic_sdk"
+
+
+# -- validate() --
+
+
+def test_validate_accepts_minimal_spec():
+    AnthropicSDKBackend().validate(_spec())
+
+
+def test_validate_rejects_handoffs():
+    backend = AnthropicSDKBackend()
+    with pytest.raises(BackendCapabilityError, match="handoffs"):
+        backend.validate(_spec(handoffs=[_spec(name="b")]))
+
+
+def test_validate_rejects_handoff_graph():
+    backend = AnthropicSDKBackend()
+    with pytest.raises(BackendCapabilityError, match="handoffs"):
+        backend.validate(_spec(in_handoff_graph=True))
+
+
+def test_validate_rejects_empty_model():
+    backend = AnthropicSDKBackend()
+    with pytest.raises(BackendBadRequestError, match="model is required"):
+        backend.validate(_spec(model=""))
+
+
+def test_validate_accepts_exclude_from_context():
+    AnthropicSDKBackend().validate(_spec(exclude_from_context=True))
+
+
+# -- _mcp_tools_to_anthropic() --
+
+
+class _FakeTool:
+    def __init__(self, name, description=None, inputSchema=None):
+        self.name = name
+        self.description = description
+        self.inputSchema = inputSchema
+
+
+def test_mcp_tools_to_anthropic_basic():
+    tools = [_FakeTool("read_file", "Read a file", {"type": "object", "properties": {"path": {"type": "string"}}})]
+    result = _mcp_tools_to_anthropic(tools)
+    assert len(result) == 1
+    assert result[0]["name"] == "read_file"
+    assert result[0]["description"] == "Read a file"
+    assert result[0]["input_schema"]["properties"]["path"]["type"] == "string"
+
+
+def test_mcp_tools_to_anthropic_none_description():
+    """Tools with None description should fall back to tool name."""
+    tools = [_FakeTool("my_tool", description=None)]
+    result = _mcp_tools_to_anthropic(tools)
+    assert result[0]["description"] == "my_tool"
+
+
+def test_mcp_tools_to_anthropic_empty_description():
+    """Tools with empty string description should fall back to tool name."""
+    tools = [_FakeTool("my_tool", description="")]
+    result = _mcp_tools_to_anthropic(tools)
+    assert result[0]["description"] == "my_tool"
+
+
+def test_mcp_tools_to_anthropic_no_schema():
+    """Tools without inputSchema should get a default empty object schema."""
+    tools = [_FakeTool("my_tool", "desc")]
+    result = _mcp_tools_to_anthropic(tools)
+    assert result[0]["input_schema"] == {"type": "object", "properties": {}}
+
+
+def test_mcp_tools_to_anthropic_none_schema():
+    """Tools with None inputSchema should get a default empty object schema."""
+    tools = [_FakeTool("my_tool", "desc", inputSchema=None)]
+    result = _mcp_tools_to_anthropic(tools)
+    assert result[0]["input_schema"] == {"type": "object", "properties": {}}
+
+
+# -- _call_tool_result_to_text() --
+
+
+class _FakeContent:
+    def __init__(self, text):
+        self.text = text
+
+
+class _FakeResult:
+    def __init__(self, contents):
+        self.content = contents
+
+
+def test_call_tool_result_to_text_single():
+    result = type("R", (), {"content": [_FakeContent("hello")]})()
+    assert _call_tool_result_to_text(result) == "hello"
+
+
+def test_call_tool_result_to_text_multiple():
+    result = type("R", (), {"content": [_FakeContent("a"), _FakeContent("b")]})()
+    assert _call_tool_result_to_text(result) == "a\nb"
+
+
+def test_call_tool_result_to_text_empty():
+    result = type("R", (), {"content": []})()
+    text = _call_tool_result_to_text(result)
+    assert isinstance(text, str)
+
+
+# -- _resolve_token() --
+
+
+def test_resolve_token_from_env(monkeypatch):
+    monkeypatch.setenv("MY_TOKEN", "secret123")
+    assert _resolve_token("MY_TOKEN") == "secret123"
+
+
+def test_resolve_token_fallback_to_ai_api_token(monkeypatch):
+    monkeypatch.setenv("AI_API_TOKEN", "fallback_token")
+    monkeypatch.delenv("MISSING_VAR", raising=False)
+    assert _resolve_token("MISSING_VAR") == "fallback_token"
+
+
+def test_resolve_token_raises_when_missing(monkeypatch):
+    monkeypatch.delenv("AI_API_TOKEN", raising=False)
+    monkeypatch.delenv("MISSING_VAR", raising=False)
+    with pytest.raises(BackendBadRequestError, match="no API token"):
+        _resolve_token("MISSING_VAR")
+
+
+# -- reasoning validation --
+
+
+def test_valid_reasoning_values():
+    assert _VALID_REASONING == ("low", "medium", "high", "max")
diff --git a/tests/test_sdk_base.py b/tests/test_sdk_base.py
index 54dd17e5..f2fb1c38 100644
--- a/tests/test_sdk_base.py
+++ b/tests/test_sdk_base.py
@@ -38,24 +38,23 @@ def test_resolve_backend_default_is_openai_agents(monkeypatch):
     assert sdk.resolve_backend_name() == "openai_agents"
 
 
-def test_resolve_backend_copilot_endpoint_prefers_copilot_when_installed(monkeypatch):
+def test_resolve_backend_copilot_endpoint_does_not_auto_select(monkeypatch):
+    """Backend selection is always explicit -- endpoint URL is not used."""
     monkeypatch.delenv("SECLAB_TASKFLOW_BACKEND", raising=False)
-    pytest.importorskip("copilot")
     assert (
         sdk.resolve_backend_name(endpoint="https://api.githubcopilot.com")
-        == "copilot_sdk"
+        == "openai_agents"
     )
 
 
-def test_resolve_backend_copilot_endpoint_falls_back_when_missing(monkeypatch):
+def test_resolve_backend_explicit_overrides_endpoint(monkeypatch):
     monkeypatch.delenv("SECLAB_TASKFLOW_BACKEND", raising=False)
-    # Force the optional import to fail by stashing a sentinel in sys.modules.
-    import sys
-
-    monkeypatch.setitem(sys.modules, "copilot", None)
     assert (
-        sdk.resolve_backend_name(endpoint="https://api.githubcopilot.com")
-        == "openai_agents"
+        sdk.resolve_backend_name(
+            explicit="anthropic_sdk",
+            endpoint="https://api.githubcopilot.com",
+        )
+        == "anthropic_sdk"
     )
 
 

From b6f00571b74041cc6ceeeca470f7cc6816c86597 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 12:01:50 -0400
Subject: [PATCH 05/23] fix: lint errors in test file (unused imports, N803
 camelCase)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_sdk_anthropic_adapter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index 951f7a04..bc347df0 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -8,7 +8,7 @@
 import pytest
 
 from seclab_taskflow_agent.sdk import get_backend
-from seclab_taskflow_agent.sdk.base import AgentSpec, MCPServerSpec, TextDelta, ToolEnd
+from seclab_taskflow_agent.sdk.base import AgentSpec
 from seclab_taskflow_agent.sdk.anthropic_sdk.backend import (
     AnthropicSDKBackend,
     _mcp_tools_to_anthropic,
@@ -74,10 +74,10 @@ def test_validate_accepts_exclude_from_context():
 
 
 class _FakeTool:
-    def __init__(self, name, description=None, inputSchema=None):
+    def __init__(self, name, description=None, input_schema=None):  # noqa: N803
         self.name = name
         self.description = description
-        self.inputSchema = inputSchema
+        self.inputSchema = input_schema
 
 
 def test_mcp_tools_to_anthropic_basic():
@@ -112,7 +112,7 @@ def test_mcp_tools_to_anthropic_no_schema():
 
 def test_mcp_tools_to_anthropic_none_schema():
     """Tools with None inputSchema should get a default empty object schema."""
-    tools = [_FakeTool("my_tool", "desc", inputSchema=None)]
+    tools = [_FakeTool("my_tool", "desc", input_schema=None)]
     result = _mcp_tools_to_anthropic(tools)
     assert result[0]["input_schema"] == {"type": "object", "properties": {}}
 

From 266c54b6fc70bd4ea8757fd0c7830cee72d7e2c6 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 12:33:44 -0400
Subject: [PATCH 06/23] test: add backend extraction coverage to
 _resolve_task_model tests

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_runner.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_runner.py b/tests/test_runner.py
index c34d803a..5cb1c26a 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -218,8 +218,8 @@ def test_task_level_settings_override_config(self):
         assert settings["max_tokens"] == 100
 
     def test_engine_keys_extracted(self):
-        """Engine keys (api_type, endpoint, token) are popped from settings."""
-        _, settings, api_type, endpoint, token, _ = _resolve_task_model(
+        """Engine keys (api_type, endpoint, token, backend) are popped from settings."""
+        _, settings, api_type, endpoint, token, backend = _resolve_task_model(
             TaskDefinition(model="fast"),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
@@ -228,6 +228,7 @@ def test_engine_keys_extracted(self):
                     "api_type": "responses",
                     "endpoint": "https://custom.api",
                     "token": "secret",
+                    "backend": "anthropic_sdk",
                     "temperature": 0.5,
                 }
             },
@@ -235,9 +236,11 @@ def test_engine_keys_extracted(self):
         assert api_type == "responses"
         assert endpoint == "https://custom.api"
         assert token == "secret"  # noqa: S105
+        assert backend == "anthropic_sdk"
         assert "api_type" not in settings
         assert "endpoint" not in settings
         assert "token" not in settings
+        assert "backend" not in settings
         assert settings["temperature"] == 0.5
 
     def test_default_model_when_empty(self):
@@ -264,17 +267,18 @@ def test_model_not_in_keys_passes_through(self):
 
     def test_task_engine_keys_override_config(self):
         """Task-level model_settings can override engine keys from config."""
-        _, _, api_type, endpoint, token, _ = _resolve_task_model(
+        _, _, api_type, endpoint, token, backend = _resolve_task_model(
             TaskDefinition(
                 model="fast",
-                model_settings={"api_type": "responses", "endpoint": "https://task.api"},
+                model_settings={"api_type": "responses", "endpoint": "https://task.api", "backend": "anthropic_sdk"},
             ),
             model_keys=["fast"],
             model_dict={"fast": "gpt-4o-mini"},
-            models_params={"fast": {"api_type": "chat_completions"}},
+            models_params={"fast": {"api_type": "chat_completions", "backend": "openai_agents"}},
         )
         assert api_type == "responses"
         assert endpoint == "https://task.api"
+        assert backend == "anthropic_sdk"
 
 
 # ===================================================================

From 44172790802c338dfb52280bee8bc89dab9b4860 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 15:01:17 -0400
Subject: [PATCH 07/23] fix: pass real token as api_key instead of placeholder

Allows the backend to work with both CAPI (Authorization: Bearer)
and direct Anthropic endpoints (x-api-key) without code changes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 37c31a49..561fea2a 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -124,7 +124,7 @@ async def build(
         endpoint = spec.endpoint or _resolve_endpoint()
 
         client = anthropic.AsyncAnthropic(
-            api_key="placeholder",
+            api_key=token,
             base_url=endpoint,
             default_headers={
                 "Authorization": f"Bearer {token}",

From e16b20acb837566ff1629773f4f7c3df708835fe Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 15:15:22 -0400
Subject: [PATCH 08/23] fix: implement blocked_tools filtering in anthropic
 backend

Access the MCP session directly to get the raw tool list, bypassing
the openai-agents tool_filter which requires run_context/agent args
not available outside its run loop. Apply blocked_tools filtering
and namespace prefixing in our own code.

Tested: blocked tool correctly hidden from model, unblocked tools
work normally.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../sdk/anthropic_sdk/backend.py              | 37 ++++++++++++++++---
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 561fea2a..0b35a3bd 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -134,20 +134,47 @@ async def build(
             },
         )
 
-        # Collect tools from MCP servers
+        # Collect tools from MCP servers and apply blocked_tools filter.
+        # We filter tools ourselves rather than relying on the openai-agents
+        # SDK's tool_filter, which requires run_context/agent args that
+        # aren't available outside the openai-agents run loop.
         all_tools: list[dict[str, Any]] = []
         mcp_server_map: dict[str, Any] = {}
+        blocked = set(spec.blocked_tools or [])
 
         for mcp_spec in spec.mcp_servers:
             native_server = mcp_spec.params.get("_native")
             if native_server is None:
                 continue
             try:
-                mcp_tools = await native_server.list_tools()
-                anthropic_tools = _mcp_tools_to_anthropic(mcp_tools)
-                all_tools.extend(anthropic_tools)
+                # Access the underlying MCP session to get the raw tool
+                # list, bypassing the openai-agents tool_filter that
+                # requires run_context/agent we don't have.
+                raw_server = getattr(native_server, "_obj", native_server)
+                session = getattr(raw_server, "session", None)
+                if session is not None:
+                    result = await session.list_tools()
+                    raw_tools = result.tools
+                else:
+                    raw_tools = await native_server.list_tools()
+
+                # Apply namespace prefix (NamespacedMCPServer convention)
+                ns = getattr(native_server, "namespace", "")
+                mcp_tools = []
+                for tool in raw_tools:
+                    if hasattr(tool, "copy"):
+                        tool = tool.copy()
+                    if ns:
+                        tool.name = f"{ns}{tool.name}"
+                    mcp_tools.append(tool)
+
                 for tool in mcp_tools:
-                    mcp_server_map[tool.name] = native_server
+                    if tool.name not in blocked:
+                        mcp_server_map[tool.name] = native_server
+                anthropic_tools = _mcp_tools_to_anthropic(
+                    [t for t in mcp_tools if t.name not in blocked]
+                )
+                all_tools.extend(anthropic_tools)
             except Exception:
                 logger.exception("Failed to list tools from MCP server %s", mcp_spec.name)
 

From ccbf7c543b2f74d0089979863e9daafd57c367be Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 15:22:44 -0400
Subject: [PATCH 09/23] fix: address PR review feedback (round 2)

- Use placeholder api_key only for CAPI endpoints; pass real token
  for direct Anthropic endpoints (avoids leaking token via x-api-key
  to CAPI while preserving native Anthropic auth)
- Replace implicit else with explicit elif + else-error in get_backend
  (Kevin's review)
- Add a test for reasoning effort validation (for now it only checks
  the _VALID_REASONING constant, not the runtime error path)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/__init__.py     |  4 +++-
 .../sdk/anthropic_sdk/backend.py              | 21 ++++++++++++-------
 tests/test_sdk_anthropic_adapter.py           | 19 +++++++++++++++++
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py
index 972afa90..0a413505 100644
--- a/src/seclab_taskflow_agent/sdk/__init__.py
+++ b/src/seclab_taskflow_agent/sdk/__init__.py
@@ -50,10 +50,12 @@ def get_backend(name: str) -> AgentBackend:
             from .copilot_sdk.backend import CopilotSDKBackend
 
             _BACKENDS[name] = CopilotSDKBackend()
-        else:
+        elif name == "anthropic_sdk":
             from .anthropic_sdk.backend import AnthropicSDKBackend
 
             _BACKENDS[name] = AnthropicSDKBackend()
+        else:
+            raise ValueError(f"No backend implementation for {name!r}")
     return _BACKENDS[name]
 
 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 0b35a3bd..56c32477 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -123,15 +123,22 @@ async def build(
         token = _resolve_token(spec.token_env)
         endpoint = spec.endpoint or _resolve_endpoint()
 
+        # CAPI expects Authorization: Bearer, not x-api-key. Use a
+        # placeholder api_key so the SDK doesn't send the real token
+        # via x-api-key as well. For direct Anthropic endpoints, pass
+        # the real token as api_key (the SDK's native auth).
+        is_capi = "githubcopilot.com" in endpoint
+        headers: dict[str, str] = {}
+        if is_capi:
+            headers["Authorization"] = f"Bearer {token}"
+            headers["Copilot-Integration-Id"] = os.getenv(
+                "COPILOT_INTEGRATION_ID", "vscode-chat"
+            )
+
         client = anthropic.AsyncAnthropic(
-            api_key=token,
+            api_key="placeholder" if is_capi else token,
             base_url=endpoint,
-            default_headers={
-                "Authorization": f"Bearer {token}",
-                "Copilot-Integration-Id": os.getenv(
-                    "COPILOT_INTEGRATION_ID", "vscode-chat"
-                ),
-            },
+            default_headers=headers or None,
         )
 
         # Collect tools from MCP servers and apply blocked_tools filter.
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index bc347df0..d5aa9b51 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -172,3 +172,22 @@ def test_resolve_token_raises_when_missing(monkeypatch):
 
 def test_valid_reasoning_values():
     assert _VALID_REASONING == ("low", "medium", "high", "max")
+
+
+# -- reasoning effort validation (runtime) --
+
+
+def test_invalid_reasoning_effort_raises():
+    """Invalid reasoning.effort should raise BackendBadRequestError at runtime."""
+    import asyncio
+
+    backend = AnthropicSDKBackend()
+    spec = _spec(model_settings={"reasoning": {"effort": "ultra"}})
+
+    # build() would need a real API client, but we can test the validation
+    # by checking the constant directly
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING
+    assert "ultra" not in _VALID_REASONING
+    assert "high" in _VALID_REASONING
+    assert "low" in _VALID_REASONING
+    assert "max" in _VALID_REASONING

From 0c521e040d4837124c79bb8595e0fd65ee42738b Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 15:26:55 -0400
Subject: [PATCH 10/23] fix: lint errors + URL substring sanitization (CodeQL)

- Remove unused asyncio import, backend/spec variables from test
- Use urlparse().hostname for CAPI endpoint detection instead of
  substring match (CodeQL incomplete URL sanitization finding)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../sdk/anthropic_sdk/backend.py                      |  6 +++++-
 tests/test_sdk_anthropic_adapter.py                   | 11 ++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 56c32477..c343b032 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -127,7 +127,11 @@ async def build(
         # placeholder api_key so the SDK doesn't send the real token
         # via x-api-key as well. For direct Anthropic endpoints, pass
         # the real token as api_key (the SDK's native auth).
-        is_capi = "githubcopilot.com" in endpoint
+        from urllib.parse import urlparse
+        is_capi = urlparse(endpoint).hostname in (
+            "api.githubcopilot.com",
+            "models.github.ai",
+        )
         headers: dict[str, str] = {}
         if is_capi:
             headers["Authorization"] = f"Bearer {token}"
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index d5aa9b51..8587dbec 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -177,15 +177,8 @@ def test_valid_reasoning_values():
 # -- reasoning effort validation (runtime) --
 
 
-def test_invalid_reasoning_effort_raises():
-    """Invalid reasoning.effort should raise BackendBadRequestError at runtime."""
-    import asyncio
-
-    backend = AnthropicSDKBackend()
-    spec = _spec(model_settings={"reasoning": {"effort": "ultra"}})
-
-    # build() would need a real API client, but we can test the validation
-    # by checking the constant directly
+def test_invalid_reasoning_effort_not_in_valid():
+    """Invalid reasoning.effort values should not be in _VALID_REASONING."""
     from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING
     assert "ultra" not in _VALID_REASONING
     assert "high" in _VALID_REASONING

From b4da0a60ffb33f58b29566dce14330c4adf76cb4 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Thu, 11 Jun 2026 16:04:21 -0400
Subject: [PATCH 11/23] fix: address PR review feedback (round 3)

- Move CAPI endpoint detection to capi.py as is_capi_endpoint()
  (Kevin's review: keep provider logic centralized)
- Format test file with ruff (CI formatting check)
- Add a runtime validation test for invalid reasoning effort (asserts
  that run_streamed raises BackendBadRequestError, rather than only
  checking the constant)
- Tool calls are sequential by design (MCP tools may have ordering
  dependencies); updated the PR description so it no longer claims
  parallel tool execution

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/capi.py             |  9 +++++
 .../sdk/anthropic_sdk/backend.py              |  8 ++---
 tests/test_sdk_anthropic_adapter.py           | 34 ++++++++++++++++++-
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py
index a605258f..2ce61a3a 100644
--- a/src/seclab_taskflow_agent/capi.py
+++ b/src/seclab_taskflow_agent/capi.py
@@ -29,6 +29,7 @@
     "get_AI_endpoint",
     "get_AI_token",
     "get_provider",
+    "is_capi_endpoint",
     "list_capi_models",
     "list_tool_call_models",
     "supports_tool_calls",
@@ -142,6 +143,14 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool:
 
 _DEFAULT_PROVIDER = "api.githubcopilot.com"
 
+# Hostnames that use CAPI-style auth (Authorization: Bearer, not x-api-key).
+_CAPI_HOSTS = frozenset(_PROVIDERS.keys())
+
+
+def is_capi_endpoint(endpoint: str) -> bool:
+    """Return True if *endpoint* is a GitHub CAPI proxy (needs Bearer auth)."""
+    return urlparse(endpoint).hostname in _CAPI_HOSTS
+
 def get_provider(endpoint: str | None = None) -> APIProvider:
     """Return the ``APIProvider`` for the given (or configured) endpoint URL.
 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index c343b032..43f8ad2a 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -123,15 +123,13 @@ async def build(
         token = _resolve_token(spec.token_env)
         endpoint = spec.endpoint or _resolve_endpoint()
 
+        from ..capi import is_capi_endpoint
+
         # CAPI expects Authorization: Bearer, not x-api-key. Use a
         # placeholder api_key so the SDK doesn't send the real token
         # via x-api-key as well. For direct Anthropic endpoints, pass
         # the real token as api_key (the SDK's native auth).
-        from urllib.parse import urlparse
-        is_capi = urlparse(endpoint).hostname in (
-            "api.githubcopilot.com",
-            "models.github.ai",
-        )
+        is_capi = is_capi_endpoint(endpoint)
         headers: dict[str, str] = {}
         if is_capi:
             headers["Authorization"] = f"Bearer {token}"
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index 8587dbec..40132199 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -81,7 +81,13 @@ def __init__(self, name, description=None, input_schema=None):  # noqa: N803
 
 
 def test_mcp_tools_to_anthropic_basic():
-    tools = [_FakeTool("read_file", "Read a file", {"type": "object", "properties": {"path": {"type": "string"}}})]
+    tools = [
+        _FakeTool(
+            "read_file",
+            "Read a file",
+            {"type": "object", "properties": {"path": {"type": "string"}}},
+        )
+    ]
     result = _mcp_tools_to_anthropic(tools)
     assert len(result) == 1
     assert result[0]["name"] == "read_file"
@@ -180,7 +186,33 @@ def test_valid_reasoning_values():
 def test_invalid_reasoning_effort_not_in_valid():
     """Invalid reasoning.effort values should not be in _VALID_REASONING."""
     from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING
+
     assert "ultra" not in _VALID_REASONING
     assert "high" in _VALID_REASONING
     assert "low" in _VALID_REASONING
     assert "max" in _VALID_REASONING
+
+
+def test_invalid_reasoning_effort_raises_at_runtime():
+    """run_streamed raises BackendBadRequestError for invalid effort."""
+    import asyncio
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+
+    handle = _AnthropicHandle(
+        client=None,
+        system_prompt="",
+        model="test",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={"reasoning": {"effort": "ultra"}},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    with pytest.raises(BackendBadRequestError, match="invalid reasoning effort"):
+        asyncio.run(_run())

From 03ffdb9ac3822ae08619f8b7fcd57a95d630b0aa Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 10:16:13 -0400
Subject: [PATCH 12/23] fix: correct relative import for capi in anthropic_sdk
 backend

The backend.py module is 3 levels deep (sdk/anthropic_sdk/backend.py),
so the import needs '...capi' (3 dots) to reach the top-level
seclab_taskflow_agent.capi module, not '..capi' (2 dots) which only
reaches sdk/.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 43f8ad2a..23685b5e 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -123,7 +123,7 @@ async def build(
         token = _resolve_token(spec.token_env)
         endpoint = spec.endpoint or _resolve_endpoint()
 
-        from ..capi import is_capi_endpoint
+        from ...capi import is_capi_endpoint
 
         # CAPI expects Authorization: Bearer, not x-api-key. Use a
         # placeholder api_key so the SDK doesn't send the real token

From 2cb49b98fd3426df4184290ddc6516682aca7b76 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 11:05:38 -0400
Subject: [PATCH 13/23] refactor: use provider registry bearer_auth for
 anthropic backend auth

Replace is_capi_endpoint() with provider.bearer_auth from the existing
provider registry. Registered providers (CAPI, GitHub Models, OpenAI)
inherit the new APIProvider default of bearer_auth=True; the generic
provider returned for unknown/custom endpoints is built with
bearer_auth=False (native SDK auth via x-api-key).

Also replaces the duplicate _resolve_token/_resolve_endpoint helpers with
get_AI_token/get_AI_endpoint from capi.py, fixing COPILOT_TOKEN fallback.

The Copilot-Integration-Id header is now sourced from the provider's
extra_headers instead of being hardcoded in the backend.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/capi.py             | 15 ++---
 .../sdk/anthropic_sdk/backend.py              | 64 ++++++++-----------
 tests/test_sdk_anthropic_adapter.py           | 39 +++++++----
 3 files changed, 54 insertions(+), 64 deletions(-)

diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py
index 2ce61a3a..c897f4ed 100644
--- a/src/seclab_taskflow_agent/capi.py
+++ b/src/seclab_taskflow_agent/capi.py
@@ -29,7 +29,6 @@
     "get_AI_endpoint",
     "get_AI_token",
     "get_provider",
-    "is_capi_endpoint",
     "list_capi_models",
     "list_tool_call_models",
     "supports_tool_calls",
@@ -51,6 +50,7 @@ class APIProvider:
     models_catalog: str = "/models"
     default_model: str = "gpt-4.1"
     extra_headers: Mapping[str, str] = field(default_factory=dict)
+    bearer_auth: bool = True  # Use Authorization: Bearer (not x-api-key)
 
     def __post_init__(self) -> None:
         # Ensure base_url ends with / so httpx URL.join() preserves the path
@@ -143,14 +143,6 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool:
 
 _DEFAULT_PROVIDER = "api.githubcopilot.com"
 
-# Hostnames that use CAPI-style auth (Authorization: Bearer, not x-api-key).
-_CAPI_HOSTS = frozenset(_PROVIDERS.keys())
-
-
-def is_capi_endpoint(endpoint: str) -> bool:
-    """Return True if *endpoint* is a GitHub CAPI proxy (needs Bearer auth)."""
-    return urlparse(endpoint).hostname in _CAPI_HOSTS
-
 def get_provider(endpoint: str | None = None) -> APIProvider:
     """Return the ``APIProvider`` for the given (or configured) endpoint URL.
 
@@ -181,8 +173,9 @@ def get_provider(endpoint: str | None = None) -> APIProvider:
         if upstream:
             return dataclasses.replace(upstream, base_url=url)
 
-    # Unknown endpoint — return a generic provider with the given base URL
-    return APIProvider(name="custom", base_url=url, default_model="please-set-default-model-via-env")
+    # Unknown endpoint — return a generic provider using native SDK auth.
+    return APIProvider(name="custom", base_url=url, bearer_auth=False,
+                       default_model="please-set-default-model-via-env")
 
 
 # ---------------------------------------------------------------------------
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 23685b5e..315f6979 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -7,10 +7,11 @@
 ``anthropic`` Python SDK. Supports streaming, tool calling via MCP
 servers, and extended thinking.
 
-Auth note: The Anthropic SDK sends ``x-api-key`` by default, but CAPI
-expects ``Authorization: Bearer``. We pass the bearer header via
-``default_headers`` and set ``api_key`` to a placeholder so the SDK
-doesn't complain about a missing key.
+Auth note: The Anthropic SDK sends ``x-api-key`` by default, but
+providers that use Bearer auth (see ``APIProvider.bearer_auth``)
+need ``Authorization: Bearer`` instead.  We pass the bearer header
+via ``default_headers`` and set ``api_key`` to a placeholder so the
+SDK doesn't send the real token via x-api-key.
 """
 
 from __future__ import annotations
@@ -38,25 +39,6 @@
 _VALID_REASONING = ("low", "medium", "high", "max")
 
 
-def _resolve_token(token_env: str | None) -> str:
-    """Resolve the API token from env var name or default AI_API_TOKEN."""
-    if token_env:
-        val = os.getenv(token_env)
-        if val:
-            return val
-    val = os.getenv("AI_API_TOKEN")
-    if val:
-        return val
-    raise BackendBadRequestError(
-        "anthropic_sdk: no API token found (set AI_API_TOKEN or per-model token env)"
-    )
-
-
-def _resolve_endpoint() -> str:
-    """Resolve the API base URL."""
-    return os.getenv("AI_API_ENDPOINT", "https://api.githubcopilot.com")
-
-
 def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]:
     """Convert MCP tool definitions to Anthropic tool format."""
     anthropic_tools = []
@@ -120,25 +102,29 @@ async def build(
 
         import anthropic
 
-        token = _resolve_token(spec.token_env)
-        endpoint = spec.endpoint or _resolve_endpoint()
-
-        from ...capi import is_capi_endpoint
-
-        # CAPI expects Authorization: Bearer, not x-api-key. Use a
-        # placeholder api_key so the SDK doesn't send the real token
-        # via x-api-key as well. For direct Anthropic endpoints, pass
-        # the real token as api_key (the SDK's native auth).
-        is_capi = is_capi_endpoint(endpoint)
-        headers: dict[str, str] = {}
-        if is_capi:
+        from ...capi import get_AI_endpoint, get_AI_token, get_provider
+
+        # Resolve token: per-model env var override, then standard token chain
+        if spec.token_env:
+            token = os.getenv(spec.token_env, "")
+        else:
+            token = ""
+        if not token:
+            token = get_AI_token()
+
+        endpoint = spec.endpoint or get_AI_endpoint()
+        provider = get_provider(endpoint)
+
+        # Providers with bearer_auth=True need Authorization: Bearer instead
+        # of the Anthropic SDK's native x-api-key header. Use a placeholder
+        # api_key so the SDK doesn't also send the real token via x-api-key.
+        # Endpoints not in the provider registry default to native SDK auth.
+        headers: dict[str, str] = dict(provider.extra_headers)
+        if provider.bearer_auth:
             headers["Authorization"] = f"Bearer {token}"
-            headers["Copilot-Integration-Id"] = os.getenv(
-                "COPILOT_INTEGRATION_ID", "vscode-chat"
-            )
 
         client = anthropic.AsyncAnthropic(
-            api_key="placeholder" if is_capi else token,
+            api_key="placeholder" if provider.bearer_auth else token,
             base_url=endpoint,
             default_headers=headers or None,
         )
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index 40132199..496869be 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -13,7 +13,6 @@
     AnthropicSDKBackend,
     _mcp_tools_to_anthropic,
     _call_tool_result_to_text,
-    _resolve_token,
     _VALID_REASONING,
 )
 from seclab_taskflow_agent.sdk.errors import (
@@ -152,25 +151,37 @@ def test_call_tool_result_to_text_empty():
     assert isinstance(text, str)
 
 
-# -- _resolve_token() --
+# -- bearer_auth via provider registry --
 
 
-def test_resolve_token_from_env(monkeypatch):
-    monkeypatch.setenv("MY_TOKEN", "secret123")
-    assert _resolve_token("MY_TOKEN") == "secret123"
+def test_known_provider_uses_bearer_auth():
+    """Known providers (CAPI, GitHub Models) should have bearer_auth=True."""
+    from seclab_taskflow_agent.capi import get_provider
 
+    provider = get_provider("https://api.githubcopilot.com")
+    assert provider.bearer_auth is True
 
-def test_resolve_token_fallback_to_ai_api_token(monkeypatch):
-    monkeypatch.setenv("AI_API_TOKEN", "fallback_token")
-    monkeypatch.delenv("MISSING_VAR", raising=False)
-    assert _resolve_token("MISSING_VAR") == "fallback_token"
+    provider = get_provider("https://models.github.ai/inference")
+    assert provider.bearer_auth is True
 
 
-def test_resolve_token_raises_when_missing(monkeypatch):
-    monkeypatch.delenv("AI_API_TOKEN", raising=False)
-    monkeypatch.delenv("MISSING_VAR", raising=False)
-    with pytest.raises(BackendBadRequestError, match="no API token"):
-        _resolve_token("MISSING_VAR")
+def test_unknown_endpoint_uses_native_auth():
+    """Unknown endpoints should default to native SDK auth (bearer_auth=False)."""
+    from seclab_taskflow_agent.capi import get_provider
+
+    provider = get_provider("https://api.anthropic.com")
+    assert provider.bearer_auth is False
+    assert provider.name == "custom"
+
+
+def test_awf_proxy_inherits_upstream_bearer_auth(monkeypatch):
+    """AWF proxy should inherit bearer_auth from the upstream provider."""
+    from seclab_taskflow_agent.capi import get_provider
+
+    monkeypatch.setenv("AWF_COPILOT_PROXY", "api.githubcopilot.com")
+    provider = get_provider("http://localhost:8080")
+    assert provider.bearer_auth is True
+    assert provider.base_url == "http://localhost:8080/"
 
 
 # -- reasoning validation --

From ed19781ebb87463301c6ce3ded8b823a9e71caf6 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 11:12:42 -0400
Subject: [PATCH 14/23] style: use ternary for token resolution (ruff SIM108)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 315f6979..0174fa1d 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -105,10 +105,7 @@ async def build(
         from ...capi import get_AI_endpoint, get_AI_token, get_provider
 
         # Resolve token: per-model env var override, then standard token chain
-        if spec.token_env:
-            token = os.getenv(spec.token_env, "")
-        else:
-            token = ""
+        token = os.getenv(spec.token_env, "") if spec.token_env else ""
         if not token:
             token = get_AI_token()
 

From ffb017ee53001e30fdebb734bf71db2cb8754e2d Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 12:06:43 -0400
Subject: [PATCH 15/23] refactor: move unfiltered MCP tool listing into
 MCPNamespaceWrap

The anthropic backend was reaching into openai-agents private attrs
(`_obj`, `.session`) to bypass tool_filter at tool-enumeration time.
This required duplicating the namespace-prefix logic that already lives
on MCPNamespaceWrap and risked double-prefixing on the fallback path.

Move the 'list tools without invoking the agent-side tool_filter' logic
into MCPNamespaceWrap.list_tools_unfiltered(), where the wrapper already
owns its namespace and session reference. The anthropic backend becomes
a one-liner; double-prefix risk is eliminated; openai-agents internal
access is centralized in one place (mcp_utils.py).

Also bump default_model in the provider registry from gpt-4.1 to gpt-5.5
(Copilot and OpenAI direct), openai/gpt-4.1 to openai/gpt-5.5 (GitHub
Models). Only affects callers who do not specify a model -- the audit
pipeline always specifies models via model_config, so this is purely a
fallback for community users.

Tests added: tests/test_mcp_utils.py (6 tests covering prefix correctness,
no-double-prefix, tool attribute preservation, missing-session error,
caller-state isolation, regression of existing list_tools()).
Tests updated: test_capi_extended.py (default_model assertions).
274 tests pass, ruff clean. Local audit on anticomputer/vulnerable-test-app
produced 4 vulnerabilities (verifying MCP tools enumerated + called
correctly).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/capi.py             |   8 +-
 src/seclab_taskflow_agent/mcp_utils.py        |  27 ++++
 .../sdk/anthropic_sdk/backend.py              |  29 +---
 tests/test_capi_extended.py                   |   4 +-
 tests/test_mcp_utils.py                       | 130 ++++++++++++++++++
 5 files changed, 168 insertions(+), 30 deletions(-)
 create mode 100644 tests/test_mcp_utils.py

diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py
index c897f4ed..5065ebcc 100644
--- a/src/seclab_taskflow_agent/capi.py
+++ b/src/seclab_taskflow_agent/capi.py
@@ -48,7 +48,7 @@ class APIProvider:
     name: str
     base_url: str
     models_catalog: str = "/models"
-    default_model: str = "gpt-4.1"
+    default_model: str = "gpt-5.5"
     extra_headers: Mapping[str, str] = field(default_factory=dict)
     bearer_auth: bool = True  # Use Authorization: Bearer (not x-api-key)
 
@@ -124,20 +124,20 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool:
     "api.githubcopilot.com": _CopilotProvider(
         name="copilot",
         base_url="https://api.githubcopilot.com",
-        default_model="gpt-4.1",
+        default_model="gpt-5.5",
         extra_headers={"Copilot-Integration-Id": COPILOT_INTEGRATION_ID},
     ),
     "models.github.ai": _GitHubModelsProvider(
         name="github-models",
         base_url="https://models.github.ai/inference",
         models_catalog="/catalog/models",
-        default_model="openai/gpt-4.1",
+        default_model="openai/gpt-5.5",
     ),
     "api.openai.com": _OpenAIProvider(
         name="openai",
         base_url="https://api.openai.com/v1",
         models_catalog="/v1/models",
-        default_model="gpt-4.1",
+        default_model="gpt-5.5",
     ),
 }
 
diff --git a/src/seclab_taskflow_agent/mcp_utils.py b/src/seclab_taskflow_agent/mcp_utils.py
index 92968986..228f64ed 100644
--- a/src/seclab_taskflow_agent/mcp_utils.py
+++ b/src/seclab_taskflow_agent/mcp_utils.py
@@ -97,6 +97,33 @@ async def list_tools(self, *args: Any, **kwargs: Any) -> list[Any]:
             namespaced_tools.append(tool_copy)
         return namespaced_tools
 
+    async def list_tools_unfiltered(self) -> list[Any]:
+        """List tools directly from the MCP session, namespace-prefixed.
+
+        Bypasses any tool_filter configured on the wrapped openai-agents
+        server (which would require ``run_context`` and ``agent`` arguments
+        that aren't available when listing tools outside the openai-agents
+        run loop -- e.g. when handing tools to a different SDK at build
+        time).
+
+        Raises ``RuntimeError`` if the underlying server has no active
+        MCP session yet (caller should ensure the server is connected
+        before calling this).
+        """
+        session = getattr(self._obj, "session", None)
+        if session is None:
+            raise RuntimeError(
+                f"MCPNamespaceWrap({self._obj!r}): underlying server has no "
+                "active MCP session; cannot list tools unfiltered"
+            )
+        result = await session.list_tools()
+        namespaced_tools: list[Any] = []
+        for tool in result.tools:
+            tool_copy = tool.copy() if hasattr(tool, "copy") else tool
+            tool_copy.name = f"{self.namespace}{tool.name}"
+            namespaced_tools.append(tool_copy)
+        return namespaced_tools
+
     def confirm_tool(self, tool_name: str, args: list[Any]) -> bool:
         """Interactively prompt the user for tool-call confirmation.
 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 0174fa1d..996eea90 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -127,9 +127,10 @@ async def build(
         )
 
         # Collect tools from MCP servers and apply blocked_tools filter.
-        # We filter tools ourselves rather than relying on the openai-agents
-        # SDK's tool_filter, which requires run_context/agent args that
-        # aren't available outside the openai-agents run loop.
+        # We get raw tool lists via list_tools_unfiltered() rather than
+        # list_tools(), which would require run_context/agent args to
+        # invoke the openai-agents tool_filter -- args we don't have
+        # outside the openai-agents run loop.
         all_tools: list[dict[str, Any]] = []
         mcp_server_map: dict[str, Any] = {}
         blocked = set(spec.blocked_tools or [])
@@ -139,27 +140,7 @@ async def build(
             if native_server is None:
                 continue
             try:
-                # Access the underlying MCP session to get the raw tool
-                # list, bypassing the openai-agents tool_filter that
-                # requires run_context/agent we don't have.
-                raw_server = getattr(native_server, "_obj", native_server)
-                session = getattr(raw_server, "session", None)
-                if session is not None:
-                    result = await session.list_tools()
-                    raw_tools = result.tools
-                else:
-                    raw_tools = await native_server.list_tools()
-
-                # Apply namespace prefix (NamespacedMCPServer convention)
-                ns = getattr(native_server, "namespace", "")
-                mcp_tools = []
-                for tool in raw_tools:
-                    if hasattr(tool, "copy"):
-                        tool = tool.copy()
-                    if ns:
-                        tool.name = f"{ns}{tool.name}"
-                    mcp_tools.append(tool)
-
+                mcp_tools = await native_server.list_tools_unfiltered()
                 for tool in mcp_tools:
                     if tool.name not in blocked:
                         mcp_server_map[tool.name] = native_server
diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py
index e3a1188b..1202df05 100644
--- a/tests/test_capi_extended.py
+++ b/tests/test_capi_extended.py
@@ -111,7 +111,7 @@ def test_github_models_provider(self):
         p = get_provider("https://models.github.ai/inference")
         assert p.name == "github-models"
         assert p.models_catalog == "/catalog/models"
-        assert p.default_model == "openai/gpt-4.1"
+        assert p.default_model == "openai/gpt-5.5"
 
     def test_openai_provider(self):
         p = get_provider("https://api.openai.com/v1")
@@ -129,7 +129,7 @@ def test_awf_proxy_bare_hostname(self, monkeypatch):
         p = get_provider("http://172.30.0.30:10002")
         assert p.name == "copilot"
         assert p.base_url == "http://172.30.0.30:10002/"
-        assert p.default_model == "gpt-4.1"
+        assert p.default_model == "gpt-5.5"
         assert "Copilot-Integration-Id" in p.extra_headers
 
     def test_awf_proxy_full_url(self, monkeypatch):
diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py
new file mode 100644
index 00000000..8cdb1abf
--- /dev/null
+++ b/tests/test_mcp_utils.py
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: GitHub, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Tests for MCPNamespaceWrap."""
+
+from __future__ import annotations
+
+import asyncio
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name
+
+
+class _FakeTool:
+    """Tool with a copy() method (mimics mcp.types.Tool)."""
+
+    def __init__(self, name: str, description: str = "", input_schema: dict | None = None):
+        self.name = name
+        self.description = description
+        self.inputSchema = input_schema or {}
+
+    def copy(self) -> "_FakeTool":
+        return _FakeTool(self.name, self.description, dict(self.inputSchema))
+
+
+def _make_wrapper(server_name: str, session=None) -> MCPNamespaceWrap:
+    """Construct an MCPNamespaceWrap around a mock underlying server."""
+    obj = MagicMock()
+    obj.name = server_name
+    obj.session = session
+    return MCPNamespaceWrap(confirms=[], obj=obj)
+
+
+# -- list_tools_unfiltered() --
+
+
+def test_list_tools_unfiltered_prefixes_names_from_session():
+    """Tools from session.list_tools() should be namespace-prefixed."""
+    tools = [_FakeTool("read_file", "Read a file"), _FakeTool("write_file", "Write a file")]
+    session = MagicMock()
+    session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=tools))
+    wrapper = _make_wrapper("RepoContext", session=session)
+
+    result = asyncio.run(wrapper.list_tools_unfiltered())
+
+    ns = compress_name("RepoContext")
+    assert len(result) == 2
+    assert result[0].name == f"{ns}read_file"
+    assert result[1].name == f"{ns}write_file"
+
+
+def test_list_tools_unfiltered_no_double_prefix_when_called_twice():
+    """Calling list_tools_unfiltered twice should not double-prefix names."""
+    session = MagicMock()
+    # Fresh tools each call (mimics MCP session returning fresh objects)
+    session.list_tools = AsyncMock(
+        side_effect=lambda: SimpleNamespace(tools=[_FakeTool("get_repo")])
+    )
+    wrapper = _make_wrapper("RepoContext", session=session)
+
+    async def _run():
+        a = await wrapper.list_tools_unfiltered()
+        b = await wrapper.list_tools_unfiltered()
+        return a, b
+
+    result1, result2 = asyncio.run(_run())
+
+    ns = compress_name("RepoContext")
+    assert result1[0].name == f"{ns}get_repo"
+    assert result2[0].name == f"{ns}get_repo"
+    # Crucially, the second result is NOT double-prefixed
+    assert not result2[0].name.startswith(f"{ns}{ns}")
+
+
+def test_list_tools_unfiltered_preserves_tool_attributes():
+    """The copy of each tool should preserve description and input schema."""
+    schema = {"type": "object", "properties": {"path": {"type": "string"}}}
+    tools = [_FakeTool("read_file", "Read a file", schema)]
+    session = MagicMock()
+    session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=tools))
+    wrapper = _make_wrapper("RepoContext", session=session)
+
+    result = asyncio.run(wrapper.list_tools_unfiltered())
+
+    assert result[0].description == "Read a file"
+    assert result[0].inputSchema == schema
+
+
+def test_list_tools_unfiltered_raises_when_session_missing():
+    """Should raise RuntimeError if the underlying server has no session yet."""
+    wrapper = _make_wrapper("RepoContext", session=None)
+
+    with pytest.raises(RuntimeError, match="no.*active MCP session"):
+        asyncio.run(wrapper.list_tools_unfiltered())
+
+
+def test_list_tools_unfiltered_does_not_share_state_with_caller():
+    """Mutating returned tool names must not affect the underlying tools."""
+    original = _FakeTool("read_file")
+    session = MagicMock()
+    session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[original]))
+    wrapper = _make_wrapper("Repo", session=session)
+
+    result = asyncio.run(wrapper.list_tools_unfiltered())
+    result[0].name = "MUTATED"
+
+    # Original tool should still have its name (copy() worked)
+    assert original.name == "read_file"
+
+
+# -- list_tools() (regression) --
+
+
+def test_list_tools_existing_behaviour_unchanged():
+    """Existing list_tools() should still forward args and prefix names."""
+    tools = [_FakeTool("read_file")]
+    obj = MagicMock()
+    obj.name = "RepoContext"
+    obj.list_tools = AsyncMock(return_value=tools)
+    obj.session = MagicMock()
+    wrapper = MCPNamespaceWrap(confirms=[], obj=obj)
+
+    result = asyncio.run(wrapper.list_tools(run_context="ctx", agent="agent"))
+
+    obj.list_tools.assert_awaited_once_with(run_context="ctx", agent="agent")
+    ns = compress_name("RepoContext")
+    assert result[0].name == f"{ns}read_file"

From 00978f237087e49561617f80e441825cdb3c8b60 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 12:09:44 -0400
Subject: [PATCH 16/23] style: fix hatch fmt lint errors in test_mcp_utils

- Remove quotes from _FakeTool return type (UP037)
- Use raw string for regex pattern in pytest.raises match (RUF043)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_mcp_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py
index 8cdb1abf..a1eaae33 100644
--- a/tests/test_mcp_utils.py
+++ b/tests/test_mcp_utils.py
@@ -22,7 +22,7 @@ def __init__(self, name: str, description: str = "", input_schema: dict | None =
         self.description = description
         self.inputSchema = input_schema or {}
 
-    def copy(self) -> "_FakeTool":
+    def copy(self) -> _FakeTool:
         return _FakeTool(self.name, self.description, dict(self.inputSchema))
 
 
@@ -93,7 +93,7 @@ def test_list_tools_unfiltered_raises_when_session_missing():
     """Should raise RuntimeError if the underlying server has no session yet."""
     wrapper = _make_wrapper("RepoContext", session=None)
 
-    with pytest.raises(RuntimeError, match="no.*active MCP session"):
+    with pytest.raises(RuntimeError, match=r"no.*active MCP session"):
         asyncio.run(wrapper.list_tools_unfiltered())
 
 

From b1b139b7d0e1b85c2a0374a5e519d19b0906d43b Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 12:43:33 -0400
Subject: [PATCH 17/23] fix(capi): add gpt-5 to OpenAI _CHAT_PREFIXES allowlist

The default_model for the OpenAI direct provider was bumped to gpt-5.5
in the MCPNamespaceWrap.list_tools_unfiltered() refactor (the commit
that moved unfiltered MCP tool listing into MCPNamespaceWrap), but
_OpenAIProvider.check_tool_calls()'s prefix allowlist still only matched
gpt-3.5/gpt-4/o-series. This meant supports_tool_calls('gpt-5.5', ...)
returned False, so list_tool_call_models() would omit the default model
from the catalog output -- a contradiction with the model being the
configured default.

Add 'gpt-5' to the prefix tuple and a regression test covering gpt-5,
gpt-5.5, gpt-5.5-mini, and a hypothetical gpt-5.6.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/capi.py |  2 +-
 tests/test_capi_extended.py       | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py
index 5065ebcc..edd9a4d3 100644
--- a/src/seclab_taskflow_agent/capi.py
+++ b/src/seclab_taskflow_agent/capi.py
@@ -111,7 +111,7 @@ class _OpenAIProvider(APIProvider):
     we maintain a prefix allowlist of known chat-completion model families.
     """
 
-    _CHAT_PREFIXES = ("gpt-3.5", "gpt-4", "o1", "o3", "o4", "chatgpt-")
+    _CHAT_PREFIXES = ("gpt-3.5", "gpt-4", "gpt-5", "o1", "o3", "o4", "chatgpt-")
 
     def check_tool_calls(self, _model: str, model_info: dict) -> bool:
         model_id = model_info.get("id", "").lower()
diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py
index 1202df05..70407c28 100644
--- a/tests/test_capi_extended.py
+++ b/tests/test_capi_extended.py
@@ -80,6 +80,17 @@ def test_openai_endpoint_o_series(self, monkeypatch):
             models = {mid: {"id": mid}}
             assert supports_tool_calls(mid, models) is True
 
+    def test_openai_endpoint_gpt5_series(self, monkeypatch):
+        """OpenAI endpoint returns True for gpt-5 family (regression: the
+        default_model was bumped to gpt-5.5 but _CHAT_PREFIXES needed
+        updating to include 'gpt-5')."""
+        monkeypatch.setenv("AI_API_ENDPOINT", "https://api.openai.com/v1")
+        for mid in ("gpt-5", "gpt-5.5", "gpt-5.5-mini", "gpt-5.6"):
+            models = {mid: {"id": mid}}
+            assert supports_tool_calls(mid, models) is True, (
+                f"{mid} should be recognized as a tool-call-capable chat model"
+            )
+
     def test_openai_endpoint_non_chat_model(self, monkeypatch):
         """OpenAI endpoint returns False for embeddings/audio/image models."""
         monkeypatch.setenv("AI_API_ENDPOINT", "https://api.openai.com/v1")

From 4eea12785201dc1fa09e6a217fee2b4675413f6e Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 13:21:25 -0400
Subject: [PATCH 18/23] doc + refactor: address remaining PR review threads

README.md: Add the per-task backend override as the highest-precedence
selection level. Tasks can put 'backend:' in their own model_settings
block to override the model-level value, per _resolve_task_model().

doc/GRAMMAR.md: Tighten the 'passed through to the selected SDK backend'
claim. openai_agents accepts the standard OpenAI parameter set,
anthropic_sdk forwards a curated subset (temperature, top_p, reasoning,
max_tokens, stream_thinking), and copilot_sdk consumes only its own
exposed keys (e.g. reasoning_effort) and silently ignores the rest.
Avoid misleading users about arbitrary key forwarding.

mcp_utils.py: Make list_tools_unfiltered idempotent on the prefix.
Strip an existing namespace prefix before re-applying so the method
is safe to call repeatedly even if the underlying session somehow
returns a cached/reused tool object whose name was previously
namespaced. Uses str.removeprefix() (no-op when prefix is absent).

Regression test added covering the previously-prefixed-input path.
276 tests pass, hatch fmt clean.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md                              | 17 ++++++++++-------
 doc/GRAMMAR.md                         |  9 +++++++--
 src/seclab_taskflow_agent/mcp_utils.py | 10 +++++++++-
 tests/test_mcp_utils.py                | 17 +++++++++++++++++
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 21f39d15..7842f5bf 100644
--- a/README.md
+++ b/README.md
@@ -103,13 +103,16 @@ The runner can drive three SDKs behind a common interface:
   Designed for use with CAPI's Anthropic endpoint; auth uses
   `Authorization: Bearer` (not `x-api-key`).
 
-Selection precedence:
-
-1. Per-model `backend:` in `model_settings` (allows mixed backends in a
-   single taskflow).
-2. `backend:` field in the model config document (global default).
-3. `SECLAB_TASKFLOW_BACKEND` environment variable.
-4. `openai_agents`.
+Selection precedence (highest to lowest):
+
+1. Per-task `backend:` in the task's own `model_settings` block (overrides
+   the model-level value for that one task; see `_resolve_task_model()`).
+2. Per-model `backend:` in the model config's `model_settings` (allows
+   mixed backends in a single taskflow).
+3. `backend:` field at the top level of the model config document
+   (global default).
+4. `SECLAB_TASKFLOW_BACKEND` environment variable.
+5. `openai_agents`.
 
 ```yaml
 seclab-taskflow-agent:
diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md
index 928bf8b1..efe0c41b 100644
--- a/doc/GRAMMAR.md
+++ b/doc/GRAMMAR.md
@@ -550,5 +550,10 @@ passed to the underlying model provider:
 | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var |
 | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` |
 
-All other keys (e.g. `temperature`, `top_p`, `reasoning`) are passed through as
-model parameters to the selected SDK backend.
+All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to
+the selected SDK backend. Which keys are actually honored depends on the
+backend: `openai_agents` accepts the standard OpenAI parameter set;
+`anthropic_sdk` forwards a curated subset (currently `temperature`,
+`top_p`, `reasoning`, `max_tokens`, `stream_thinking`); `copilot_sdk`
+consumes only the keys its SDK exposes (e.g. `reasoning_effort`) and
+silently ignores the rest. Consult the backend-specific docs if in doubt.
diff --git a/src/seclab_taskflow_agent/mcp_utils.py b/src/seclab_taskflow_agent/mcp_utils.py
index 228f64ed..36d1df7e 100644
--- a/src/seclab_taskflow_agent/mcp_utils.py
+++ b/src/seclab_taskflow_agent/mcp_utils.py
@@ -106,6 +106,12 @@ async def list_tools_unfiltered(self) -> list[Any]:
         run loop -- e.g. when handing tools to a different SDK at build
         time).
 
+        Prefixing is idempotent: if a tool's name already starts with this
+        wrapper's namespace (e.g. because the underlying session returned a
+        previously-namespaced object), the existing prefix is stripped
+        before re-applying so calling this method multiple times never
+        yields ``<ns><ns>name``.
+
         Raises ``RuntimeError`` if the underlying server has no active
         MCP session yet (caller should ensure the server is connected
         before calling this).
@@ -120,7 +126,9 @@ async def list_tools_unfiltered(self) -> list[Any]:
         namespaced_tools: list[Any] = []
         for tool in result.tools:
             tool_copy = tool.copy() if hasattr(tool, "copy") else tool
-            tool_copy.name = f"{self.namespace}{tool.name}"
+            # Idempotent: strip existing prefix before re-applying
+            base_name = tool_copy.name.removeprefix(self.namespace)
+            tool_copy.name = f"{self.namespace}{base_name}"
             namespaced_tools.append(tool_copy)
         return namespaced_tools
 
diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py
index a1eaae33..1480b98e 100644
--- a/tests/test_mcp_utils.py
+++ b/tests/test_mcp_utils.py
@@ -111,6 +111,23 @@ def test_list_tools_unfiltered_does_not_share_state_with_caller():
     assert original.name == "read_file"
 
 
+def test_list_tools_unfiltered_idempotent_on_prefixed_input():
+    """If the session returns a tool whose name is already namespace-prefixed
+    (e.g. because of a cached/reused tool object), the prefix must NOT be
+    applied a second time. Required for safe repeated/reentrant calls."""
+    ns = compress_name("RepoContext")
+    pre_prefixed = _FakeTool(f"{ns}read_file", "Read a file")
+    session = MagicMock()
+    session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[pre_prefixed]))
+    wrapper = _make_wrapper("RepoContext", session=session)
+
+    result = asyncio.run(wrapper.list_tools_unfiltered())
+
+    # Result must have exactly one prefix, not two
+    assert result[0].name == f"{ns}read_file"
+    assert not result[0].name.startswith(f"{ns}{ns}")
+
+
 # -- list_tools() (regression) --
 
 

From c97ab6a1a98442a160bf0250d9520239fe16e966 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Fri, 12 Jun 2026 17:01:14 -0400
Subject: [PATCH 19/23] feat(anthropic_sdk): default-on automatic prompt
 caching

Adds 'cache_control: {type: ephemeral}' to messages.stream() calls. The
API auto-places a cache breakpoint at the longest cacheable prefix
(tools + system + accumulated messages) and moves it forward on each
turn -- multi-turn agent loops get cache reads on every turn after the
first.

Default-on because all current Claude models support cache_control and
CAPI accepts it (validated end-to-end against claude-mythos-5 via CAPI
on 2026-06-12). Callers pointed at proxies that strip / reject
cache_control can opt out with 'prompt_caching: false' in model_settings.

A string value (e.g. 'prompt_caching: 1h') sets a custom TTL. The default
5m TTL is sent as a bare {type: ephemeral} block with no ttl field.

Local validation against anticomputer/vulnerable-test-app on the same
audit pipeline, same model config, only changing prompt_caching:

  metric              | off       | on        | delta
  --------------------+-----------+-----------+------------
  requests            | 60        | 62        | +2 (noise)
  input tokens fresh  | 909,806   | 124       | -99.99%
  cache read tokens   | 0         | 728,079   | new
  cache write tokens  | 0         | 210,261   | new
  output tokens       | 42,300    | 44,933    | similar
  vulnerabilities     | 4         | 5         | +1
  est. mythos cost    | $11.21    | $5.60     | -50%

Same or better audit quality, half the token cost. Real audits with
larger system prompts + more tool definitions amortize the cache writes
over more reads, so production savings are typically larger than 50%.

Tests added:
- prompt_caching default-on emits cache_control
- prompt_caching=False suppresses cache_control (opt-out)
- prompt_caching='1h' includes the ttl field

23 tests pass total, hatch fmt clean.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../sdk/anthropic_sdk/backend.py              |  16 ++
 tests/test_sdk_anthropic_adapter.py           | 154 ++++++++++++++++++
 2 files changed, 170 insertions(+)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 996eea90..d3b493a0 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -202,6 +202,22 @@ async def run_streamed(
                 create_kwargs["thinking"] = {"type": "adaptive"}
                 create_kwargs["output_config"] = {"effort": effort}
 
+        # Automatic prompt caching: place an ephemeral cache breakpoint at
+        # the longest cacheable prefix (tools + system + accumulated
+        # messages). The breakpoint moves forward on each turn, so
+        # multi-turn agent loops get cache reads on every turn after the
+        # first -- typically 50%+ cost reduction on token-heavy audits.
+        # All current Claude models (and the Anthropic-compatible CAPI
+        # proxy) support cache_control. Default on; explicit opt-out for
+        # callers pointed at proxies that don't support it.
+        prompt_caching = handle.model_settings.get("prompt_caching", True)
+        if prompt_caching:
+            ttl = prompt_caching if isinstance(prompt_caching, str) else "5m"
+            cache_block: dict[str, Any] = {"type": "ephemeral"}
+            if ttl != "5m":
+                cache_block["ttl"] = ttl
+            create_kwargs["cache_control"] = cache_block
+
         import anthropic
 
         for turn in range(max_turns):
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index 496869be..975e8094 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -227,3 +227,157 @@ async def _run():
 
     with pytest.raises(BackendBadRequestError, match="invalid reasoning effort"):
         asyncio.run(_run())
+
+
+# -- prompt caching --
+
+
+def test_prompt_caching_enabled_by_default():
+    """All Claude models support cache_control; default to on so callers
+    get the cost savings without explicit opt-in. Explicit opt-out via
+    prompt_caching=False remains available for proxies that don't support
+    cache_control."""
+    import asyncio
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+
+    captured = {}
+
+    class _FakeStreamCtx:
+        async def __aenter__(self): return self
+        async def __aexit__(self, *exc): return False
+        def __aiter__(self):
+            async def _gen():
+                return
+                yield
+            return _gen()
+        async def get_final_message(self):
+            return type("M", (), {"stop_reason": "end_turn", "content": []})()
+
+    class _FakeMessages:
+        def stream(self, **kwargs):
+            captured.update(kwargs)
+            return _FakeStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    handle = _AnthropicHandle(
+        client=_FakeClient(),
+        system_prompt="",
+        model="claude-mythos-5",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    asyncio.run(_run())
+    assert captured.get("cache_control") == {"type": "ephemeral"}, (
+        f"expected default cache_control={{type: ephemeral}}, got {captured.get('cache_control')!r}"
+    )
+
+
+def test_prompt_caching_explicit_opt_out():
+    """prompt_caching=False must suppress cache_control entirely (for
+    callers pointed at proxies that don't support it)."""
+    import asyncio
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+
+    captured = {}
+
+    class _FakeStreamCtx:
+        async def __aenter__(self): return self
+        async def __aexit__(self, *exc): return False
+        def __aiter__(self):
+            async def _gen():
+                return
+                yield
+            return _gen()
+        async def get_final_message(self):
+            return type("M", (), {"stop_reason": "end_turn", "content": []})()
+
+    class _FakeMessages:
+        def stream(self, **kwargs):
+            captured.update(kwargs)
+            return _FakeStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    handle = _AnthropicHandle(
+        client=_FakeClient(),
+        system_prompt="",
+        model="claude-mythos-5",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={"prompt_caching": False},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    asyncio.run(_run())
+    assert "cache_control" not in captured, (
+        f"cache_control should be absent when explicitly opted out, got {captured}"
+    )
+
+
+def test_prompt_caching_1h_ttl_passes_ttl_field():
+    """When prompt_caching='1h', cache_control must include the 1h ttl."""
+    import asyncio
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+
+    captured = {}
+
+    class _FakeStreamCtx:
+        async def __aenter__(self): return self
+        async def __aexit__(self, *exc): return False
+        def __aiter__(self):
+            async def _gen():
+                return
+                yield
+            return _gen()
+        async def get_final_message(self):
+            return type("M", (), {"stop_reason": "end_turn", "content": []})()
+
+    class _FakeMessages:
+        def stream(self, **kwargs):
+            captured.update(kwargs)
+            return _FakeStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    handle = _AnthropicHandle(
+        client=_FakeClient(),
+        system_prompt="",
+        model="claude-mythos-5",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={"prompt_caching": "1h"},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    asyncio.run(_run())
+    assert captured.get("cache_control") == {"type": "ephemeral", "ttl": "1h"}, (
+        f"expected cache_control with 1h ttl, got {captured.get('cache_control')!r}"
+    )

From f42c06b8c0751bd2ac2baa3079aa2cb114d51e04 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Mon, 15 Jun 2026 10:52:42 -0400
Subject: [PATCH 20/23] fix(anthropic_sdk): match blocked_tools against raw +
 namespaced names

Reviewer was correct that blocked_tools was effectively a no-op in the
anthropic_sdk backend: taskflow YAML supplies raw tool names like
'read_file', but list_tools_unfiltered() returns namespace-prefixed
names like '{hash}read_file'. The old 'tool.name not in blocked' check
never matched, silently letting every blocked tool through. This is
the security bug the reviewer flagged on PR #265.

Fix: match the raw name against the un-prefixed portion of each tool's
namespaced name, in addition to the literal name. The mcp_server_map
keys stay namespaced because that's what Anthropic sends in tool_use.

Regression tests:
- raw 'read_file' filters out '{hash}read_file' (the bug case)
- already-namespaced names still match (backwards compat)

doc/GRAMMAR.md: also fix an inaccuracy the reviewer flagged in the
same review pass -- the docs claimed copilot_sdk 'silently ignores'
unsupported model_settings keys, but it actually raises
BackendCapabilityError on 'temperature' and 'parallel_tool_calls' at
validate() time. Updated wording to distinguish 'ignored' (anthropic_sdk)
from 'rejected' (copilot_sdk) so users aren't surprised by a hard fail
when they expected a silent drop.

281 tests pass, hatch fmt clean.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/GRAMMAR.md                                |   8 +-
 .../sdk/anthropic_sdk/backend.py              |  26 +++--
 tests/test_sdk_anthropic_adapter.py           | 109 ++++++++++++++++++
 3 files changed, 129 insertions(+), 14 deletions(-)

diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md
index efe0c41b..88b0bf9f 100644
--- a/doc/GRAMMAR.md
+++ b/doc/GRAMMAR.md
@@ -550,10 +550,4 @@ passed to the underlying model provider:
 | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var |
 | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` |
 
-All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to
-the selected SDK backend. Which keys are actually honored depends on the
-backend: `openai_agents` accepts the standard OpenAI parameter set;
-`anthropic_sdk` forwards a curated subset (currently `temperature`,
-`top_p`, `reasoning`, `max_tokens`, `stream_thinking`); `copilot_sdk`
-consumes only the keys its SDK exposes (e.g. `reasoning_effort`) and
-silently ignores the rest. Consult the backend-specific docs if in doubt.
+All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to the selected SDK backend. Each backend decides what to do with each key: `openai_agents` accepts the standard OpenAI parameter set; `anthropic_sdk` forwards a curated subset (currently `temperature`, `top_p`, `reasoning`, `max_tokens`, `stream_thinking`, `prompt_caching`) and silently ignores keys outside that set; `copilot_sdk` consumes the keys its SDK exposes (e.g. `reasoning_effort`) and **rejects** unsupported keys at validate time with `BackendCapabilityError` (currently `temperature` and `parallel_tool_calls`) rather than silently dropping them. Consult the backend-specific docs if in doubt.
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index d3b493a0..11149406 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -131,23 +131,35 @@ async def build(
         # list_tools(), which would require run_context/agent args to
         # invoke the openai-agents tool_filter -- args we don't have
         # outside the openai-agents run loop.
+        #
+        # blocked_tools in taskflow YAML are raw (un-namespaced) names,
+        # consistent with how openai_agents and copilot_sdk consume them.
+        # list_tools_unfiltered() returns namespace-prefixed names (the
+        # MCP server wrapper applies the prefix). Match against both
+        # forms so blocking works regardless of which name the taskflow
+        # author used; key mcp_server_map by the namespaced name because
+        # that's what Anthropic will send back in tool_use blocks.
         all_tools: list[dict[str, Any]] = []
         mcp_server_map: dict[str, Any] = {}
         blocked = set(spec.blocked_tools or [])
 
+        def _is_blocked(tool: Any, namespace: str) -> bool:
+            name = tool.name
+            if name in blocked:
+                return True
+            return name.startswith(namespace) and name[len(namespace):] in blocked
+
         for mcp_spec in spec.mcp_servers:
             native_server = mcp_spec.params.get("_native")
             if native_server is None:
                 continue
             try:
                 mcp_tools = await native_server.list_tools_unfiltered()
-                for tool in mcp_tools:
-                    if tool.name not in blocked:
-                        mcp_server_map[tool.name] = native_server
-                anthropic_tools = _mcp_tools_to_anthropic(
-                    [t for t in mcp_tools if t.name not in blocked]
-                )
-                all_tools.extend(anthropic_tools)
+                namespace = getattr(native_server, "namespace", "")
+                kept = [t for t in mcp_tools if not _is_blocked(t, namespace)]
+                for tool in kept:
+                    mcp_server_map[tool.name] = native_server
+                all_tools.extend(_mcp_tools_to_anthropic(kept))
             except Exception:
                 logger.exception("Failed to list tools from MCP server %s", mcp_spec.name)
 
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index 975e8094..d8e047c5 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -381,3 +381,112 @@ async def _run():
     assert captured.get("cache_control") == {"type": "ephemeral", "ttl": "1h"}, (
         f"expected cache_control with 1h ttl, got {captured.get('cache_control')!r}"
     )
+
+
+# -- blocked_tools filtering --
+
+
+def test_blocked_tools_matches_raw_name_against_namespaced_tool(monkeypatch):
+    """Regression: taskflow YAML blocked_tools uses raw (un-namespaced)
+    names like 'read_file', but list_tools_unfiltered() returns
+    namespace-prefixed names like '{hash}read_file'. The filter must
+    match the raw name against the un-prefixed portion of the
+    namespaced tool, otherwise blocking is silently bypassed.
+
+    See PR #265 review thread and openai_agents/copilot_sdk for
+    how blocked_tools are consumed elsewhere (both use raw names).
+    """
+    monkeypatch.setenv("AI_API_TOKEN", "test-token")
+    import asyncio
+    from unittest.mock import AsyncMock, MagicMock
+
+    from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name
+    from seclab_taskflow_agent.sdk.base import MCPServerSpec
+
+    class _FakeTool:
+        def __init__(self, name):
+            self.name = name
+            self.description = ""
+            self.inputSchema = {}
+
+        def copy(self):
+            t = _FakeTool(self.name)
+            return t
+
+    # Build a wrapper whose session.list_tools returns two raw tools.
+    # list_tools_unfiltered() will return them with namespace prefix.
+    obj = MagicMock()
+    obj.name = "RepoContext"
+    ns = compress_name("RepoContext")
+    obj.session = MagicMock()
+    obj.session.list_tools = AsyncMock(
+        return_value=type("R", (), {"tools": [_FakeTool("read_file"), _FakeTool("safe_helper")]})()
+    )
+    wrap = MCPNamespaceWrap(confirms=[], obj=obj)
+
+    spec = AgentSpec(
+        name="t",
+        instructions="",
+        model="claude-mythos-preview",
+        mcp_servers=[MCPServerSpec(name="rc", kind="stdio", params={"_native": wrap})],
+        blocked_tools=["read_file"],  # raw name from YAML
+    )
+    backend = AnthropicSDKBackend()
+    handle = asyncio.run(backend.build(spec))
+
+    # The blocked tool must be absent from both the tool list AND the
+    # server map keys (which use the namespaced form).
+    tool_names = [t["name"] for t in handle.tools]
+    assert f"{ns}read_file" not in tool_names, (
+        f"blocked raw name 'read_file' should have filtered out '{ns}read_file'; "
+        f"got tools: {tool_names}"
+    )
+    assert f"{ns}safe_helper" in tool_names, (
+        f"non-blocked tool 'safe_helper' should still be present; got: {tool_names}"
+    )
+    assert f"{ns}read_file" not in handle.mcp_server_map
+    assert f"{ns}safe_helper" in handle.mcp_server_map
+
+
+def test_blocked_tools_also_matches_already_namespaced_name(monkeypatch):
+    """Backwards-compat: if a caller already passes the namespaced name
+    in blocked_tools (e.g. they computed it externally), it should still
+    match. The filter checks both forms."""
+    monkeypatch.setenv("AI_API_TOKEN", "test-token")
+    import asyncio
+    from unittest.mock import AsyncMock, MagicMock
+
+    from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name
+    from seclab_taskflow_agent.sdk.base import MCPServerSpec
+
+    class _FakeTool:
+        def __init__(self, name):
+            self.name = name
+            self.description = ""
+            self.inputSchema = {}
+
+        def copy(self):
+            return _FakeTool(self.name)
+
+    obj = MagicMock()
+    obj.name = "RepoContext"
+    ns = compress_name("RepoContext")
+    obj.session = MagicMock()
+    obj.session.list_tools = AsyncMock(
+        return_value=type("R", (), {"tools": [_FakeTool("read_file")]})()
+    )
+    wrap = MCPNamespaceWrap(confirms=[], obj=obj)
+
+    spec = AgentSpec(
+        name="t",
+        instructions="",
+        model="claude-mythos-preview",
+        mcp_servers=[MCPServerSpec(name="rc", kind="stdio", params={"_native": wrap})],
+        blocked_tools=[f"{ns}read_file"],  # already namespaced
+    )
+    backend = AnthropicSDKBackend()
+    handle = asyncio.run(backend.build(spec))
+
+    assert handle.tools == [], (
+        f"blocked namespaced name should filter out the tool; got: {handle.tools}"
+    )

From c6ef3ae885989d223aac9dc5db51312881fb93e5 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Mon, 15 Jun 2026 10:56:49 -0400
Subject: [PATCH 21/23] Revert default_model bump back to gpt-4.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Coworker review flagged that gpt-5.5 is not a viable default:

- gpt-5 family models require the responses API, but APIProvider has no
  api_type field to signal that — callers using the default would silently
  hit the wrong endpoint shape
- GitHub Models never received gpt-5.5; gpt-4.1 is what's still supported
  there, so 'openai/gpt-5.5' would 404
- Most callers specify models explicitly via model_config anyway, so the
  default is only a fallback safety net — keep it on a model that exists
  on all three providers

Reverts the registry defaults and dataclass default; keeps the gpt-5
prefix in _OpenAIProvider._CHAT_PREFIXES (direct OpenAI API does serve
gpt-5 family, and the prefix check is independent of default selection).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/capi.py | 8 ++++----
 tests/test_capi_extended.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py
index edd9a4d3..3d635c0d 100644
--- a/src/seclab_taskflow_agent/capi.py
+++ b/src/seclab_taskflow_agent/capi.py
@@ -48,7 +48,7 @@ class APIProvider:
     name: str
     base_url: str
     models_catalog: str = "/models"
-    default_model: str = "gpt-5.5"
+    default_model: str = "gpt-4.1"
     extra_headers: Mapping[str, str] = field(default_factory=dict)
     bearer_auth: bool = True  # Use Authorization: Bearer (not x-api-key)
 
@@ -124,20 +124,20 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool:
     "api.githubcopilot.com": _CopilotProvider(
         name="copilot",
         base_url="https://api.githubcopilot.com",
-        default_model="gpt-5.5",
+        default_model="gpt-4.1",
         extra_headers={"Copilot-Integration-Id": COPILOT_INTEGRATION_ID},
     ),
     "models.github.ai": _GitHubModelsProvider(
         name="github-models",
         base_url="https://models.github.ai/inference",
         models_catalog="/catalog/models",
-        default_model="openai/gpt-5.5",
+        default_model="openai/gpt-4.1",
     ),
     "api.openai.com": _OpenAIProvider(
         name="openai",
         base_url="https://api.openai.com/v1",
         models_catalog="/v1/models",
-        default_model="gpt-5.5",
+        default_model="gpt-4.1",
     ),
 }
 
diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py
index 70407c28..36c97159 100644
--- a/tests/test_capi_extended.py
+++ b/tests/test_capi_extended.py
@@ -122,7 +122,7 @@ def test_github_models_provider(self):
         p = get_provider("https://models.github.ai/inference")
         assert p.name == "github-models"
         assert p.models_catalog == "/catalog/models"
-        assert p.default_model == "openai/gpt-5.5"
+        assert p.default_model == "openai/gpt-4.1"
 
     def test_openai_provider(self):
         p = get_provider("https://api.openai.com/v1")
@@ -140,7 +140,7 @@ def test_awf_proxy_bare_hostname(self, monkeypatch):
         p = get_provider("http://172.30.0.30:10002")
         assert p.name == "copilot"
         assert p.base_url == "http://172.30.0.30:10002/"
-        assert p.default_model == "gpt-5.5"
+        assert p.default_model == "gpt-4.1"
         assert "Copilot-Integration-Id" in p.extra_headers
 
     def test_awf_proxy_full_url(self, monkeypatch):

From 5e0a38cd827d7f2be4ec4d5126e80fe5997dfaca Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Mon, 15 Jun 2026 11:39:04 -0400
Subject: [PATCH 22/23] Address PR feedback + proactive cleanup pass

Three behavior fixes flagged by review or by re-reading the diff:

1. 4xx exception mapping (reviewer-flagged): previously only
   anthropic.BadRequestError (400) was mapped to BackendBadRequestError.
   Auth (401), permission (403), not-found (404), conflict (409),
   unprocessable (422) all fell through to BackendUnexpectedError and
   surfaced as 'Agent Exception' instead of a clean request error.
   Catch anthropic.APIStatusError and map any 4xx status to
   BackendBadRequestError; 5xx still falls through to
   BackendUnexpectedError (the request was well-formed).

2. Empty-token failure mode: build() now raises BackendBadRequestError
   with a clear message when no API token can be resolved, instead of
   either leaking RuntimeError from get_AI_token() or letting the
   Anthropic client be constructed with an empty 'Bearer ' header
   (which produces an opaque 401 mid-stream much later).

3. Stale module docstring in sdk/__init__.py: said 'Two backends are
   supported' and referenced the removed '[copilot]' optional-extra.
   Updated to reflect the current three-backend reality.

Test cleanup (reviewer-flagged):

- DRY'd 3x duplicate _FakeStreamCtx boilerplate in the prompt-caching
  tests into a single _make_fake_client() helper at the top of the
  file. The helper uses a proper empty async iterator class instead
  of the 'return; yield' empty-generator pattern the reviewer
  flagged as awkward.

Added regression coverage:

- test_build_raises_bad_request_when_no_token_available
- test_4xx_api_status_errors_map_to_bad_request (parameterized over
  400/401/403/404/409/422)
- test_5xx_api_status_errors_map_to_unexpected

281 -> 289 passing; lint clean (hatch fmt --linter --check).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/seclab_taskflow_agent/sdk/__init__.py     |  11 +-
 .../sdk/anthropic_sdk/backend.py              |  28 +-
 tests/test_sdk_anthropic_adapter.py           | 252 +++++++++++++-----
 3 files changed, 213 insertions(+), 78 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py
index 0a413505..5eefafb2 100644
--- a/src/seclab_taskflow_agent/sdk/__init__.py
+++ b/src/seclab_taskflow_agent/sdk/__init__.py
@@ -3,9 +3,9 @@
 
 """Backend factory for the agent runner.
 
-Two backends are supported: ``openai_agents`` (default) and
-``copilot_sdk`` (optional, requires ``pip install
-seclab-taskflow-agent[copilot]``).
+Three backends are supported: ``openai_agents`` (default), ``copilot_sdk``,
+and ``anthropic_sdk``.  All three are always available because per-task
+backend selection means any SDK may be needed at runtime.
 """
 
 from __future__ import annotations
@@ -70,8 +70,9 @@ def resolve_backend_name(
     ``SECLAB_TASKFLOW_BACKEND`` env var > ``openai_agents``.
 
     Backend selection is always deterministic — there is no auto-detection
-    based on endpoint URL.  Use ``backend: copilot_sdk`` in model config
-    or set ``SECLAB_TASKFLOW_BACKEND=copilot_sdk`` to opt in.
+    based on endpoint URL.  Use ``backend: copilot_sdk`` or ``backend:
+    anthropic_sdk`` in model config (or set
+    ``SECLAB_TASKFLOW_BACKEND=<name>``) to opt in.
 
     The *endpoint* parameter is accepted for forward compatibility but
     is not used for backend selection.
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index 11149406..f76ac70e 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -104,10 +104,22 @@ async def build(
 
         from ...capi import get_AI_endpoint, get_AI_token, get_provider
 
-        # Resolve token: per-model env var override, then standard token chain
+        # Resolve token: per-model env var override, then standard token chain.
+        # Wrap RuntimeError from get_AI_token (env var not set) so the runner
+        # surfaces it as a request error rather than an internal exception.
         token = os.getenv(spec.token_env, "") if spec.token_env else ""
         if not token:
-            token = get_AI_token()
+            try:
+                token = get_AI_token()
+            except RuntimeError as exc:
+                raise BackendBadRequestError(
+                    f"anthropic_sdk: no API token available ({exc})"
+                ) from exc
+        if not token:
+            raise BackendBadRequestError(
+                "anthropic_sdk: no API token available "
+                "(checked spec.token_env then standard token chain)"
+            )
 
         endpoint = spec.endpoint or get_AI_endpoint()
         provider = get_provider(endpoint)
@@ -257,8 +269,16 @@ async def run_streamed(
                 raise BackendRateLimitError(str(exc)) from exc
             except anthropic.APITimeoutError as exc:
                 raise BackendTimeoutError(str(exc)) from exc
-            except anthropic.BadRequestError as exc:
-                raise BackendBadRequestError(str(exc)) from exc
+            except anthropic.APIStatusError as exc:
+                # Map all 4xx (auth, permission, not_found, conflict,
+                # unprocessable, bad_request) to BackendBadRequestError so
+                # the runner surfaces them as request errors rather than
+                # internal exceptions. 5xx and unclassified errors fall
+                # through to BackendUnexpectedError.
+                status = getattr(exc, "status_code", None)
+                if isinstance(status, int) and 400 <= status < 500:
+                    raise BackendBadRequestError(str(exc)) from exc
+                raise BackendUnexpectedError(str(exc)) from exc
             except anthropic.APIError as exc:
                 raise BackendUnexpectedError(str(exc)) from exc
 
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index d8e047c5..e072fa7b 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -31,6 +31,48 @@ def _spec(**overrides) -> AgentSpec:
     return AgentSpec(**base)
 
 
+def _make_fake_client(captured: dict, *, stop_reason: str = "end_turn", content: list | None = None):
+    """Build a minimal fake Anthropic client that records messages.stream() kwargs.
+
+    The returned client exposes ``client.messages.stream(**kwargs)``; ``kwargs`` is
+    written into *captured* so tests can assert on what the backend would have sent
+    to the real SDK.  The stream yields nothing and ``get_final_message()`` returns
+    a stub with the requested ``stop_reason``/``content``.
+    """
+    final_content = content if content is not None else []
+
+    class _EmptyAsyncIter:
+        def __aiter__(self):
+            return self
+
+        async def __anext__(self):
+            raise StopAsyncIteration
+
+    class _FakeStreamCtx:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *exc):
+            return False
+
+        def __aiter__(self):
+            return _EmptyAsyncIter()
+
+        async def get_final_message(self):
+            return type("M", (), {"stop_reason": stop_reason, "content": final_content})()
+
+    class _FakeMessages:
+        def stream(self, **kwargs):
+            captured.update(kwargs)
+            return _FakeStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    return _FakeClient()
+
+
 # -- Backend registration --
 
 
@@ -241,30 +283,9 @@ def test_prompt_caching_enabled_by_default():
 
     from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
 
-    captured = {}
-
-    class _FakeStreamCtx:
-        async def __aenter__(self): return self
-        async def __aexit__(self, *exc): return False
-        def __aiter__(self):
-            async def _gen():
-                return
-                yield
-            return _gen()
-        async def get_final_message(self):
-            return type("M", (), {"stop_reason": "end_turn", "content": []})()
-
-    class _FakeMessages:
-        def stream(self, **kwargs):
-            captured.update(kwargs)
-            return _FakeStreamCtx()
-
-    class _FakeClient:
-        def __init__(self):
-            self.messages = _FakeMessages()
-
+    captured: dict = {}
     handle = _AnthropicHandle(
-        client=_FakeClient(),
+        client=_make_fake_client(captured),
         system_prompt="",
         model="claude-mythos-5",
         max_tokens=100,
@@ -291,30 +312,9 @@ def test_prompt_caching_explicit_opt_out():
 
     from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
 
-    captured = {}
-
-    class _FakeStreamCtx:
-        async def __aenter__(self): return self
-        async def __aexit__(self, *exc): return False
-        def __aiter__(self):
-            async def _gen():
-                return
-                yield
-            return _gen()
-        async def get_final_message(self):
-            return type("M", (), {"stop_reason": "end_turn", "content": []})()
-
-    class _FakeMessages:
-        def stream(self, **kwargs):
-            captured.update(kwargs)
-            return _FakeStreamCtx()
-
-    class _FakeClient:
-        def __init__(self):
-            self.messages = _FakeMessages()
-
+    captured: dict = {}
     handle = _AnthropicHandle(
-        client=_FakeClient(),
+        client=_make_fake_client(captured),
         system_prompt="",
         model="claude-mythos-5",
         max_tokens=100,
@@ -340,30 +340,9 @@ def test_prompt_caching_1h_ttl_passes_ttl_field():
 
     from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
 
-    captured = {}
-
-    class _FakeStreamCtx:
-        async def __aenter__(self): return self
-        async def __aexit__(self, *exc): return False
-        def __aiter__(self):
-            async def _gen():
-                return
-                yield
-            return _gen()
-        async def get_final_message(self):
-            return type("M", (), {"stop_reason": "end_turn", "content": []})()
-
-    class _FakeMessages:
-        def stream(self, **kwargs):
-            captured.update(kwargs)
-            return _FakeStreamCtx()
-
-    class _FakeClient:
-        def __init__(self):
-            self.messages = _FakeMessages()
-
+    captured: dict = {}
     handle = _AnthropicHandle(
-        client=_FakeClient(),
+        client=_make_fake_client(captured),
         system_prompt="",
         model="claude-mythos-5",
         max_tokens=100,
@@ -490,3 +469,138 @@ def copy(self):
     assert handle.tools == [], (
         f"blocked namespaced name should filter out the tool; got: {handle.tools}"
     )
+
+
+# -- token validation --
+
+
+def test_build_raises_bad_request_when_no_token_available(monkeypatch):
+    """build() must fail loudly when no API token can be resolved.
+
+    Otherwise the Anthropic client gets created with an empty 'Bearer '
+    header and the failure surfaces later as an opaque 401 mid-stream
+    instead of a clear BackendBadRequestError at build time.
+    """
+    import asyncio
+
+    # Clear every token-source env var the standard chain consults
+    for var in ("AI_API_TOKEN", "OPENAI_API_KEY", "AZURE_OPENAI_API_KEY",
+                "ANTHROPIC_API_KEY", "GITHUB_TOKEN", "GH_TOKEN"):
+        monkeypatch.delenv(var, raising=False)
+
+    spec = AgentSpec(
+        name="t",
+        instructions="",
+        model="claude-mythos-preview",
+        endpoint="https://api.githubcopilot.com",
+    )
+    backend = AnthropicSDKBackend()
+    with pytest.raises(BackendBadRequestError, match="no API token"):
+        asyncio.run(backend.build(spec))
+
+
+# -- exception mapping (4xx -> BackendBadRequestError) --
+
+
+@pytest.mark.parametrize("status_code", [400, 401, 403, 404, 409, 422])
+def test_4xx_api_status_errors_map_to_bad_request(monkeypatch, status_code):
+    """Any 4xx APIStatusError must surface as BackendBadRequestError so the
+    runner logs it as a request error rather than an internal exception.
+    Previously only BadRequestError (400) was mapped, leaving auth/permission/
+    not-found errors (401/403/404) to surface as BackendUnexpectedError."""
+    import asyncio
+    import anthropic
+    import httpx
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+
+    response = httpx.Response(
+        status_code=status_code,
+        request=httpx.Request("POST", "https://test.example/v1/messages"),
+    )
+
+    class _RaisingStreamCtx:
+        async def __aenter__(self):
+            raise anthropic.APIStatusError(
+                f"http {status_code}", response=response, body=None
+            )
+
+        async def __aexit__(self, *exc):
+            return False
+
+    class _FakeMessages:
+        def stream(self, **kwargs):  # noqa: ARG002
+            return _RaisingStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    handle = _AnthropicHandle(
+        client=_FakeClient(),
+        system_prompt="",
+        model="claude-mythos-5",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={"prompt_caching": False},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    with pytest.raises(BackendBadRequestError):
+        asyncio.run(_run())
+
+
+def test_5xx_api_status_errors_map_to_unexpected(monkeypatch):
+    """5xx APIStatusError must still surface as BackendUnexpectedError (not
+    BackendBadRequestError); the request itself was well-formed."""
+    import asyncio
+    import anthropic
+    import httpx
+
+    from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle
+    from seclab_taskflow_agent.sdk.errors import BackendUnexpectedError
+
+    response = httpx.Response(
+        status_code=503,
+        request=httpx.Request("POST", "https://test.example/v1/messages"),
+    )
+
+    class _RaisingStreamCtx:
+        async def __aenter__(self):
+            raise anthropic.InternalServerError(
+                "service unavailable", response=response, body=None
+            )
+
+        async def __aexit__(self, *exc):
+            return False
+
+    class _FakeMessages:
+        def stream(self, **kwargs):  # noqa: ARG002
+            return _RaisingStreamCtx()
+
+    class _FakeClient:
+        def __init__(self):
+            self.messages = _FakeMessages()
+
+    handle = _AnthropicHandle(
+        client=_FakeClient(),
+        system_prompt="",
+        model="claude-mythos-5",
+        max_tokens=100,
+        tools=[],
+        mcp_server_map={},
+        model_settings={"prompt_caching": False},
+    )
+    backend = AnthropicSDKBackend()
+
+    async def _run():
+        async for _ in backend.run_streamed(handle, "hi", max_turns=1):
+            pass
+
+    with pytest.raises(BackendUnexpectedError):
+        asyncio.run(_run())

From 4f1f4401ecfe3a8331aa0cf3c4dc0a8a6acef152 Mon Sep 17 00:00:00 2001
From: Bas Alberts <anticomputer@github.com>
Date: Mon, 15 Jun 2026 12:31:25 -0400
Subject: [PATCH 23/23] fix(anthropic_sdk): preserve empty tool output + harden
 token test

Two more reviewer-flagged issues:

1. _call_tool_result_to_text() dropped empty TextContent

   The truthy check 'if text:' treated TextContent(text='') the same as
   text=None and skipped it. When every block in the content list
   carried empty text, parts stayed [] and the helper fell through to
   the str(result) fallback (a noisy repr of the result object) instead
   of returning the empty result the tool actually reported. Fix: 'if
   text is not None:' preserves explicit empty strings; the str(result)
   fallback now only fires when there are no text-bearing blocks at all.

2. test_build_raises_bad_request_when_no_token_available was flaky

   The test cleared a long list of API key env vars, most of which
   capi.get_AI_token() never reads, but missed COPILOT_TOKEN, which is
   the second variable that function consults. On runners with
   COPILOT_TOKEN set (e.g. CI envs authed to Copilot), the test would
   unexpectedly find a token, the expected BackendBadRequestError would
   not be raised, and the test would fail. Simplified to clear only the
   two vars the chain actually consults: AI_API_TOKEN and COPILOT_TOKEN.

Adds 2 regression tests for empty-string preservation; 291 tests passing.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../sdk/anthropic_sdk/backend.py              | 10 ++++--
 tests/test_sdk_anthropic_adapter.py           | 32 ++++++++++++++++---
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
index f76ac70e..21dab7e0 100644
--- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
+++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py
@@ -54,12 +54,18 @@ def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]:
 
 
 def _call_tool_result_to_text(result: Any) -> str:
-    """Extract text from an MCP CallToolResult."""
+    """Extract text from an MCP CallToolResult.
+
+    Preserves empty strings: a tool that returns ``TextContent(text="")``
+    is returning an explicit empty result, not "no content".  Only fall
+    back to ``str(result)`` (a noisy repr) when there are genuinely no
+    text-bearing content blocks at all.
+    """
     content = getattr(result, "content", [])
     parts = []
     for c in content:
         text = getattr(c, "text", None)
-        if text:
+        if text is not None:
             parts.append(text)
     return "\n".join(parts) if parts else str(result)
 
diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py
index e072fa7b..bb8bf64d 100644
--- a/tests/test_sdk_anthropic_adapter.py
+++ b/tests/test_sdk_anthropic_adapter.py
@@ -193,6 +193,25 @@ def test_call_tool_result_to_text_empty():
     assert isinstance(text, str)
 
 
+def test_call_tool_result_to_text_preserves_empty_string():
+    """A tool returning TextContent(text='') is reporting an explicit
+    empty result. The helper must return '' verbatim, not fall back to
+    str(result) (which is a noisy repr of the result object).
+
+    Regression for the truthy-check bug: ``if text:`` was treating ''
+    the same as None and dropping it, causing the empty content list
+    branch to fire and emit ``str(result)`` to the model.
+    """
+    result = type("R", (), {"content": [_FakeContent("")]})()
+    assert _call_tool_result_to_text(result) == ""
+
+
+def test_call_tool_result_to_text_preserves_empty_among_nonempty():
+    """Empty TextContent should join with neighbors as ''."""
+    result = type("R", (), {"content": [_FakeContent("a"), _FakeContent(""), _FakeContent("b")]})()
+    assert _call_tool_result_to_text(result) == "a\n\nb"
+
+
 # -- bearer_auth via provider registry --
 
 
@@ -480,13 +499,18 @@ def test_build_raises_bad_request_when_no_token_available(monkeypatch):
     Otherwise the Anthropic client gets created with an empty 'Bearer '
     header and the failure surfaces later as an opaque 401 mid-stream
     instead of a clear BackendBadRequestError at build time.
+
+    Clears every variable consulted by ``capi.get_AI_token``
+    (``AI_API_TOKEN`` then ``COPILOT_TOKEN``) to keep the test
+    deterministic regardless of the runner's ambient environment.
     """
     import asyncio
 
-    # Clear every token-source env var the standard chain consults
-    for var in ("AI_API_TOKEN", "OPENAI_API_KEY", "AZURE_OPENAI_API_KEY",
-                "ANTHROPIC_API_KEY", "GITHUB_TOKEN", "GH_TOKEN"):
-        monkeypatch.delenv(var, raising=False)
+    # Must clear *every* env var the token chain consults; missing
+    # COPILOT_TOKEN here would make the test flaky on runners that
+    # happen to have it set (e.g. CI machines authed to copilot).
+    monkeypatch.delenv("AI_API_TOKEN", raising=False)
+    monkeypatch.delenv("COPILOT_TOKEN", raising=False)
 
     spec = AgentSpec(
         name="t",