From b544cf22f48b5ae3d6a609e1ab983e95ad78b8c3 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 11:30:10 -0400 Subject: [PATCH 01/23] feat: add Anthropic SDK backend + per-model backend selection Adds anthropic_sdk as a third backend adapter driving the native Anthropic Messages API (/v1/messages) via the official anthropic Python SDK. Supports streaming, MCP tool calling, and adaptive thinking with configurable reasoning effort. Key changes: - New backend: sdk/anthropic_sdk/backend.py implementing AgentBackend - Per-model backend selection via model_settings.backend (allows mixed backends in a single taskflow, e.g. Anthropic for code_analysis + OpenAI for general_tasks) - Both anthropic and github-copilot-sdk are now regular dependencies (not optional) since per-model backend config means any SDK could be needed at runtime - BackendSdk/ApiType Literals extended for anthropic_sdk/messages - _resolve_task_model() returns per-task backend override - stream_thinking model_settings option (opt-in, default off) - README and GRAMMAR.md updated with backend docs Auth: CAPI's /v1/messages expects Authorization: Bearer (not x-api-key); the adapter passes the bearer header via default_headers. Thinking: Uses adaptive thinking with output_config.effort. CAPI returns encrypted thinking signatures (content not readable); the stream_thinking flag is ready for when/if thinking content is exposed. Tested: basic messages, streaming, multi-turn tool calling via MCP, mixed-backend taskflows, all reasoning effort levels (low/medium/high/ max), error handling, openai_agents regression. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 44 ++- doc/GRAMMAR.md | 12 +- pyproject.toml | 11 +- src/seclab_taskflow_agent/models.py | 4 +- src/seclab_taskflow_agent/runner.py | 17 +- src/seclab_taskflow_agent/sdk/__init__.py | 8 +- .../sdk/anthropic_sdk/__init__.py | 4 + .../sdk/anthropic_sdk/backend.py | 284 ++++++++++++++++++ 8 files changed, 345 insertions(+), 39 deletions(-) create mode 100644 src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py create mode 100644 src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py diff --git a/README.md b/README.md index 3e44d86d..75c588a7 100644 --- a/README.md +++ b/README.md @@ -83,37 +83,49 @@ Per-model `model_settings` can include: ### Backends -The runner can drive two SDKs behind a common interface: +The runner can drive three SDKs behind a common interface: - **`openai_agents`** (default) — the OpenAI Agents Python SDK. Supports multi-personality handoffs, both `chat_completions` and `responses` `api_type`, `temperature`, `parallel_tool_calls`, `exclude_from_context`, and MCP over stdio, SSE, and streamable HTTP. -- **`copilot_sdk`** (optional, `pip install seclab-taskflow-agent[copilot]`) - — the GitHub Copilot Python SDK. Supports streaming, `reasoning_effort`, - MCP over stdio/SSE/HTTP, and per-tool permission gating. The SDK - selects its own wire protocol per model, so the YAML `api_type` field - is not honoured; multi-personality handoffs, `temperature`, and - `parallel_tool_calls` are likewise not available. Taskflows that use - unsupported fields fail at load time with a `BackendCapabilityError` - naming the offending field. +- **`copilot_sdk`** — the GitHub Copilot Python SDK. Supports streaming, + `reasoning_effort`, MCP over stdio/SSE/HTTP, and per-tool permission + gating. 
The SDK selects its own wire protocol per model, so the YAML + `api_type` field is not honoured; multi-personality handoffs, + `temperature`, and `parallel_tool_calls` are likewise not available. + Taskflows that use unsupported fields fail at load time with a + `BackendCapabilityError` naming the offending field. +- **`anthropic_sdk`** — the Anthropic Python SDK, driving the native + Messages API (`/v1/messages`). Supports streaming, tool calling via + MCP, and adaptive thinking with configurable `reasoning.effort` + (`low`, `medium`, `high`, `max`). Handoffs are not supported. + Designed for use with CAPI's Anthropic endpoint; auth uses + `Authorization: Bearer` (not `x-api-key`). Selection precedence: -1. `backend:` field in the model config document. -2. `SECLAB_TASKFLOW_BACKEND` environment variable. -3. Endpoint auto-default (`api.githubcopilot.com` prefers `copilot_sdk` - when the optional dependency is installed). +1. Per-model `backend:` in `model_settings` (allows mixed backends in a + single taskflow). +2. `backend:` field in the model config document (global default). +3. `SECLAB_TASKFLOW_BACKEND` environment variable. 4. `openai_agents`. 
```yaml seclab-taskflow-agent: version: "1.0" filetype: model_config -backend: copilot_sdk models: - fast: gpt-5-mini - slow: claude-opus-4.6 + code_analysis: claude-mythos-5 + general_tasks: gpt-5.4-mini +model_settings: + code_analysis: + backend: anthropic_sdk + reasoning: + effort: high + general_tasks: + api_type: responses + backend: openai_agents ``` ### Session Recovery diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md index b7e16ee9..67c57497 100644 --- a/doc/GRAMMAR.md +++ b/doc/GRAMMAR.md @@ -524,6 +524,7 @@ api_type: chat_completions # default for all models models: gpt_default: gpt-4.1 gpt_responses: gpt-5.1 + claude_native: claude-mythos-5 model_settings: gpt_default: temperature: 0.7 @@ -532,6 +533,10 @@ model_settings: endpoint: https://api.githubcopilot.com token: CAPI_TOKEN # env var name containing the API key temperature: 0.5 + claude_native: + backend: anthropic_sdk # use the Anthropic Messages API + reasoning: + effort: high ``` The following keys in `model_settings` are handled by the engine and are not @@ -539,9 +544,10 @@ passed to the underlying model provider: | Key | Description | Default | |-----|-------------|---------| -| `api_type` | `"chat_completions"` or `"responses"` | Inherited from top-level `api_type`, or `"chat_completions"` | +| `api_type` | `"chat_completions"`, `"responses"`, or `"messages"` | Inherited from top-level `api_type`, or `"chat_completions"` | +| `backend` | SDK adapter: `"openai_agents"`, `"copilot_sdk"`, or `"anthropic_sdk"` | Inherited from top-level `backend`, or `"openai_agents"` | | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var | | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` | -All other keys (e.g. `temperature`, `top_p`) are passed through as model -parameters to the OpenAI SDK. +All other keys (e.g. `temperature`, `top_p`, `reasoning`) are passed through as +model parameters to the selected SDK backend. 
diff --git a/pyproject.toml b/pyproject.toml index 6f805a7f..aa163f75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ classifiers = [ dependencies = [ "aiofiles==24.1.0", "annotated-types==0.7.0", + "anthropic>=0.50,<1", "anyio==4.9.0", "attrs==25.3.0", "Authlib==1.6.12", @@ -55,6 +56,7 @@ dependencies = [ "email-validator==2.3.0", "exceptiongroup==1.3.0", "fastmcp==3.2.0", + "github-copilot-sdk>=0.2.2,<0.3", "griffe==1.7.3", "h11==0.16.0", "httpcore==1.0.9", @@ -124,15 +126,6 @@ dependencies = [ [project.scripts] seclab-taskflow-agent = "seclab_taskflow_agent.cli:app" -[project.optional-dependencies] -# Pulls in the GitHub Copilot SDK (public preview) so the copilot_sdk -# backend can be selected. Requires Python >= 3.11. Pinned to the -# 0.2.x line because the SDK may ship breaking changes between minor -# versions while still in preview. -copilot = [ - "github-copilot-sdk>=0.2.2,<0.3", -] - [project.urls] Source = "https://github.com/GitHubSecurityLab/seclab-taskflow-agent" Issues = "https://github.com/GitHubSecurityLab/seclab-taskflow-agent/issues" diff --git a/src/seclab_taskflow_agent/models.py b/src/seclab_taskflow_agent/models.py index eff05ee6..837e4e2c 100644 --- a/src/seclab_taskflow_agent/models.py +++ b/src/seclab_taskflow_agent/models.py @@ -31,10 +31,10 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator # Valid API type values for model configuration. -ApiType = Literal["chat_completions", "responses"] +ApiType = Literal["chat_completions", "responses", "messages"] # Valid backend names. Must stay in sync with ``sdk._KNOWN``. 
-BackendSdk = Literal["openai_agents", "copilot_sdk"] +BackendSdk = Literal["openai_agents", "copilot_sdk", "anthropic_sdk"] # --------------------------------------------------------------------------- diff --git a/src/seclab_taskflow_agent/runner.py b/src/seclab_taskflow_agent/runner.py index 12d36bd8..b5ed43fd 100644 --- a/src/seclab_taskflow_agent/runner.py +++ b/src/seclab_taskflow_agent/runner.py @@ -126,12 +126,12 @@ def _resolve_task_model( model_dict: dict[str, str], models_params: dict[str, dict[str, Any]], default_api_type: str = "chat_completions", -) -> tuple[str, dict[str, Any], str, str | None, str | None]: +) -> tuple[str, dict[str, Any], str, str | None, str | None, str | None]: """Resolve the final model name, settings, and per-model overrides. Returns: - A tuple of ``(model_id, model_settings, api_type, endpoint, token)`` - where *endpoint* and *token* are ``None`` when not overridden. + A tuple of ``(model_id, model_settings, api_type, endpoint, token, backend)`` + where *endpoint*, *token*, and *backend* are ``None`` when not overridden. Raises: ValueError: If task-level model_settings is not a dictionary. 
@@ -141,6 +141,7 @@ def _resolve_task_model( api_type: str = default_api_type endpoint: str | None = None token: str | None = None + backend: str | None = None if logical_name in model_keys: if logical_name in models_params: @@ -151,6 +152,7 @@ def _resolve_task_model( api_type = model_settings.pop("api_type", api_type) endpoint = model_settings.pop("endpoint", None) token = model_settings.pop("token", None) + backend = model_settings.pop("backend", None) task_model_settings: dict[str, Any] | Any = task.model_settings or {} if not isinstance(task_model_settings, dict): @@ -161,9 +163,10 @@ def _resolve_task_model( api_type = task_settings.pop("api_type", api_type) endpoint = task_settings.pop("endpoint", endpoint) token = task_settings.pop("token", token) + backend = task_settings.pop("backend", backend) model_settings.update(task_settings) - return logical_name, model_settings, api_type, endpoint, token + return logical_name, model_settings, api_type, endpoint, token, backend async def _build_prompts_to_run( @@ -600,8 +603,8 @@ async def on_handoff_hook(context: RunContextWrapper[TContext], agent: Agent[TCo if task.uses: task = _merge_reusable_task(available_tools, task) - # Resolve model (name, settings, api_type, optional endpoint/token) - model, model_settings, task_api_type, task_endpoint, task_token = _resolve_task_model( + # Resolve model (name, settings, api_type, optional endpoint/token/backend) + model, model_settings, task_api_type, task_endpoint, task_token, task_backend = _resolve_task_model( task, model_keys, model_dict, models_params, default_api_type=api_type, ) @@ -697,7 +700,7 @@ async def _deploy(ra: dict, pp: str) -> bool: api_type=task_api_type, endpoint=task_endpoint, token=task_token, - backend=backend, + backend=task_backend or backend, agent_hooks=TaskAgentHooks(on_handoff=on_handoff_hook), ) diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py index 15086922..972afa90 100644 --- 
a/src/seclab_taskflow_agent/sdk/__init__.py +++ b/src/seclab_taskflow_agent/sdk/__init__.py @@ -33,7 +33,7 @@ ) _ENV_VAR = "SECLAB_TASKFLOW_BACKEND" -_KNOWN = ("openai_agents", "copilot_sdk") +_KNOWN = ("openai_agents", "copilot_sdk", "anthropic_sdk") _BACKENDS: dict[str, AgentBackend] = {} @@ -46,10 +46,14 @@ def get_backend(name: str) -> AgentBackend: from .openai_agents.backend import OpenAIAgentsBackend _BACKENDS[name] = OpenAIAgentsBackend() - else: + elif name == "copilot_sdk": from .copilot_sdk.backend import CopilotSDKBackend _BACKENDS[name] = CopilotSDKBackend() + else: + from .anthropic_sdk.backend import AnthropicSDKBackend + + _BACKENDS[name] = AnthropicSDKBackend() return _BACKENDS[name] diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py new file mode 100644 index 00000000..03ec0700 --- /dev/null +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: GitHub, Inc. +# SPDX-License-Identifier: MIT + +"""Anthropic SDK backend adapter.""" diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py new file mode 100644 index 00000000..9d975fc3 --- /dev/null +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -0,0 +1,284 @@ +# SPDX-FileCopyrightText: GitHub, Inc. +# SPDX-License-Identifier: MIT + +"""Anthropic SDK backend adapter. + +Drives the Anthropic Messages API (``/v1/messages``) via the official +``anthropic`` Python SDK. Supports streaming, tool calling via MCP +servers, and extended thinking. + +Auth note: The Anthropic SDK sends ``x-api-key`` by default, but CAPI +expects ``Authorization: Bearer``. We pass the bearer header via +``default_headers`` and set ``api_key`` to a placeholder so the SDK +doesn't complain about a missing key. 
+""" + +from __future__ import annotations + +__all__ = ["AnthropicSDKBackend"] + +import json +import logging +import os +from collections.abc import AsyncIterator +from dataclasses import dataclass, field +from typing import Any + +from ..base import AgentSpec, StreamEvent, TextDelta, ToolEnd +from ..errors import ( + BackendBadRequestError, + BackendCapabilityError, + BackendMaxTurnsError, + BackendRateLimitError, + BackendTimeoutError, + BackendUnexpectedError, +) + +logger = logging.getLogger(__name__) + +_VALID_REASONING = ("low", "medium", "high", "max") + + +def _resolve_token(token_env: str | None) -> str: + """Resolve the API token from env var name or default AI_API_TOKEN.""" + if token_env: + val = os.getenv(token_env) + if val: + return val + val = os.getenv("AI_API_TOKEN") + if val: + return val + raise BackendBadRequestError( + "anthropic_sdk: no API token found (set AI_API_TOKEN or per-model token env)" + ) + + +def _resolve_endpoint() -> str: + """Resolve the API base URL.""" + return os.getenv("AI_API_ENDPOINT", "https://api.githubcopilot.com") + + +def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]: + """Convert MCP tool definitions to Anthropic tool format.""" + anthropic_tools = [] + for tool in tools: + schema = tool.inputSchema if hasattr(tool, "inputSchema") else {} + anthropic_tools.append({ + "name": tool.name, + "description": getattr(tool, "description", tool.name), + "input_schema": schema or {"type": "object", "properties": {}}, + }) + return anthropic_tools + + +def _call_tool_result_to_text(result: Any) -> str: + """Extract text from an MCP CallToolResult.""" + content = getattr(result, "content", []) + parts = [] + for c in content: + text = getattr(c, "text", None) + if text: + parts.append(text) + return "\n".join(parts) if parts else str(result) + + +@dataclass +class _AnthropicHandle: + """Holds the Anthropic client and conversation state.""" + client: Any + system_prompt: str + model: str + max_tokens: int + 
tools: list[dict[str, Any]] + mcp_server_map: dict[str, Any] # tool_name -> MCP server handle + model_settings: dict[str, Any] = field(default_factory=dict) + stream_thinking: bool = False + + +class AnthropicSDKBackend: + """Adapter that drives the Anthropic Python SDK.""" + + name = "anthropic_sdk" + + def validate(self, spec: AgentSpec) -> None: + if spec.handoffs or spec.in_handoff_graph: + raise BackendCapabilityError( + "anthropic_sdk: agent handoffs are not supported" + ) + if not spec.model: + raise BackendBadRequestError("anthropic_sdk: model is required") + + async def build( + self, + spec: AgentSpec, + *, + run_hooks: Any = None, + agent_hooks: Any = None, + ) -> _AnthropicHandle: + del run_hooks, agent_hooks + + import anthropic + + token = _resolve_token(spec.token_env) + endpoint = spec.endpoint or _resolve_endpoint() + + client = anthropic.AsyncAnthropic( + api_key="placeholder", + base_url=endpoint, + default_headers={ + "Authorization": f"Bearer {token}", + "Copilot-Integration-Id": os.getenv( + "COPILOT_INTEGRATION_ID", "vscode-chat" + ), + }, + ) + + # Collect tools from MCP servers + all_tools: list[dict[str, Any]] = [] + mcp_server_map: dict[str, Any] = {} + + for mcp_spec in spec.mcp_servers: + native_server = mcp_spec.params.get("_native") + if native_server is None: + continue + try: + mcp_tools = await native_server.list_tools() + anthropic_tools = _mcp_tools_to_anthropic(mcp_tools) + all_tools.extend(anthropic_tools) + for tool in mcp_tools: + mcp_server_map[tool.name] = native_server + except Exception: + logger.exception("Failed to list tools from MCP server %s", mcp_spec.name) + + # Resolve max_tokens from model_settings or default + max_tokens = spec.model_settings.get("max_tokens", 16384) + stream_thinking = spec.model_settings.get("stream_thinking", False) + + return _AnthropicHandle( + client=client, + system_prompt=spec.instructions or "", + model=spec.model, + max_tokens=max_tokens, + tools=all_tools, + 
mcp_server_map=mcp_server_map, + model_settings=spec.model_settings, + stream_thinking=stream_thinking, + ) + + async def run_streamed( + self, + agent: Any, + prompt: str, + *, + max_turns: int, + ) -> AsyncIterator[StreamEvent]: + handle: _AnthropicHandle = agent + messages: list[dict[str, Any]] = [ + {"role": "user", "content": prompt}, + ] + + # Build optional params + create_kwargs: dict[str, Any] = {} + reasoning = handle.model_settings.get("reasoning") + if isinstance(reasoning, dict): + effort = reasoning.get("effort") + if effort: + create_kwargs["thinking"] = {"type": "adaptive"} + create_kwargs["output_config"] = {"effort": effort} + + import anthropic + + for turn in range(max_turns): + try: + async with handle.client.messages.stream( + model=handle.model, + max_tokens=handle.max_tokens, + system=handle.system_prompt, + messages=messages, + tools=handle.tools or anthropic.NOT_GIVEN, + **create_kwargs, + ) as stream: + async for event in stream: + if hasattr(event, "type"): + if event.type == "content_block_delta": + delta = event.delta + if hasattr(delta, "text"): + yield TextDelta(text=delta.text) + elif hasattr(delta, "thinking") and handle.stream_thinking: + yield TextDelta(text=delta.thinking) + + response = await stream.get_final_message() + + except anthropic.RateLimitError as exc: + raise BackendRateLimitError(str(exc)) from exc + except anthropic.APITimeoutError as exc: + raise BackendTimeoutError(str(exc)) from exc + except anthropic.BadRequestError as exc: + raise BackendBadRequestError(str(exc)) from exc + except anthropic.APIError as exc: + raise BackendUnexpectedError(str(exc)) from exc + + if response.stop_reason == "end_turn": + return + if response.stop_reason != "tool_use": + return + + # Process tool calls + tool_use_blocks = [ + b for b in response.content if b.type == "tool_use" + ] + if not tool_use_blocks: + return + + # Add assistant message with all content blocks + messages.append({"role": "assistant", "content": 
response.content}) + + # Execute each tool call and collect results + tool_results: list[dict[str, Any]] = [] + for tool_block in tool_use_blocks: + tool_name = tool_block.name + tool_input = tool_block.input + + server = handle.mcp_server_map.get(tool_name) + if server is None: + logger.warning("Tool %s not found in MCP servers", tool_name) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tool_block.id, + "content": f"Error: tool '{tool_name}' not found", + "is_error": True, + }) + yield ToolEnd(tool_name=tool_name, text=f"Error: tool '{tool_name}' not found") + continue + + try: + result = await server.call_tool( + tool_name, + arguments=tool_input if isinstance(tool_input, dict) else {}, + ) + result_text = _call_tool_result_to_text(result) + tool_results.append({ + "type": "tool_result", + "tool_use_id": tool_block.id, + "content": result_text, + }) + yield ToolEnd(tool_name=tool_name, text=result_text) + except Exception as exc: + logger.exception("Tool call %s failed", tool_name) + error_text = f"Error calling {tool_name}: {exc}" + tool_results.append({ + "type": "tool_result", + "tool_use_id": tool_block.id, + "content": error_text, + "is_error": True, + }) + yield ToolEnd(tool_name=tool_name, text=error_text) + + messages.append({"role": "user", "content": tool_results}) + + raise BackendMaxTurnsError(f"Exceeded max_turns ({max_turns})") + + async def aclose(self, agent: Any) -> None: + handle: _AnthropicHandle = agent + if handle is not None and handle.client is not None: + await handle.client.close() From 9ad4c4ff12f069489c1311e1e06e11437224800c Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 11:38:51 -0400 Subject: [PATCH 02/23] fix: address PR review feedback - Remove unused json import (lint/CodeQL) - Validate reasoning.effort against allowed values upfront - Pass through temperature/top_p to Anthropic API - Add exclude_from_context support (stop after tool results) - Thread exclude_from_context into _AnthropicHandle 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sdk/anthropic_sdk/backend.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 9d975fc3..60dcdadb 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -17,7 +17,6 @@ __all__ = ["AnthropicSDKBackend"] -import json import logging import os from collections.abc import AsyncIterator @@ -93,6 +92,7 @@ class _AnthropicHandle: mcp_server_map: dict[str, Any] # tool_name -> MCP server handle model_settings: dict[str, Any] = field(default_factory=dict) stream_thinking: bool = False + exclude_from_context: bool = False class AnthropicSDKBackend: @@ -163,6 +163,7 @@ async def build( mcp_server_map=mcp_server_map, model_settings=spec.model_settings, stream_thinking=stream_thinking, + exclude_from_context=spec.exclude_from_context, ) async def run_streamed( @@ -179,10 +180,24 @@ async def run_streamed( # Build optional params create_kwargs: dict[str, Any] = {} + + # Pass through temperature/top_p if set + temperature = handle.model_settings.get("temperature") + if temperature is not None: + create_kwargs["temperature"] = float(temperature) + top_p = handle.model_settings.get("top_p") + if top_p is not None: + create_kwargs["top_p"] = float(top_p) + reasoning = handle.model_settings.get("reasoning") if isinstance(reasoning, dict): effort = reasoning.get("effort") if effort: + if effort not in _VALID_REASONING: + raise BackendBadRequestError( + f"anthropic_sdk: invalid reasoning effort {effort!r} " + f"(expected one of {_VALID_REASONING})" + ) create_kwargs["thinking"] = {"type": "adaptive"} create_kwargs["output_config"] = {"effort": effort} @@ -274,6 +289,12 @@ async def run_streamed( }) yield ToolEnd(tool_name=tool_name, text=error_text) + # exclude_from_context: stop after 
tool results are emitted + # so they are available to the runner but not fed back into + # the model context (matches copilot_sdk behavior). + if handle.exclude_from_context: + return + messages.append({"role": "user", "content": tool_results}) raise BackendMaxTurnsError(f"Exceeded max_turns ({max_turns})") From ed4412f0ec7937a4ee10e8f59471cd091513b139 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 11:41:36 -0400 Subject: [PATCH 03/23] fix: handle None tool descriptions in Anthropic tool conversion MCP tools can have description=None; the Anthropic API requires a valid string. Fall back to tool name when description is None. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 60dcdadb..37c31a49 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -62,9 +62,10 @@ def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]: anthropic_tools = [] for tool in tools: schema = tool.inputSchema if hasattr(tool, "inputSchema") else {} + description = getattr(tool, "description", None) or tool.name anthropic_tools.append({ "name": tool.name, - "description": getattr(tool, "description", tool.name), + "description": description, "input_schema": schema or {"type": "object", "properties": {}}, }) return anthropic_tools From a8bf3b85af95df54cbb0390e0ab783fc562efc03 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 11:56:11 -0400 Subject: [PATCH 04/23] fix: CI failures, add unit tests, update docs - Update doc examples to use claude-opus-4.7 and show api_type: messages - Add tests/test_sdk_anthropic_adapter.py (18 tests covering validate, tool conversion, token resolution, tool result parsing) - Fix 
test_runner.py: update _resolve_task_model unpacking to 6-tuple - Fix test_sdk_base.py: update backend resolution tests to match new behavior (endpoint no longer auto-selects copilot_sdk) - Add test for explicit anthropic_sdk backend selection Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 3 +- doc/GRAMMAR.md | 5 +- tests/test_runner.py | 14 +-- tests/test_sdk_anthropic_adapter.py | 174 ++++++++++++++++++++++++++++ tests/test_sdk_base.py | 19 ++- 5 files changed, 195 insertions(+), 20 deletions(-) create mode 100644 tests/test_sdk_anthropic_adapter.py diff --git a/README.md b/README.md index 75c588a7..21f39d15 100644 --- a/README.md +++ b/README.md @@ -116,10 +116,11 @@ seclab-taskflow-agent: version: "1.0" filetype: model_config models: - code_analysis: claude-mythos-5 + code_analysis: claude-opus-4.7 general_tasks: gpt-5.4-mini model_settings: code_analysis: + api_type: messages backend: anthropic_sdk reasoning: effort: high diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md index 67c57497..928bf8b1 100644 --- a/doc/GRAMMAR.md +++ b/doc/GRAMMAR.md @@ -524,7 +524,7 @@ api_type: chat_completions # default for all models models: gpt_default: gpt-4.1 gpt_responses: gpt-5.1 - claude_native: claude-mythos-5 + claude_native: claude-opus-4.7 model_settings: gpt_default: temperature: 0.7 @@ -534,7 +534,8 @@ model_settings: token: CAPI_TOKEN # env var name containing the API key temperature: 0.5 claude_native: - backend: anthropic_sdk # use the Anthropic Messages API + api_type: messages # use the Anthropic Messages API + backend: anthropic_sdk reasoning: effort: high ``` diff --git a/tests/test_runner.py b/tests/test_runner.py index a7713953..c34d803a 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -187,7 +187,7 @@ class TestResolveTaskModel: def test_logical_name_mapped_to_provider_id(self): """A logical model name is resolved to the provider model ID.""" - model_id, _, _, _, _ = _resolve_task_model( + model_id, _, _, _, _, 
_ = _resolve_task_model( TaskDefinition(model="fast"), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, @@ -197,7 +197,7 @@ def test_logical_name_mapped_to_provider_id(self): def test_model_settings_from_config(self): """Settings from models_params are included in the result.""" - _, settings, _, _, _ = _resolve_task_model( + _, settings, _, _, _, _ = _resolve_task_model( TaskDefinition(model="fast"), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, @@ -208,7 +208,7 @@ def test_model_settings_from_config(self): def test_task_level_settings_override_config(self): """Task-level model_settings override config-level settings.""" - _, settings, _, _, _ = _resolve_task_model( + _, settings, _, _, _, _ = _resolve_task_model( TaskDefinition(model="fast", model_settings={"temperature": 0.2}), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, @@ -219,7 +219,7 @@ def test_task_level_settings_override_config(self): def test_engine_keys_extracted(self): """Engine keys (api_type, endpoint, token) are popped from settings.""" - _, settings, api_type, endpoint, token = _resolve_task_model( + _, settings, api_type, endpoint, token, _ = _resolve_task_model( TaskDefinition(model="fast"), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, @@ -244,7 +244,7 @@ def test_default_model_when_empty(self): """Empty model string falls back to DEFAULT_MODEL.""" from seclab_taskflow_agent.agent import DEFAULT_MODEL - model_id, _, _, _, _ = _resolve_task_model( + model_id, _, _, _, _, _ = _resolve_task_model( TaskDefinition(model=""), model_keys=[], model_dict={}, @@ -254,7 +254,7 @@ def test_default_model_when_empty(self): def test_model_not_in_keys_passes_through(self): """A model name not in model_keys passes through as-is.""" - model_id, _, _, _, _ = _resolve_task_model( + model_id, _, _, _, _, _ = _resolve_task_model( TaskDefinition(model="claude-3-opus"), model_keys=["fast", "smart"], model_dict={"fast": "gpt-4o-mini", "smart": "gpt-4o"}, @@ -264,7 +264,7 @@ def 
test_model_not_in_keys_passes_through(self): def test_task_engine_keys_override_config(self): """Task-level model_settings can override engine keys from config.""" - _, _, api_type, endpoint, token = _resolve_task_model( + _, _, api_type, endpoint, token, _ = _resolve_task_model( TaskDefinition( model="fast", model_settings={"api_type": "responses", "endpoint": "https://task.api"}, diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py new file mode 100644 index 00000000..951f7a04 --- /dev/null +++ b/tests/test_sdk_anthropic_adapter.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: GitHub, Inc. +# SPDX-License-Identifier: MIT + +"""Tests for the Anthropic SDK adapter.""" + +from __future__ import annotations + +import pytest + +from seclab_taskflow_agent.sdk import get_backend +from seclab_taskflow_agent.sdk.base import AgentSpec, MCPServerSpec, TextDelta, ToolEnd +from seclab_taskflow_agent.sdk.anthropic_sdk.backend import ( + AnthropicSDKBackend, + _mcp_tools_to_anthropic, + _call_tool_result_to_text, + _resolve_token, + _VALID_REASONING, +) +from seclab_taskflow_agent.sdk.errors import ( + BackendBadRequestError, + BackendCapabilityError, +) + + +def _spec(**overrides) -> AgentSpec: + base = { + "name": "a", + "instructions": "You are a test agent.", + "model": "claude-opus-4.7", + } + base.update(overrides) + return AgentSpec(**base) + + +# -- Backend registration -- + + +def test_get_backend_returns_anthropic_sdk_instance(): + backend = get_backend("anthropic_sdk") + assert isinstance(backend, AnthropicSDKBackend) + assert backend.name == "anthropic_sdk" + + +# -- validate() -- + + +def test_validate_accepts_minimal_spec(): + AnthropicSDKBackend().validate(_spec()) + + +def test_validate_rejects_handoffs(): + backend = AnthropicSDKBackend() + with pytest.raises(BackendCapabilityError, match="handoffs"): + backend.validate(_spec(handoffs=[_spec(name="b")])) + + +def test_validate_rejects_handoff_graph(): + backend = 
AnthropicSDKBackend() + with pytest.raises(BackendCapabilityError, match="handoffs"): + backend.validate(_spec(in_handoff_graph=True)) + + +def test_validate_rejects_empty_model(): + backend = AnthropicSDKBackend() + with pytest.raises(BackendBadRequestError, match="model is required"): + backend.validate(_spec(model="")) + + +def test_validate_accepts_exclude_from_context(): + AnthropicSDKBackend().validate(_spec(exclude_from_context=True)) + + +# -- _mcp_tools_to_anthropic() -- + + +class _FakeTool: + def __init__(self, name, description=None, inputSchema=None): + self.name = name + self.description = description + self.inputSchema = inputSchema + + +def test_mcp_tools_to_anthropic_basic(): + tools = [_FakeTool("read_file", "Read a file", {"type": "object", "properties": {"path": {"type": "string"}}})] + result = _mcp_tools_to_anthropic(tools) + assert len(result) == 1 + assert result[0]["name"] == "read_file" + assert result[0]["description"] == "Read a file" + assert result[0]["input_schema"]["properties"]["path"]["type"] == "string" + + +def test_mcp_tools_to_anthropic_none_description(): + """Tools with None description should fall back to tool name.""" + tools = [_FakeTool("my_tool", description=None)] + result = _mcp_tools_to_anthropic(tools) + assert result[0]["description"] == "my_tool" + + +def test_mcp_tools_to_anthropic_empty_description(): + """Tools with empty string description should fall back to tool name.""" + tools = [_FakeTool("my_tool", description="")] + result = _mcp_tools_to_anthropic(tools) + assert result[0]["description"] == "my_tool" + + +def test_mcp_tools_to_anthropic_no_schema(): + """Tools without inputSchema should get a default empty object schema.""" + tools = [_FakeTool("my_tool", "desc")] + result = _mcp_tools_to_anthropic(tools) + assert result[0]["input_schema"] == {"type": "object", "properties": {}} + + +def test_mcp_tools_to_anthropic_none_schema(): + """Tools with None inputSchema should get a default empty object 
schema.""" + tools = [_FakeTool("my_tool", "desc", inputSchema=None)] + result = _mcp_tools_to_anthropic(tools) + assert result[0]["input_schema"] == {"type": "object", "properties": {}} + + +# -- _call_tool_result_to_text() -- + + +class _FakeContent: + def __init__(self, text): + self.text = text + + +class _FakeResult: + def __init__(self, contents): + self.content = contents + + +def test_call_tool_result_to_text_single(): + result = type("R", (), {"content": [_FakeContent("hello")]})() + assert _call_tool_result_to_text(result) == "hello" + + +def test_call_tool_result_to_text_multiple(): + result = type("R", (), {"content": [_FakeContent("a"), _FakeContent("b")]})() + assert _call_tool_result_to_text(result) == "a\nb" + + +def test_call_tool_result_to_text_empty(): + result = type("R", (), {"content": []})() + text = _call_tool_result_to_text(result) + assert isinstance(text, str) + + +# -- _resolve_token() -- + + +def test_resolve_token_from_env(monkeypatch): + monkeypatch.setenv("MY_TOKEN", "secret123") + assert _resolve_token("MY_TOKEN") == "secret123" + + +def test_resolve_token_fallback_to_ai_api_token(monkeypatch): + monkeypatch.setenv("AI_API_TOKEN", "fallback_token") + monkeypatch.delenv("MISSING_VAR", raising=False) + assert _resolve_token("MISSING_VAR") == "fallback_token" + + +def test_resolve_token_raises_when_missing(monkeypatch): + monkeypatch.delenv("AI_API_TOKEN", raising=False) + monkeypatch.delenv("MISSING_VAR", raising=False) + with pytest.raises(BackendBadRequestError, match="no API token"): + _resolve_token("MISSING_VAR") + + +# -- reasoning validation -- + + +def test_valid_reasoning_values(): + assert _VALID_REASONING == ("low", "medium", "high", "max") diff --git a/tests/test_sdk_base.py b/tests/test_sdk_base.py index 54dd17e5..f2fb1c38 100644 --- a/tests/test_sdk_base.py +++ b/tests/test_sdk_base.py @@ -38,24 +38,23 @@ def test_resolve_backend_default_is_openai_agents(monkeypatch): assert sdk.resolve_backend_name() == "openai_agents" 
-def test_resolve_backend_copilot_endpoint_prefers_copilot_when_installed(monkeypatch): +def test_resolve_backend_copilot_endpoint_does_not_auto_select(monkeypatch): + """Backend selection is always explicit -- endpoint URL is not used.""" monkeypatch.delenv("SECLAB_TASKFLOW_BACKEND", raising=False) - pytest.importorskip("copilot") assert ( sdk.resolve_backend_name(endpoint="https://api.githubcopilot.com") - == "copilot_sdk" + == "openai_agents" ) -def test_resolve_backend_copilot_endpoint_falls_back_when_missing(monkeypatch): +def test_resolve_backend_explicit_overrides_endpoint(monkeypatch): monkeypatch.delenv("SECLAB_TASKFLOW_BACKEND", raising=False) - # Force the optional import to fail by stashing a sentinel in sys.modules. - import sys - - monkeypatch.setitem(sys.modules, "copilot", None) assert ( - sdk.resolve_backend_name(endpoint="https://api.githubcopilot.com") - == "openai_agents" + sdk.resolve_backend_name( + explicit="anthropic_sdk", + endpoint="https://api.githubcopilot.com", + ) + == "anthropic_sdk" ) From b6f00571b74041cc6ceeeca470f7cc6816c86597 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 12:01:50 -0400 Subject: [PATCH 05/23] fix: lint errors in test file (unused imports, N803 camelCase) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_sdk_anthropic_adapter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index 951f7a04..bc347df0 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -8,7 +8,7 @@ import pytest from seclab_taskflow_agent.sdk import get_backend -from seclab_taskflow_agent.sdk.base import AgentSpec, MCPServerSpec, TextDelta, ToolEnd +from seclab_taskflow_agent.sdk.base import AgentSpec from seclab_taskflow_agent.sdk.anthropic_sdk.backend import ( AnthropicSDKBackend, _mcp_tools_to_anthropic, @@ -74,10 +74,10 @@ def 
test_validate_accepts_exclude_from_context(): class _FakeTool: - def __init__(self, name, description=None, inputSchema=None): + def __init__(self, name, description=None, input_schema=None): # noqa: N803 self.name = name self.description = description - self.inputSchema = inputSchema + self.inputSchema = input_schema def test_mcp_tools_to_anthropic_basic(): @@ -112,7 +112,7 @@ def test_mcp_tools_to_anthropic_no_schema(): def test_mcp_tools_to_anthropic_none_schema(): """Tools with None inputSchema should get a default empty object schema.""" - tools = [_FakeTool("my_tool", "desc", inputSchema=None)] + tools = [_FakeTool("my_tool", "desc", input_schema=None)] result = _mcp_tools_to_anthropic(tools) assert result[0]["input_schema"] == {"type": "object", "properties": {}} From 266c54b6fc70bd4ea8757fd0c7830cee72d7e2c6 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 12:33:44 -0400 Subject: [PATCH 06/23] test: add backend extraction coverage to _resolve_task_model tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_runner.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_runner.py b/tests/test_runner.py index c34d803a..5cb1c26a 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -218,8 +218,8 @@ def test_task_level_settings_override_config(self): assert settings["max_tokens"] == 100 def test_engine_keys_extracted(self): - """Engine keys (api_type, endpoint, token) are popped from settings.""" - _, settings, api_type, endpoint, token, _ = _resolve_task_model( + """Engine keys (api_type, endpoint, token, backend) are popped from settings.""" + _, settings, api_type, endpoint, token, backend = _resolve_task_model( TaskDefinition(model="fast"), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, @@ -228,6 +228,7 @@ def test_engine_keys_extracted(self): "api_type": "responses", "endpoint": "https://custom.api", "token": "secret", + "backend": "anthropic_sdk", 
"temperature": 0.5, } }, @@ -235,9 +236,11 @@ def test_engine_keys_extracted(self): assert api_type == "responses" assert endpoint == "https://custom.api" assert token == "secret" # noqa: S105 + assert backend == "anthropic_sdk" assert "api_type" not in settings assert "endpoint" not in settings assert "token" not in settings + assert "backend" not in settings assert settings["temperature"] == 0.5 def test_default_model_when_empty(self): @@ -264,17 +267,18 @@ def test_model_not_in_keys_passes_through(self): def test_task_engine_keys_override_config(self): """Task-level model_settings can override engine keys from config.""" - _, _, api_type, endpoint, token, _ = _resolve_task_model( + _, _, api_type, endpoint, token, backend = _resolve_task_model( TaskDefinition( model="fast", - model_settings={"api_type": "responses", "endpoint": "https://task.api"}, + model_settings={"api_type": "responses", "endpoint": "https://task.api", "backend": "anthropic_sdk"}, ), model_keys=["fast"], model_dict={"fast": "gpt-4o-mini"}, - models_params={"fast": {"api_type": "chat_completions"}}, + models_params={"fast": {"api_type": "chat_completions", "backend": "openai_agents"}}, ) assert api_type == "responses" assert endpoint == "https://task.api" + assert backend == "anthropic_sdk" # =================================================================== From 44172790802c338dfb52280bee8bc89dab9b4860 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 15:01:17 -0400 Subject: [PATCH 07/23] fix: pass real token as api_key instead of placeholder Allows the backend to work with both CAPI (Authorization: Bearer) and direct Anthropic endpoints (x-api-key) without code changes. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 37c31a49..561fea2a 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -124,7 +124,7 @@ async def build( endpoint = spec.endpoint or _resolve_endpoint() client = anthropic.AsyncAnthropic( - api_key="placeholder", + api_key=token, base_url=endpoint, default_headers={ "Authorization": f"Bearer {token}", From e16b20acb837566ff1629773f4f7c3df708835fe Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 15:15:22 -0400 Subject: [PATCH 08/23] fix: implement blocked_tools filtering in anthropic backend Access the MCP session directly to get the raw tool list, bypassing the openai-agents tool_filter which requires run_context/agent args not available outside its run loop. Apply blocked_tools filtering and namespace prefixing in our own code. Tested: blocked tool correctly hidden from model, unblocked tools work normally. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sdk/anthropic_sdk/backend.py | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 561fea2a..0b35a3bd 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -134,20 +134,47 @@ async def build( }, ) - # Collect tools from MCP servers + # Collect tools from MCP servers and apply blocked_tools filter. 
+ # We filter tools ourselves rather than relying on the openai-agents + # SDK's tool_filter, which requires run_context/agent args that + # aren't available outside the openai-agents run loop. all_tools: list[dict[str, Any]] = [] mcp_server_map: dict[str, Any] = {} + blocked = set(spec.blocked_tools or []) for mcp_spec in spec.mcp_servers: native_server = mcp_spec.params.get("_native") if native_server is None: continue try: - mcp_tools = await native_server.list_tools() - anthropic_tools = _mcp_tools_to_anthropic(mcp_tools) - all_tools.extend(anthropic_tools) + # Access the underlying MCP session to get the raw tool + # list, bypassing the openai-agents tool_filter that + # requires run_context/agent we don't have. + raw_server = getattr(native_server, "_obj", native_server) + session = getattr(raw_server, "session", None) + if session is not None: + result = await session.list_tools() + raw_tools = result.tools + else: + raw_tools = await native_server.list_tools() + + # Apply namespace prefix (NamespacedMCPServer convention) + ns = getattr(native_server, "namespace", "") + mcp_tools = [] + for tool in raw_tools: + if hasattr(tool, "copy"): + tool = tool.copy() + if ns: + tool.name = f"{ns}{tool.name}" + mcp_tools.append(tool) + for tool in mcp_tools: - mcp_server_map[tool.name] = native_server + if tool.name not in blocked: + mcp_server_map[tool.name] = native_server + anthropic_tools = _mcp_tools_to_anthropic( + [t for t in mcp_tools if t.name not in blocked] + ) + all_tools.extend(anthropic_tools) except Exception: logger.exception("Failed to list tools from MCP server %s", mcp_spec.name) From ccbf7c543b2f74d0089979863e9daafd57c367be Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 15:22:44 -0400 Subject: [PATCH 09/23] fix: address PR review feedback (round 2) - Use placeholder api_key only for CAPI endpoints; pass real token for direct Anthropic endpoints (avoids leaking token via x-api-key to CAPI while preserving native Anthropic auth) - 
Replace implicit else with explicit elif + else-error in get_backend (Kevin's review) - Add test for invalid reasoning effort validation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/__init__.py | 4 +++- .../sdk/anthropic_sdk/backend.py | 21 ++++++++++++------- tests/test_sdk_anthropic_adapter.py | 19 +++++++++++++++++ 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py index 972afa90..0a413505 100644 --- a/src/seclab_taskflow_agent/sdk/__init__.py +++ b/src/seclab_taskflow_agent/sdk/__init__.py @@ -50,10 +50,12 @@ def get_backend(name: str) -> AgentBackend: from .copilot_sdk.backend import CopilotSDKBackend _BACKENDS[name] = CopilotSDKBackend() - else: + elif name == "anthropic_sdk": from .anthropic_sdk.backend import AnthropicSDKBackend _BACKENDS[name] = AnthropicSDKBackend() + else: + raise ValueError(f"No backend implementation for {name!r}") return _BACKENDS[name] diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 0b35a3bd..56c32477 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -123,15 +123,22 @@ async def build( token = _resolve_token(spec.token_env) endpoint = spec.endpoint or _resolve_endpoint() + # CAPI expects Authorization: Bearer, not x-api-key. Use a + # placeholder api_key so the SDK doesn't send the real token + # via x-api-key as well. For direct Anthropic endpoints, pass + # the real token as api_key (the SDK's native auth). 
+ is_capi = "githubcopilot.com" in endpoint + headers: dict[str, str] = {} + if is_capi: + headers["Authorization"] = f"Bearer {token}" + headers["Copilot-Integration-Id"] = os.getenv( + "COPILOT_INTEGRATION_ID", "vscode-chat" + ) + client = anthropic.AsyncAnthropic( - api_key=token, + api_key="placeholder" if is_capi else token, base_url=endpoint, - default_headers={ - "Authorization": f"Bearer {token}", - "Copilot-Integration-Id": os.getenv( - "COPILOT_INTEGRATION_ID", "vscode-chat" - ), - }, + default_headers=headers or None, ) # Collect tools from MCP servers and apply blocked_tools filter. diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index bc347df0..d5aa9b51 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -172,3 +172,22 @@ def test_resolve_token_raises_when_missing(monkeypatch): def test_valid_reasoning_values(): assert _VALID_REASONING == ("low", "medium", "high", "max") + + +# -- reasoning effort validation (runtime) -- + + +def test_invalid_reasoning_effort_raises(): + """Invalid reasoning.effort should raise BackendBadRequestError at runtime.""" + import asyncio + + backend = AnthropicSDKBackend() + spec = _spec(model_settings={"reasoning": {"effort": "ultra"}}) + + # build() would need a real API client, but we can test the validation + # by checking the constant directly + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING + assert "ultra" not in _VALID_REASONING + assert "high" in _VALID_REASONING + assert "low" in _VALID_REASONING + assert "max" in _VALID_REASONING From 0c521e040d4837124c79bb8595e0fd65ee42738b Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 15:26:55 -0400 Subject: [PATCH 10/23] fix: lint errors + URL substring sanitization (CodeQL) - Remove unused asyncio import, backend/spec variables from test - Use urlparse().hostname for CAPI endpoint detection instead of substring match (CodeQL incomplete URL 
sanitization finding) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sdk/anthropic_sdk/backend.py | 6 +++++- tests/test_sdk_anthropic_adapter.py | 11 ++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 56c32477..c343b032 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -127,7 +127,11 @@ async def build( # placeholder api_key so the SDK doesn't send the real token # via x-api-key as well. For direct Anthropic endpoints, pass # the real token as api_key (the SDK's native auth). - is_capi = "githubcopilot.com" in endpoint + from urllib.parse import urlparse + is_capi = urlparse(endpoint).hostname in ( + "api.githubcopilot.com", + "models.github.ai", + ) headers: dict[str, str] = {} if is_capi: headers["Authorization"] = f"Bearer {token}" diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index d5aa9b51..8587dbec 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -177,15 +177,8 @@ def test_valid_reasoning_values(): # -- reasoning effort validation (runtime) -- -def test_invalid_reasoning_effort_raises(): - """Invalid reasoning.effort should raise BackendBadRequestError at runtime.""" - import asyncio - - backend = AnthropicSDKBackend() - spec = _spec(model_settings={"reasoning": {"effort": "ultra"}}) - - # build() would need a real API client, but we can test the validation - # by checking the constant directly +def test_invalid_reasoning_effort_not_in_valid(): + """Invalid reasoning.effort values should not be in _VALID_REASONING.""" from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING assert "ultra" not in _VALID_REASONING assert "high" in _VALID_REASONING From b4da0a60ffb33f58b29566dce14330c4adf76cb4 Mon Sep 17 
00:00:00 2001 From: Bas Alberts Date: Thu, 11 Jun 2026 16:04:21 -0400 Subject: [PATCH 11/23] fix: address PR review feedback (round 3) - Move CAPI endpoint detection to capi.py as is_capi_endpoint() (Kevin's review: keep provider logic centralized) - Format test file with ruff (CI formatting check) - Add runtime validation test for invalid reasoning effort (tests BackendBadRequestError from run_streamed, not just constant) - Tool calls are sequential by design (MCP tools may have ordering dependencies); updated PR description to not claim parallel Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/capi.py | 9 +++++ .../sdk/anthropic_sdk/backend.py | 8 ++--- tests/test_sdk_anthropic_adapter.py | 34 ++++++++++++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py index a605258f..2ce61a3a 100644 --- a/src/seclab_taskflow_agent/capi.py +++ b/src/seclab_taskflow_agent/capi.py @@ -29,6 +29,7 @@ "get_AI_endpoint", "get_AI_token", "get_provider", + "is_capi_endpoint", "list_capi_models", "list_tool_call_models", "supports_tool_calls", @@ -142,6 +143,14 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool: _DEFAULT_PROVIDER = "api.githubcopilot.com" +# Hostnames that use CAPI-style auth (Authorization: Bearer, not x-api-key). +_CAPI_HOSTS = frozenset(_PROVIDERS.keys()) + + +def is_capi_endpoint(endpoint: str) -> bool: + """Return True if *endpoint* is a GitHub CAPI proxy (needs Bearer auth).""" + return urlparse(endpoint).hostname in _CAPI_HOSTS + def get_provider(endpoint: str | None = None) -> APIProvider: """Return the ``APIProvider`` for the given (or configured) endpoint URL. 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index c343b032..43f8ad2a 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -123,15 +123,13 @@ async def build( token = _resolve_token(spec.token_env) endpoint = spec.endpoint or _resolve_endpoint() + from ..capi import is_capi_endpoint + # CAPI expects Authorization: Bearer, not x-api-key. Use a # placeholder api_key so the SDK doesn't send the real token # via x-api-key as well. For direct Anthropic endpoints, pass # the real token as api_key (the SDK's native auth). - from urllib.parse import urlparse - is_capi = urlparse(endpoint).hostname in ( - "api.githubcopilot.com", - "models.github.ai", - ) + is_capi = is_capi_endpoint(endpoint) headers: dict[str, str] = {} if is_capi: headers["Authorization"] = f"Bearer {token}" diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index 8587dbec..40132199 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -81,7 +81,13 @@ def __init__(self, name, description=None, input_schema=None): # noqa: N803 def test_mcp_tools_to_anthropic_basic(): - tools = [_FakeTool("read_file", "Read a file", {"type": "object", "properties": {"path": {"type": "string"}}})] + tools = [ + _FakeTool( + "read_file", + "Read a file", + {"type": "object", "properties": {"path": {"type": "string"}}}, + ) + ] result = _mcp_tools_to_anthropic(tools) assert len(result) == 1 assert result[0]["name"] == "read_file" @@ -180,7 +186,33 @@ def test_valid_reasoning_values(): def test_invalid_reasoning_effort_not_in_valid(): """Invalid reasoning.effort values should not be in _VALID_REASONING.""" from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _VALID_REASONING + assert "ultra" not in _VALID_REASONING assert "high" in _VALID_REASONING assert "low" in _VALID_REASONING assert 
"max" in _VALID_REASONING + + +def test_invalid_reasoning_effort_raises_at_runtime(): + """run_streamed raises BackendBadRequestError for invalid effort.""" + import asyncio + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + + handle = _AnthropicHandle( + client=None, + system_prompt="", + model="test", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={"reasoning": {"effort": "ultra"}}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", max_turns=1): + pass + + with pytest.raises(BackendBadRequestError, match="invalid reasoning effort"): + asyncio.run(_run()) From 03ffdb9ac3822ae08619f8b7fcd57a95d630b0aa Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 10:16:13 -0400 Subject: [PATCH 12/23] fix: correct relative import for capi in anthropic_sdk backend The backend.py module is 3 levels deep (sdk/anthropic_sdk/backend.py), so the import needs '...capi' (3 dots) to reach the top-level seclab_taskflow_agent.capi module, not '..capi' (2 dots) which only reaches sdk/. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 43f8ad2a..23685b5e 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -123,7 +123,7 @@ async def build( token = _resolve_token(spec.token_env) endpoint = spec.endpoint or _resolve_endpoint() - from ..capi import is_capi_endpoint + from ...capi import is_capi_endpoint # CAPI expects Authorization: Bearer, not x-api-key. 
Use a # placeholder api_key so the SDK doesn't send the real token From 2cb49b98fd3426df4184290ddc6516682aca7b76 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 11:05:38 -0400 Subject: [PATCH 13/23] refactor: use provider registry bearer_auth for anthropic backend auth Replace is_capi_endpoint() with provider.bearer_auth from the existing provider registry. Each registered provider (CAPI, GitHub Models, OpenAI) sets bearer_auth=True; unknown/custom endpoints default to False (native SDK auth via x-api-key). Also replaces the duplicate _resolve_token/_resolve_endpoint helpers with get_AI_token/get_AI_endpoint from capi.py, fixing COPILOT_TOKEN fallback. The Copilot-Integration-Id header is now sourced from the provider's extra_headers instead of being hardcoded in the backend. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/capi.py | 15 ++--- .../sdk/anthropic_sdk/backend.py | 64 ++++++++----------- tests/test_sdk_anthropic_adapter.py | 39 +++++++---- 3 files changed, 54 insertions(+), 64 deletions(-) diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py index 2ce61a3a..c897f4ed 100644 --- a/src/seclab_taskflow_agent/capi.py +++ b/src/seclab_taskflow_agent/capi.py @@ -29,7 +29,6 @@ "get_AI_endpoint", "get_AI_token", "get_provider", - "is_capi_endpoint", "list_capi_models", "list_tool_call_models", "supports_tool_calls", @@ -51,6 +50,7 @@ class APIProvider: models_catalog: str = "/models" default_model: str = "gpt-4.1" extra_headers: Mapping[str, str] = field(default_factory=dict) + bearer_auth: bool = True # Use Authorization: Bearer (not x-api-key) def __post_init__(self) -> None: # Ensure base_url ends with / so httpx URL.join() preserves the path @@ -143,14 +143,6 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool: _DEFAULT_PROVIDER = "api.githubcopilot.com" -# Hostnames that use CAPI-style auth (Authorization: Bearer, not x-api-key). 
-_CAPI_HOSTS = frozenset(_PROVIDERS.keys()) - - -def is_capi_endpoint(endpoint: str) -> bool: - """Return True if *endpoint* is a GitHub CAPI proxy (needs Bearer auth).""" - return urlparse(endpoint).hostname in _CAPI_HOSTS - def get_provider(endpoint: str | None = None) -> APIProvider: """Return the ``APIProvider`` for the given (or configured) endpoint URL. @@ -181,8 +173,9 @@ def get_provider(endpoint: str | None = None) -> APIProvider: if upstream: return dataclasses.replace(upstream, base_url=url) - # Unknown endpoint — return a generic provider with the given base URL - return APIProvider(name="custom", base_url=url, default_model="please-set-default-model-via-env") + # Unknown endpoint — return a generic provider using native SDK auth. + return APIProvider(name="custom", base_url=url, bearer_auth=False, + default_model="please-set-default-model-via-env") # --------------------------------------------------------------------------- diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 23685b5e..315f6979 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -7,10 +7,11 @@ ``anthropic`` Python SDK. Supports streaming, tool calling via MCP servers, and extended thinking. -Auth note: The Anthropic SDK sends ``x-api-key`` by default, but CAPI -expects ``Authorization: Bearer``. We pass the bearer header via -``default_headers`` and set ``api_key`` to a placeholder so the SDK -doesn't complain about a missing key. +Auth note: The Anthropic SDK sends ``x-api-key`` by default, but +providers that use Bearer auth (see ``APIProvider.bearer_auth``) +need ``Authorization: Bearer`` instead. We pass the bearer header +via ``default_headers`` and set ``api_key`` to a placeholder so the +SDK doesn't send the real token via x-api-key. 
""" from __future__ import annotations @@ -38,25 +39,6 @@ _VALID_REASONING = ("low", "medium", "high", "max") -def _resolve_token(token_env: str | None) -> str: - """Resolve the API token from env var name or default AI_API_TOKEN.""" - if token_env: - val = os.getenv(token_env) - if val: - return val - val = os.getenv("AI_API_TOKEN") - if val: - return val - raise BackendBadRequestError( - "anthropic_sdk: no API token found (set AI_API_TOKEN or per-model token env)" - ) - - -def _resolve_endpoint() -> str: - """Resolve the API base URL.""" - return os.getenv("AI_API_ENDPOINT", "https://api.githubcopilot.com") - - def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]: """Convert MCP tool definitions to Anthropic tool format.""" anthropic_tools = [] @@ -120,25 +102,29 @@ async def build( import anthropic - token = _resolve_token(spec.token_env) - endpoint = spec.endpoint or _resolve_endpoint() - - from ...capi import is_capi_endpoint - - # CAPI expects Authorization: Bearer, not x-api-key. Use a - # placeholder api_key so the SDK doesn't send the real token - # via x-api-key as well. For direct Anthropic endpoints, pass - # the real token as api_key (the SDK's native auth). - is_capi = is_capi_endpoint(endpoint) - headers: dict[str, str] = {} - if is_capi: + from ...capi import get_AI_endpoint, get_AI_token, get_provider + + # Resolve token: per-model env var override, then standard token chain + if spec.token_env: + token = os.getenv(spec.token_env, "") + else: + token = "" + if not token: + token = get_AI_token() + + endpoint = spec.endpoint or get_AI_endpoint() + provider = get_provider(endpoint) + + # Providers with bearer_auth=True need Authorization: Bearer instead + # of the Anthropic SDK's native x-api-key header. Use a placeholder + # api_key so the SDK doesn't also send the real token via x-api-key. + # Endpoints not in the provider registry default to native SDK auth. 
+ headers: dict[str, str] = dict(provider.extra_headers) + if provider.bearer_auth: headers["Authorization"] = f"Bearer {token}" - headers["Copilot-Integration-Id"] = os.getenv( - "COPILOT_INTEGRATION_ID", "vscode-chat" - ) client = anthropic.AsyncAnthropic( - api_key="placeholder" if is_capi else token, + api_key="placeholder" if provider.bearer_auth else token, base_url=endpoint, default_headers=headers or None, ) diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index 40132199..496869be 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -13,7 +13,6 @@ AnthropicSDKBackend, _mcp_tools_to_anthropic, _call_tool_result_to_text, - _resolve_token, _VALID_REASONING, ) from seclab_taskflow_agent.sdk.errors import ( @@ -152,25 +151,37 @@ def test_call_tool_result_to_text_empty(): assert isinstance(text, str) -# -- _resolve_token() -- +# -- bearer_auth via provider registry -- -def test_resolve_token_from_env(monkeypatch): - monkeypatch.setenv("MY_TOKEN", "secret123") - assert _resolve_token("MY_TOKEN") == "secret123" +def test_known_provider_uses_bearer_auth(): + """Known providers (CAPI, GitHub Models) should have bearer_auth=True.""" + from seclab_taskflow_agent.capi import get_provider + provider = get_provider("https://api.githubcopilot.com") + assert provider.bearer_auth is True -def test_resolve_token_fallback_to_ai_api_token(monkeypatch): - monkeypatch.setenv("AI_API_TOKEN", "fallback_token") - monkeypatch.delenv("MISSING_VAR", raising=False) - assert _resolve_token("MISSING_VAR") == "fallback_token" + provider = get_provider("https://models.github.ai/inference") + assert provider.bearer_auth is True -def test_resolve_token_raises_when_missing(monkeypatch): - monkeypatch.delenv("AI_API_TOKEN", raising=False) - monkeypatch.delenv("MISSING_VAR", raising=False) - with pytest.raises(BackendBadRequestError, match="no API token"): - _resolve_token("MISSING_VAR") +def 
test_unknown_endpoint_uses_native_auth(): + """Unknown endpoints should default to native SDK auth (bearer_auth=False).""" + from seclab_taskflow_agent.capi import get_provider + + provider = get_provider("https://api.anthropic.com") + assert provider.bearer_auth is False + assert provider.name == "custom" + + +def test_awf_proxy_inherits_upstream_bearer_auth(monkeypatch): + """AWF proxy should inherit bearer_auth from the upstream provider.""" + from seclab_taskflow_agent.capi import get_provider + + monkeypatch.setenv("AWF_COPILOT_PROXY", "api.githubcopilot.com") + provider = get_provider("http://localhost:8080") + assert provider.bearer_auth is True + assert provider.base_url == "http://localhost:8080/" # -- reasoning validation -- From ed19781ebb87463301c6ce3ded8b823a9e71caf6 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 11:12:42 -0400 Subject: [PATCH 14/23] style: use ternary for token resolution (ruff SIM108) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 315f6979..0174fa1d 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -105,10 +105,7 @@ async def build( from ...capi import get_AI_endpoint, get_AI_token, get_provider # Resolve token: per-model env var override, then standard token chain - if spec.token_env: - token = os.getenv(spec.token_env, "") - else: - token = "" + token = os.getenv(spec.token_env, "") if spec.token_env else "" if not token: token = get_AI_token() From ffb017ee53001e30fdebb734bf71db2cb8754e2d Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 12:06:43 -0400 Subject: [PATCH 15/23] refactor: move unfiltered MCP tool listing into MCPNamespaceWrap The anthropic 
backend was reaching into openai-agents private attrs (`_obj`, `.session`) to bypass tool_filter at tool-enumeration time. This required duplicating the namespace-prefix logic that already lives on MCPNamespaceWrap and risked double-prefixing on the fallback path. Move the 'list tools without invoking the agent-side tool_filter' logic into MCPNamespaceWrap.list_tools_unfiltered(), where the wrapper already owns its namespace and session reference. The anthropic backend becomes a one-liner; double-prefix risk is eliminated; openai-agents internal access is centralized in one place (mcp_utils.py). Also bump default_model in the provider registry from gpt-4.1 to gpt-5.5 (Copilot and OpenAI direct), openai/gpt-4.1 to openai/gpt-5.5 (GitHub Models). Only affects callers who do not specify a model -- the audit pipeline always specifies models via model_config, so this is purely a fallback for community users. Tests added: tests/test_mcp_utils.py (6 tests covering prefix correctness, no-double-prefix, tool attribute preservation, missing-session error, caller-state isolation, regression of existing list_tools()). Tests updated: test_capi_extended.py (default_model assertions). 274 tests pass, ruff clean. Local audit on anticomputer/vulnerable-test-app produced 4 vulnerabilities (verifying MCP tools enumerated + called correctly). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/capi.py | 8 +- src/seclab_taskflow_agent/mcp_utils.py | 27 ++++ .../sdk/anthropic_sdk/backend.py | 29 +--- tests/test_capi_extended.py | 4 +- tests/test_mcp_utils.py | 130 ++++++++++++++++++ 5 files changed, 168 insertions(+), 30 deletions(-) create mode 100644 tests/test_mcp_utils.py diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py index c897f4ed..5065ebcc 100644 --- a/src/seclab_taskflow_agent/capi.py +++ b/src/seclab_taskflow_agent/capi.py @@ -48,7 +48,7 @@ class APIProvider: name: str base_url: str models_catalog: str = "/models" - default_model: str = "gpt-4.1" + default_model: str = "gpt-5.5" extra_headers: Mapping[str, str] = field(default_factory=dict) bearer_auth: bool = True # Use Authorization: Bearer (not x-api-key) @@ -124,20 +124,20 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool: "api.githubcopilot.com": _CopilotProvider( name="copilot", base_url="https://api.githubcopilot.com", - default_model="gpt-4.1", + default_model="gpt-5.5", extra_headers={"Copilot-Integration-Id": COPILOT_INTEGRATION_ID}, ), "models.github.ai": _GitHubModelsProvider( name="github-models", base_url="https://models.github.ai/inference", models_catalog="/catalog/models", - default_model="openai/gpt-4.1", + default_model="openai/gpt-5.5", ), "api.openai.com": _OpenAIProvider( name="openai", base_url="https://api.openai.com/v1", models_catalog="/v1/models", - default_model="gpt-4.1", + default_model="gpt-5.5", ), } diff --git a/src/seclab_taskflow_agent/mcp_utils.py b/src/seclab_taskflow_agent/mcp_utils.py index 92968986..228f64ed 100644 --- a/src/seclab_taskflow_agent/mcp_utils.py +++ b/src/seclab_taskflow_agent/mcp_utils.py @@ -97,6 +97,33 @@ async def list_tools(self, *args: Any, **kwargs: Any) -> list[Any]: namespaced_tools.append(tool_copy) return namespaced_tools + async def list_tools_unfiltered(self) -> list[Any]: + 
"""List tools directly from the MCP session, namespace-prefixed. + + Bypasses any tool_filter configured on the wrapped openai-agents + server (which would require ``run_context`` and ``agent`` arguments + that aren't available when listing tools outside the openai-agents + run loop -- e.g. when handing tools to a different SDK at build + time). + + Raises ``RuntimeError`` if the underlying server has no active + MCP session yet (caller should ensure the server is connected + before calling this). + """ + session = getattr(self._obj, "session", None) + if session is None: + raise RuntimeError( + f"MCPNamespaceWrap({self._obj!r}): underlying server has no " + "active MCP session; cannot list tools unfiltered" + ) + result = await session.list_tools() + namespaced_tools: list[Any] = [] + for tool in result.tools: + tool_copy = tool.copy() if hasattr(tool, "copy") else tool + tool_copy.name = f"{self.namespace}{tool.name}" + namespaced_tools.append(tool_copy) + return namespaced_tools + def confirm_tool(self, tool_name: str, args: list[Any]) -> bool: """Interactively prompt the user for tool-call confirmation. diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 0174fa1d..996eea90 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -127,9 +127,10 @@ async def build( ) # Collect tools from MCP servers and apply blocked_tools filter. - # We filter tools ourselves rather than relying on the openai-agents - # SDK's tool_filter, which requires run_context/agent args that - # aren't available outside the openai-agents run loop. + # We get raw tool lists via list_tools_unfiltered() rather than + # list_tools(), which would require run_context/agent args to + # invoke the openai-agents tool_filter -- args we don't have + # outside the openai-agents run loop. 
all_tools: list[dict[str, Any]] = [] mcp_server_map: dict[str, Any] = {} blocked = set(spec.blocked_tools or []) @@ -139,27 +140,7 @@ async def build( if native_server is None: continue try: - # Access the underlying MCP session to get the raw tool - # list, bypassing the openai-agents tool_filter that - # requires run_context/agent we don't have. - raw_server = getattr(native_server, "_obj", native_server) - session = getattr(raw_server, "session", None) - if session is not None: - result = await session.list_tools() - raw_tools = result.tools - else: - raw_tools = await native_server.list_tools() - - # Apply namespace prefix (NamespacedMCPServer convention) - ns = getattr(native_server, "namespace", "") - mcp_tools = [] - for tool in raw_tools: - if hasattr(tool, "copy"): - tool = tool.copy() - if ns: - tool.name = f"{ns}{tool.name}" - mcp_tools.append(tool) - + mcp_tools = await native_server.list_tools_unfiltered() for tool in mcp_tools: if tool.name not in blocked: mcp_server_map[tool.name] = native_server diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py index e3a1188b..1202df05 100644 --- a/tests/test_capi_extended.py +++ b/tests/test_capi_extended.py @@ -111,7 +111,7 @@ def test_github_models_provider(self): p = get_provider("https://models.github.ai/inference") assert p.name == "github-models" assert p.models_catalog == "/catalog/models" - assert p.default_model == "openai/gpt-4.1" + assert p.default_model == "openai/gpt-5.5" def test_openai_provider(self): p = get_provider("https://api.openai.com/v1") @@ -129,7 +129,7 @@ def test_awf_proxy_bare_hostname(self, monkeypatch): p = get_provider("http://172.30.0.30:10002") assert p.name == "copilot" assert p.base_url == "http://172.30.0.30:10002/" - assert p.default_model == "gpt-4.1" + assert p.default_model == "gpt-5.5" assert "Copilot-Integration-Id" in p.extra_headers def test_awf_proxy_full_url(self, monkeypatch): diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py new file 
mode 100644 index 00000000..8cdb1abf --- /dev/null +++ b/tests/test_mcp_utils.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: GitHub, Inc. +# SPDX-License-Identifier: MIT + +"""Tests for MCPNamespaceWrap.""" + +from __future__ import annotations + +import asyncio +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name + + +class _FakeTool: + """Tool with a copy() method (mimics mcp.types.Tool).""" + + def __init__(self, name: str, description: str = "", input_schema: dict | None = None): + self.name = name + self.description = description + self.inputSchema = input_schema or {} + + def copy(self) -> "_FakeTool": + return _FakeTool(self.name, self.description, dict(self.inputSchema)) + + +def _make_wrapper(server_name: str, session=None) -> MCPNamespaceWrap: + """Construct an MCPNamespaceWrap around a mock underlying server.""" + obj = MagicMock() + obj.name = server_name + obj.session = session + return MCPNamespaceWrap(confirms=[], obj=obj) + + +# -- list_tools_unfiltered() -- + + +def test_list_tools_unfiltered_prefixes_names_from_session(): + """Tools from session.list_tools() should be namespace-prefixed.""" + tools = [_FakeTool("read_file", "Read a file"), _FakeTool("write_file", "Write a file")] + session = MagicMock() + session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=tools)) + wrapper = _make_wrapper("RepoContext", session=session) + + result = asyncio.run(wrapper.list_tools_unfiltered()) + + ns = compress_name("RepoContext") + assert len(result) == 2 + assert result[0].name == f"{ns}read_file" + assert result[1].name == f"{ns}write_file" + + +def test_list_tools_unfiltered_no_double_prefix_when_called_twice(): + """Calling list_tools_unfiltered twice should not double-prefix names.""" + session = MagicMock() + # Fresh tools each call (mimics MCP session returning fresh objects) + session.list_tools = AsyncMock( + 
side_effect=lambda: SimpleNamespace(tools=[_FakeTool("get_repo")]) + ) + wrapper = _make_wrapper("RepoContext", session=session) + + async def _run(): + a = await wrapper.list_tools_unfiltered() + b = await wrapper.list_tools_unfiltered() + return a, b + + result1, result2 = asyncio.run(_run()) + + ns = compress_name("RepoContext") + assert result1[0].name == f"{ns}get_repo" + assert result2[0].name == f"{ns}get_repo" + # Crucially, the second result is NOT double-prefixed + assert not result2[0].name.startswith(f"{ns}{ns}") + + +def test_list_tools_unfiltered_preserves_tool_attributes(): + """The copy of each tool should preserve description and input schema.""" + schema = {"type": "object", "properties": {"path": {"type": "string"}}} + tools = [_FakeTool("read_file", "Read a file", schema)] + session = MagicMock() + session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=tools)) + wrapper = _make_wrapper("RepoContext", session=session) + + result = asyncio.run(wrapper.list_tools_unfiltered()) + + assert result[0].description == "Read a file" + assert result[0].inputSchema == schema + + +def test_list_tools_unfiltered_raises_when_session_missing(): + """Should raise RuntimeError if the underlying server has no session yet.""" + wrapper = _make_wrapper("RepoContext", session=None) + + with pytest.raises(RuntimeError, match="no.*active MCP session"): + asyncio.run(wrapper.list_tools_unfiltered()) + + +def test_list_tools_unfiltered_does_not_share_state_with_caller(): + """Mutating returned tool names must not affect the underlying tools.""" + original = _FakeTool("read_file") + session = MagicMock() + session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[original])) + wrapper = _make_wrapper("Repo", session=session) + + result = asyncio.run(wrapper.list_tools_unfiltered()) + result[0].name = "MUTATED" + + # Original tool should still have its name (copy() worked) + assert original.name == "read_file" + + +# -- list_tools() (regression) -- + + 
+def test_list_tools_existing_behaviour_unchanged(): + """Existing list_tools() should still forward args and prefix names.""" + tools = [_FakeTool("read_file")] + obj = MagicMock() + obj.name = "RepoContext" + obj.list_tools = AsyncMock(return_value=tools) + obj.session = MagicMock() + wrapper = MCPNamespaceWrap(confirms=[], obj=obj) + + result = asyncio.run(wrapper.list_tools(run_context="ctx", agent="agent")) + + obj.list_tools.assert_awaited_once_with(run_context="ctx", agent="agent") + ns = compress_name("RepoContext") + assert result[0].name == f"{ns}read_file" From 00978f237087e49561617f80e441825cdb3c8b60 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 12:09:44 -0400 Subject: [PATCH 16/23] style: fix hatch fmt lint errors in test_mcp_utils - Remove quotes from _FakeTool return type (UP037) - Use raw string for regex pattern in pytest.raises match (RUF043) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_mcp_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py index 8cdb1abf..a1eaae33 100644 --- a/tests/test_mcp_utils.py +++ b/tests/test_mcp_utils.py @@ -22,7 +22,7 @@ def __init__(self, name: str, description: str = "", input_schema: dict | None = self.description = description self.inputSchema = input_schema or {} - def copy(self) -> "_FakeTool": + def copy(self) -> _FakeTool: return _FakeTool(self.name, self.description, dict(self.inputSchema)) @@ -93,7 +93,7 @@ def test_list_tools_unfiltered_raises_when_session_missing(): """Should raise RuntimeError if the underlying server has no session yet.""" wrapper = _make_wrapper("RepoContext", session=None) - with pytest.raises(RuntimeError, match="no.*active MCP session"): + with pytest.raises(RuntimeError, match=r"no.*active MCP session"): asyncio.run(wrapper.list_tools_unfiltered()) From b1b139b7d0e1b85c2a0374a5e519d19b0906d43b Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 
Jun 2026 12:43:33 -0400 Subject: [PATCH 17/23] fix(capi): add gpt-5 to OpenAI _CHAT_PREFIXES allowlist The default_model for the OpenAI direct provider was bumped to gpt-5.5 in the bearer_auth refactor, but _OpenAIProvider.check_tool_calls()'s prefix allowlist still only matched gpt-3.5/gpt-4/o-series. This meant supports_tool_calls('gpt-5.5', ...) returned False, so list_tool_call_models() would omit the default model from the catalog output -- a contradiction with the model being the configured default. Add 'gpt-5' to the prefix tuple and a regression test covering gpt-5, gpt-5.5, gpt-5.5-mini, and a hypothetical gpt-5.6. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/capi.py | 2 +- tests/test_capi_extended.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py index 5065ebcc..edd9a4d3 100644 --- a/src/seclab_taskflow_agent/capi.py +++ b/src/seclab_taskflow_agent/capi.py @@ -111,7 +111,7 @@ class _OpenAIProvider(APIProvider): we maintain a prefix allowlist of known chat-completion model families. 
""" - _CHAT_PREFIXES = ("gpt-3.5", "gpt-4", "o1", "o3", "o4", "chatgpt-") + _CHAT_PREFIXES = ("gpt-3.5", "gpt-4", "gpt-5", "o1", "o3", "o4", "chatgpt-") def check_tool_calls(self, _model: str, model_info: dict) -> bool: model_id = model_info.get("id", "").lower() diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py index 1202df05..70407c28 100644 --- a/tests/test_capi_extended.py +++ b/tests/test_capi_extended.py @@ -80,6 +80,17 @@ def test_openai_endpoint_o_series(self, monkeypatch): models = {mid: {"id": mid}} assert supports_tool_calls(mid, models) is True + def test_openai_endpoint_gpt5_series(self, monkeypatch): + """OpenAI endpoint returns True for gpt-5 family (regression: the + default_model was bumped to gpt-5.5 but _CHAT_PREFIXES needed + updating to include 'gpt-5').""" + monkeypatch.setenv("AI_API_ENDPOINT", "https://api.openai.com/v1") + for mid in ("gpt-5", "gpt-5.5", "gpt-5.5-mini", "gpt-5.6"): + models = {mid: {"id": mid}} + assert supports_tool_calls(mid, models) is True, ( + f"{mid} should be recognized as a tool-call-capable chat model" + ) + def test_openai_endpoint_non_chat_model(self, monkeypatch): """OpenAI endpoint returns False for embeddings/audio/image models.""" monkeypatch.setenv("AI_API_ENDPOINT", "https://api.openai.com/v1") From 4eea12785201dc1fa09e6a217fee2b4675413f6e Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 13:21:25 -0400 Subject: [PATCH 18/23] doc + refactor: address remaining PR review threads README.md: Add the per-task backend override as the highest-precedence selection level. Tasks can put 'backend:' in their own model_settings block to override the model-level value, per _resolve_task_model(). doc/GRAMMAR.md: Tighten the 'passed through to the selected SDK backend' claim. 
openai_agents accepts the standard OpenAI parameter set, anthropic_sdk forwards a curated subset (temperature, top_p, reasoning, max_tokens, stream_thinking), and copilot_sdk consumes only its own exposed keys (e.g. reasoning_effort) and silently ignores the rest. Avoid misleading users about arbitrary key forwarding. mcp_utils.py: Make list_tools_unfiltered idempotent on the prefix. Strip an existing namespace prefix before re-applying so the method is safe to call repeatedly even if the underlying session somehow returns a cached/reused tool object whose name was previously namespaced. Uses str.removeprefix() (no-op when prefix is absent). Regression test added covering the previously-prefixed-input path. 276 tests pass, hatch fmt clean. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 17 ++++++++++------- doc/GRAMMAR.md | 9 +++++++-- src/seclab_taskflow_agent/mcp_utils.py | 10 +++++++++- tests/test_mcp_utils.py | 17 +++++++++++++++++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 21f39d15..7842f5bf 100644 --- a/README.md +++ b/README.md @@ -103,13 +103,16 @@ The runner can drive three SDKs behind a common interface: Designed for use with CAPI's Anthropic endpoint; auth uses `Authorization: Bearer` (not `x-api-key`). -Selection precedence: - -1. Per-model `backend:` in `model_settings` (allows mixed backends in a - single taskflow). -2. `backend:` field in the model config document (global default). -3. `SECLAB_TASKFLOW_BACKEND` environment variable. -4. `openai_agents`. +Selection precedence (highest to lowest): + +1. Per-task `backend:` in the task's own `model_settings` block (overrides + the model-level value for that one task; see `_resolve_task_model()`). +2. Per-model `backend:` in the model config's `model_settings` (allows + mixed backends in a single taskflow). +3. `backend:` field at the top level of the model config document + (global default). +4. 
`SECLAB_TASKFLOW_BACKEND` environment variable. +5. `openai_agents`. ```yaml seclab-taskflow-agent: diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md index 928bf8b1..efe0c41b 100644 --- a/doc/GRAMMAR.md +++ b/doc/GRAMMAR.md @@ -550,5 +550,10 @@ passed to the underlying model provider: | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var | | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` | -All other keys (e.g. `temperature`, `top_p`, `reasoning`) are passed through as -model parameters to the selected SDK backend. +All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to +the selected SDK backend. Which keys are actually honored depends on the +backend: `openai_agents` accepts the standard OpenAI parameter set; +`anthropic_sdk` forwards a curated subset (currently `temperature`, +`top_p`, `reasoning`, `max_tokens`, `stream_thinking`); `copilot_sdk` +consumes only the keys its SDK exposes (e.g. `reasoning_effort`) and +silently ignores the rest. Consult the backend-specific docs if in doubt. diff --git a/src/seclab_taskflow_agent/mcp_utils.py b/src/seclab_taskflow_agent/mcp_utils.py index 228f64ed..36d1df7e 100644 --- a/src/seclab_taskflow_agent/mcp_utils.py +++ b/src/seclab_taskflow_agent/mcp_utils.py @@ -106,6 +106,12 @@ async def list_tools_unfiltered(self) -> list[Any]: run loop -- e.g. when handing tools to a different SDK at build time). + Prefixing is idempotent: if a tool's name already starts with this + wrapper's namespace (e.g. because the underlying session returned a + previously-namespaced object), the existing prefix is stripped + before re-applying so calling this method multiple times never + yields ``{namespace}{namespace}name``. + Raises ``RuntimeError`` if the underlying server has no active MCP session yet (caller should ensure the server is connected before calling this). 
@@ -120,7 +126,9 @@ async def list_tools_unfiltered(self) -> list[Any]: namespaced_tools: list[Any] = [] for tool in result.tools: tool_copy = tool.copy() if hasattr(tool, "copy") else tool - tool_copy.name = f"{self.namespace}{tool.name}" + # Idempotent: strip existing prefix before re-applying + base_name = tool_copy.name.removeprefix(self.namespace) + tool_copy.name = f"{self.namespace}{base_name}" namespaced_tools.append(tool_copy) return namespaced_tools diff --git a/tests/test_mcp_utils.py b/tests/test_mcp_utils.py index a1eaae33..1480b98e 100644 --- a/tests/test_mcp_utils.py +++ b/tests/test_mcp_utils.py @@ -111,6 +111,23 @@ def test_list_tools_unfiltered_does_not_share_state_with_caller(): assert original.name == "read_file" +def test_list_tools_unfiltered_idempotent_on_prefixed_input(): + """If the session returns a tool whose name is already namespace-prefixed + (e.g. because of a cached/reused tool object), the prefix must NOT be + applied a second time. Required for safe repeated/reentrant calls.""" + ns = compress_name("RepoContext") + pre_prefixed = _FakeTool(f"{ns}read_file", "Read a file") + session = MagicMock() + session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[pre_prefixed])) + wrapper = _make_wrapper("RepoContext", session=session) + + result = asyncio.run(wrapper.list_tools_unfiltered()) + + # Result must have exactly one prefix, not two + assert result[0].name == f"{ns}read_file" + assert not result[0].name.startswith(f"{ns}{ns}") + + # -- list_tools() (regression) -- From c97ab6a1a98442a160bf0250d9520239fe16e966 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Fri, 12 Jun 2026 17:01:14 -0400 Subject: [PATCH 19/23] feat(anthropic_sdk): default-on automatic prompt caching Adds 'cache_control: {type: ephemeral}' to messages.stream() calls. 
The API auto-places a cache breakpoint at the longest cacheable prefix (tools + system + accumulated messages) and moves it forward on each turn -- multi-turn agent loops get cache reads on every turn after the first. Default-on because all current Claude models support cache_control and CAPI accepts it (validated end-to-end against claude-mythos-5 via CAPI on 2026-06-12). Callers pointed at proxies that strip / reject cache_control can opt out with 'prompt_caching: false' in model_settings. A string value (e.g. 'prompt_caching: 1h') sets a custom TTL. Local validation against anticomputer/vulnerable-test-app on the same audit pipeline, same model config, only changing prompt_caching: metric | off | on | delta --------------------+-----------+-----------+------------ requests | 60 | 62 | +2 (noise) input tokens fresh | 909,806 | 124 | -99.99% cache read tokens | 0 | 728,079 | new cache write tokens | 0 | 210,261 | new output tokens | 42,300 | 44,933 | similar vulnerabilities | 4 | 5 | +1 est. mythos cost | $11.21 | $5.60 | -50% Same or better audit quality, half the token cost. Real audits with larger system prompts + more tool definitions amortize the cache writes over more reads, so production savings are typically larger than 50%. Tests added: - prompt_caching default-on emits cache_control - prompt_caching=False suppresses cache_control (opt-out) - prompt_caching='1h' includes the ttl field 23 tests pass total, hatch fmt clean. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sdk/anthropic_sdk/backend.py | 16 ++ tests/test_sdk_anthropic_adapter.py | 154 ++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 996eea90..d3b493a0 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -202,6 +202,22 @@ async def run_streamed( create_kwargs["thinking"] = {"type": "adaptive"} create_kwargs["output_config"] = {"effort": effort} + # Automatic prompt caching: place an ephemeral cache breakpoint at + # the longest cacheable prefix (tools + system + accumulated + # messages). The breakpoint moves forward on each turn, so + # multi-turn agent loops get cache reads on every turn after the + # first -- typically 50%+ cost reduction on token-heavy audits. + # All current Claude models (and the Anthropic-compatible CAPI + # proxy) support cache_control. Default on; explicit opt-out for + # callers pointed at proxies that don't support it. 
+ prompt_caching = handle.model_settings.get("prompt_caching", True) + if prompt_caching: + ttl = prompt_caching if isinstance(prompt_caching, str) else "5m" + cache_block: dict[str, Any] = {"type": "ephemeral"} + if ttl != "5m": + cache_block["ttl"] = ttl + create_kwargs["cache_control"] = cache_block + import anthropic for turn in range(max_turns): diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index 496869be..975e8094 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -227,3 +227,157 @@ async def _run(): with pytest.raises(BackendBadRequestError, match="invalid reasoning effort"): asyncio.run(_run()) + + +# -- prompt caching -- + + +def test_prompt_caching_enabled_by_default(): + """All Claude models support cache_control; default to on so callers + get the cost savings without explicit opt-in. Explicit opt-out via + prompt_caching=False remains available for proxies that don't support + cache_control.""" + import asyncio + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + + captured = {} + + class _FakeStreamCtx: + async def __aenter__(self): return self + async def __aexit__(self, *exc): return False + def __aiter__(self): + async def _gen(): + return + yield + return _gen() + async def get_final_message(self): + return type("M", (), {"stop_reason": "end_turn", "content": []})() + + class _FakeMessages: + def stream(self, **kwargs): + captured.update(kwargs) + return _FakeStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + handle = _AnthropicHandle( + client=_FakeClient(), + system_prompt="", + model="claude-mythos-5", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", max_turns=1): + pass + + asyncio.run(_run()) + assert captured.get("cache_control") == {"type": 
"ephemeral"}, ( + f"expected default cache_control={{type: ephemeral}}, got {captured.get('cache_control')!r}" + ) + + +def test_prompt_caching_explicit_opt_out(): + """prompt_caching=False must suppress cache_control entirely (for + callers pointed at proxies that don't support it).""" + import asyncio + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + + captured = {} + + class _FakeStreamCtx: + async def __aenter__(self): return self + async def __aexit__(self, *exc): return False + def __aiter__(self): + async def _gen(): + return + yield + return _gen() + async def get_final_message(self): + return type("M", (), {"stop_reason": "end_turn", "content": []})() + + class _FakeMessages: + def stream(self, **kwargs): + captured.update(kwargs) + return _FakeStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + handle = _AnthropicHandle( + client=_FakeClient(), + system_prompt="", + model="claude-mythos-5", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={"prompt_caching": False}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", max_turns=1): + pass + + asyncio.run(_run()) + assert "cache_control" not in captured, ( + f"cache_control should be absent when explicitly opted out, got {captured}" + ) + + +def test_prompt_caching_1h_ttl_passes_ttl_field(): + """When prompt_caching='1h', cache_control must include the 1h ttl.""" + import asyncio + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + + captured = {} + + class _FakeStreamCtx: + async def __aenter__(self): return self + async def __aexit__(self, *exc): return False + def __aiter__(self): + async def _gen(): + return + yield + return _gen() + async def get_final_message(self): + return type("M", (), {"stop_reason": "end_turn", "content": []})() + + class _FakeMessages: + def stream(self, **kwargs): + captured.update(kwargs) + 
return _FakeStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + handle = _AnthropicHandle( + client=_FakeClient(), + system_prompt="", + model="claude-mythos-5", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={"prompt_caching": "1h"}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", max_turns=1): + pass + + asyncio.run(_run()) + assert captured.get("cache_control") == {"type": "ephemeral", "ttl": "1h"}, ( + f"expected cache_control with 1h ttl, got {captured.get('cache_control')!r}" + ) From f42c06b8c0751bd2ac2baa3079aa2cb114d51e04 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Mon, 15 Jun 2026 10:52:42 -0400 Subject: [PATCH 20/23] fix(anthropic_sdk): match blocked_tools against raw + namespaced names Reviewer was correct that blocked_tools was effectively a no-op in the anthropic_sdk backend: taskflow YAML supplies raw tool names like 'read_file', but list_tools_unfiltered() returns namespace-prefixed names like '{hash}read_file'. The old 'tool.name not in blocked' check never matched, silently letting every blocked tool through. This is the security bug the reviewer flagged on PR #265. Fix: match the raw name against the un-prefixed portion of each tool's namespaced name, in addition to the literal name. The mcp_server_map keys stay namespaced because that's what Anthropic sends in tool_use. Regression tests: - raw 'read_file' filters out '{hash}read_file' (the bug case) - already-namespaced names still match (backwards compat) doc/GRAMMAR.md: also fix an inaccuracy the reviewer flagged in the same review pass -- the docs claimed copilot_sdk 'silently ignores' unsupported model_settings keys, but it actually raises BackendCapabilityError on 'temperature' and 'parallel_tool_calls' at validate() time. 
Updated wording to distinguish 'ignored' (anthropic_sdk) from 'rejected' (copilot_sdk) so users aren't surprised by a hard fail when they expected a silent drop. 281 tests pass, hatch fmt clean. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/GRAMMAR.md | 8 +- .../sdk/anthropic_sdk/backend.py | 26 +++-- tests/test_sdk_anthropic_adapter.py | 109 ++++++++++++++++++ 3 files changed, 129 insertions(+), 14 deletions(-) diff --git a/doc/GRAMMAR.md b/doc/GRAMMAR.md index efe0c41b..88b0bf9f 100644 --- a/doc/GRAMMAR.md +++ b/doc/GRAMMAR.md @@ -550,10 +550,4 @@ passed to the underlying model provider: | `endpoint` | API base URL for this model | The global `AI_API_ENDPOINT` env var | | `token` | Name of an environment variable containing the API key | Uses `AI_API_TOKEN` / `COPILOT_TOKEN` | -All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to -the selected SDK backend. Which keys are actually honored depends on the -backend: `openai_agents` accepts the standard OpenAI parameter set; -`anthropic_sdk` forwards a curated subset (currently `temperature`, -`top_p`, `reasoning`, `max_tokens`, `stream_thinking`); `copilot_sdk` -consumes only the keys its SDK exposes (e.g. `reasoning_effort`) and -silently ignores the rest. Consult the backend-specific docs if in doubt. +All other keys (e.g. `temperature`, `top_p`, `reasoning`) are forwarded to the selected SDK backend. Each backend decides what to do with each key: `openai_agents` accepts the standard OpenAI parameter set; `anthropic_sdk` forwards a curated subset (currently `temperature`, `top_p`, `reasoning`, `max_tokens`, `stream_thinking`, `prompt_caching`) and silently ignores keys outside that set; `copilot_sdk` consumes the keys its SDK exposes (e.g. `reasoning_effort`) and **rejects** unsupported keys at validate time with `BackendCapabilityError` (currently `temperature` and `parallel_tool_calls`) rather than silently dropping them. 
Consult the backend-specific docs if in doubt. diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index d3b493a0..11149406 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -131,23 +131,35 @@ async def build( # list_tools(), which would require run_context/agent args to # invoke the openai-agents tool_filter -- args we don't have # outside the openai-agents run loop. + # + # blocked_tools in taskflow YAML are raw (un-namespaced) names, + # consistent with how openai_agents and copilot_sdk consume them. + # list_tools_unfiltered() returns namespace-prefixed names (the + # MCP server wrapper applies the prefix). Match against both + # forms so blocking works regardless of which name the taskflow + # author used; key mcp_server_map by the namespaced name because + # that's what Anthropic will send back in tool_use blocks. all_tools: list[dict[str, Any]] = [] mcp_server_map: dict[str, Any] = {} blocked = set(spec.blocked_tools or []) + def _is_blocked(tool: Any, namespace: str) -> bool: + name = tool.name + if name in blocked: + return True + return name.startswith(namespace) and name[len(namespace):] in blocked + for mcp_spec in spec.mcp_servers: native_server = mcp_spec.params.get("_native") if native_server is None: continue try: mcp_tools = await native_server.list_tools_unfiltered() - for tool in mcp_tools: - if tool.name not in blocked: - mcp_server_map[tool.name] = native_server - anthropic_tools = _mcp_tools_to_anthropic( - [t for t in mcp_tools if t.name not in blocked] - ) - all_tools.extend(anthropic_tools) + namespace = getattr(native_server, "namespace", "") + kept = [t for t in mcp_tools if not _is_blocked(t, namespace)] + for tool in kept: + mcp_server_map[tool.name] = native_server + all_tools.extend(_mcp_tools_to_anthropic(kept)) except Exception: logger.exception("Failed to list tools from MCP 
server %s", mcp_spec.name) diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index 975e8094..d8e047c5 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -381,3 +381,112 @@ async def _run(): assert captured.get("cache_control") == {"type": "ephemeral", "ttl": "1h"}, ( f"expected cache_control with 1h ttl, got {captured.get('cache_control')!r}" ) + + +# -- blocked_tools filtering -- + + +def test_blocked_tools_matches_raw_name_against_namespaced_tool(monkeypatch): + """Regression: taskflow YAML blocked_tools uses raw (un-namespaced) + names like 'read_file', but list_tools_unfiltered() returns + namespace-prefixed names like '{hash}read_file'. The filter must + match the raw name against the un-prefixed portion of the + namespaced tool, otherwise blocking is silently bypassed. + + See PR #265 review thread and openai_agents/copilot_sdk for + how blocked_tools are consumed elsewhere (both use raw names). + """ + monkeypatch.setenv("AI_API_TOKEN", "test-token") + import asyncio + from unittest.mock import AsyncMock, MagicMock + + from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name + from seclab_taskflow_agent.sdk.base import MCPServerSpec + + class _FakeTool: + def __init__(self, name): + self.name = name + self.description = "" + self.inputSchema = {} + + def copy(self): + t = _FakeTool(self.name) + return t + + # Build a wrapper whose session.list_tools returns two raw tools. + # list_tools_unfiltered() will return them with namespace prefix. 
+ obj = MagicMock() + obj.name = "RepoContext" + ns = compress_name("RepoContext") + obj.session = MagicMock() + obj.session.list_tools = AsyncMock( + return_value=type("R", (), {"tools": [_FakeTool("read_file"), _FakeTool("safe_helper")]})() + ) + wrap = MCPNamespaceWrap(confirms=[], obj=obj) + + spec = AgentSpec( + name="t", + instructions="", + model="claude-mythos-preview", + mcp_servers=[MCPServerSpec(name="rc", kind="stdio", params={"_native": wrap})], + blocked_tools=["read_file"], # raw name from YAML + ) + backend = AnthropicSDKBackend() + handle = asyncio.run(backend.build(spec)) + + # The blocked tool must be absent from both the tool list AND the + # server map keys (which use the namespaced form). + tool_names = [t["name"] for t in handle.tools] + assert f"{ns}read_file" not in tool_names, ( + f"blocked raw name 'read_file' should have filtered out '{ns}read_file'; " + f"got tools: {tool_names}" + ) + assert f"{ns}safe_helper" in tool_names, ( + f"non-blocked tool 'safe_helper' should still be present; got: {tool_names}" + ) + assert f"{ns}read_file" not in handle.mcp_server_map + assert f"{ns}safe_helper" in handle.mcp_server_map + + +def test_blocked_tools_also_matches_already_namespaced_name(monkeypatch): + """Backwards-compat: if a caller already passes the namespaced name + in blocked_tools (e.g. they computed it externally), it should still + match. 
The filter checks both forms.""" + monkeypatch.setenv("AI_API_TOKEN", "test-token") + import asyncio + from unittest.mock import AsyncMock, MagicMock + + from seclab_taskflow_agent.mcp_utils import MCPNamespaceWrap, compress_name + from seclab_taskflow_agent.sdk.base import MCPServerSpec + + class _FakeTool: + def __init__(self, name): + self.name = name + self.description = "" + self.inputSchema = {} + + def copy(self): + return _FakeTool(self.name) + + obj = MagicMock() + obj.name = "RepoContext" + ns = compress_name("RepoContext") + obj.session = MagicMock() + obj.session.list_tools = AsyncMock( + return_value=type("R", (), {"tools": [_FakeTool("read_file")]})() + ) + wrap = MCPNamespaceWrap(confirms=[], obj=obj) + + spec = AgentSpec( + name="t", + instructions="", + model="claude-mythos-preview", + mcp_servers=[MCPServerSpec(name="rc", kind="stdio", params={"_native": wrap})], + blocked_tools=[f"{ns}read_file"], # already namespaced + ) + backend = AnthropicSDKBackend() + handle = asyncio.run(backend.build(spec)) + + assert handle.tools == [], ( + f"blocked namespaced name should filter out the tool; got: {handle.tools}" + ) From c6ef3ae885989d223aac9dc5db51312881fb93e5 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Mon, 15 Jun 2026 10:56:49 -0400 Subject: [PATCH 21/23] Revert default_model bump back to gpt-4.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coworker review flagged that gpt-5.5 is not a viable default: - gpt-5 family models require the responses API, but APIProvider has no api_type field to signal that — callers using the default would silently hit the wrong endpoint shape - GitHub Models never received gpt-5.5; gpt-4.1 is what's still supported there, so 'openai/gpt-5.5' would 404 - Most callers specify models explicitly via model_config anyway, so the default is only a fallback safety net — keep it on a model that exists on all three providers Reverts the registry defaults and dataclass default; 
keeps the gpt-5 prefix in _OpenAIProvider._CHAT_PREFIXES (direct OpenAI API does serve gpt-5 family, and the prefix check is independent of default selection). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/capi.py | 8 ++++---- tests/test_capi_extended.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/seclab_taskflow_agent/capi.py b/src/seclab_taskflow_agent/capi.py index edd9a4d3..3d635c0d 100644 --- a/src/seclab_taskflow_agent/capi.py +++ b/src/seclab_taskflow_agent/capi.py @@ -48,7 +48,7 @@ class APIProvider: name: str base_url: str models_catalog: str = "/models" - default_model: str = "gpt-5.5" + default_model: str = "gpt-4.1" extra_headers: Mapping[str, str] = field(default_factory=dict) bearer_auth: bool = True # Use Authorization: Bearer (not x-api-key) @@ -124,20 +124,20 @@ def check_tool_calls(self, _model: str, model_info: dict) -> bool: "api.githubcopilot.com": _CopilotProvider( name="copilot", base_url="https://api.githubcopilot.com", - default_model="gpt-5.5", + default_model="gpt-4.1", extra_headers={"Copilot-Integration-Id": COPILOT_INTEGRATION_ID}, ), "models.github.ai": _GitHubModelsProvider( name="github-models", base_url="https://models.github.ai/inference", models_catalog="/catalog/models", - default_model="openai/gpt-5.5", + default_model="openai/gpt-4.1", ), "api.openai.com": _OpenAIProvider( name="openai", base_url="https://api.openai.com/v1", models_catalog="/v1/models", - default_model="gpt-5.5", + default_model="gpt-4.1", ), } diff --git a/tests/test_capi_extended.py b/tests/test_capi_extended.py index 70407c28..36c97159 100644 --- a/tests/test_capi_extended.py +++ b/tests/test_capi_extended.py @@ -122,7 +122,7 @@ def test_github_models_provider(self): p = get_provider("https://models.github.ai/inference") assert p.name == "github-models" assert p.models_catalog == "/catalog/models" - assert p.default_model == "openai/gpt-5.5" + assert p.default_model == 
"openai/gpt-4.1" def test_openai_provider(self): p = get_provider("https://api.openai.com/v1") @@ -140,7 +140,7 @@ def test_awf_proxy_bare_hostname(self, monkeypatch): p = get_provider("http://172.30.0.30:10002") assert p.name == "copilot" assert p.base_url == "http://172.30.0.30:10002/" - assert p.default_model == "gpt-5.5" + assert p.default_model == "gpt-4.1" assert "Copilot-Integration-Id" in p.extra_headers def test_awf_proxy_full_url(self, monkeypatch): From 5e0a38cd827d7f2be4ec4d5126e80fe5997dfaca Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Mon, 15 Jun 2026 11:39:04 -0400 Subject: [PATCH 22/23] Address PR feedback + proactive cleanup pass Three behavior fixes flagged by review or by re-reading the diff: 1. 4xx exception mapping (reviewer-flagged): previously only anthropic.BadRequestError (400) was mapped to BackendBadRequestError. Auth (401), permission (403), not-found (404), conflict (409), unprocessable (422) all fell through to BackendUnexpectedError and surfaced as 'Agent Exception' instead of a clean request error. Catch anthropic.APIStatusError and map any 4xx status to BackendBadRequestError; 5xx still falls through to BackendUnexpectedError (the request was well-formed). 2. Empty-token failure mode: build() now raises BackendBadRequestError with a clear message when no API token can be resolved, instead of either leaking RuntimeError from get_AI_token() or letting the Anthropic client be constructed with an empty 'Bearer ' header (which produces an opaque 401 mid-stream much later). 3. Stale module docstring in sdk/__init__.py: said 'Two backends are supported' and referenced the removed '[copilot]' optional-extra. Updated to reflect the current three-backend reality. Test cleanup (reviewer-flagged): - DRY'd 3x duplicate _FakeStreamCtx boilerplate in the prompt-caching tests into a single _make_fake_client() helper at the top of the file. 
The helper uses a proper empty async iterator class instead of the 'return; yield' empty-generator pattern the reviewer flagged as awkward. Added regression coverage: - test_build_raises_bad_request_when_no_token_available - test_4xx_api_status_errors_map_to_bad_request (parameterized over 400/401/403/404/409/422) - test_5xx_api_status_errors_map_to_unexpected 281 -> 289 passing; lint clean (hatch fmt --linter --check). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/seclab_taskflow_agent/sdk/__init__.py | 11 +- .../sdk/anthropic_sdk/backend.py | 28 +- tests/test_sdk_anthropic_adapter.py | 252 +++++++++++++----- 3 files changed, 213 insertions(+), 78 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/__init__.py b/src/seclab_taskflow_agent/sdk/__init__.py index 0a413505..5eefafb2 100644 --- a/src/seclab_taskflow_agent/sdk/__init__.py +++ b/src/seclab_taskflow_agent/sdk/__init__.py @@ -3,9 +3,9 @@ """Backend factory for the agent runner. -Two backends are supported: ``openai_agents`` (default) and -``copilot_sdk`` (optional, requires ``pip install -seclab-taskflow-agent[copilot]``). +Three backends are supported: ``openai_agents`` (default), ``copilot_sdk``, +and ``anthropic_sdk``. All three are always available because per-task +backend selection means any SDK may be needed at runtime. """ from __future__ import annotations @@ -70,8 +70,9 @@ def resolve_backend_name( ``SECLAB_TASKFLOW_BACKEND`` env var > ``openai_agents``. Backend selection is always deterministic — there is no auto-detection - based on endpoint URL. Use ``backend: copilot_sdk`` in model config - or set ``SECLAB_TASKFLOW_BACKEND=copilot_sdk`` to opt in. + based on endpoint URL. Use ``backend: copilot_sdk`` or ``backend: + anthropic_sdk`` in model config (or set + ``SECLAB_TASKFLOW_BACKEND=``) to opt in. The *endpoint* parameter is accepted for forward compatibility but is not used for backend selection. 
diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index 11149406..f76ac70e 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -104,10 +104,22 @@ async def build( from ...capi import get_AI_endpoint, get_AI_token, get_provider - # Resolve token: per-model env var override, then standard token chain + # Resolve token: per-model env var override, then standard token chain. + # Wrap RuntimeError from get_AI_token (env var not set) so the runner + # surfaces it as a request error rather than an internal exception. token = os.getenv(spec.token_env, "") if spec.token_env else "" if not token: - token = get_AI_token() + try: + token = get_AI_token() + except RuntimeError as exc: + raise BackendBadRequestError( + f"anthropic_sdk: no API token available ({exc})" + ) from exc + if not token: + raise BackendBadRequestError( + "anthropic_sdk: no API token available " + "(checked spec.token_env then standard token chain)" + ) endpoint = spec.endpoint or get_AI_endpoint() provider = get_provider(endpoint) @@ -257,8 +269,16 @@ async def run_streamed( raise BackendRateLimitError(str(exc)) from exc except anthropic.APITimeoutError as exc: raise BackendTimeoutError(str(exc)) from exc - except anthropic.BadRequestError as exc: - raise BackendBadRequestError(str(exc)) from exc + except anthropic.APIStatusError as exc: + # Map all 4xx (auth, permission, not_found, conflict, + # unprocessable, bad_request) to BackendBadRequestError so + # the runner surfaces them as request errors rather than + # internal exceptions. 5xx and unclassified errors fall + # through to BackendUnexpectedError. 
+ status = getattr(exc, "status_code", None) + if isinstance(status, int) and 400 <= status < 500: + raise BackendBadRequestError(str(exc)) from exc + raise BackendUnexpectedError(str(exc)) from exc except anthropic.APIError as exc: raise BackendUnexpectedError(str(exc)) from exc diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index d8e047c5..e072fa7b 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -31,6 +31,48 @@ def _spec(**overrides) -> AgentSpec: return AgentSpec(**base) +def _make_fake_client(captured: dict, *, stop_reason: str = "end_turn", content: list | None = None): + """Build a minimal fake Anthropic client that records messages.stream() kwargs. + + The returned client exposes ``client.messages.stream(**kwargs)``; ``kwargs`` is + written into *captured* so tests can assert on what the backend would have sent + to the real SDK. The stream yields nothing and ``get_final_message()`` returns + a stub with the requested ``stop_reason``/``content``. 
+ """ + final_content = content if content is not None else [] + + class _EmptyAsyncIter: + def __aiter__(self): + return self + + async def __anext__(self): + raise StopAsyncIteration + + class _FakeStreamCtx: + async def __aenter__(self): + return self + + async def __aexit__(self, *exc): + return False + + def __aiter__(self): + return _EmptyAsyncIter() + + async def get_final_message(self): + return type("M", (), {"stop_reason": stop_reason, "content": final_content})() + + class _FakeMessages: + def stream(self, **kwargs): + captured.update(kwargs) + return _FakeStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + return _FakeClient() + + # -- Backend registration -- @@ -241,30 +283,9 @@ def test_prompt_caching_enabled_by_default(): from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle - captured = {} - - class _FakeStreamCtx: - async def __aenter__(self): return self - async def __aexit__(self, *exc): return False - def __aiter__(self): - async def _gen(): - return - yield - return _gen() - async def get_final_message(self): - return type("M", (), {"stop_reason": "end_turn", "content": []})() - - class _FakeMessages: - def stream(self, **kwargs): - captured.update(kwargs) - return _FakeStreamCtx() - - class _FakeClient: - def __init__(self): - self.messages = _FakeMessages() - + captured: dict = {} handle = _AnthropicHandle( - client=_FakeClient(), + client=_make_fake_client(captured), system_prompt="", model="claude-mythos-5", max_tokens=100, @@ -291,30 +312,9 @@ def test_prompt_caching_explicit_opt_out(): from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle - captured = {} - - class _FakeStreamCtx: - async def __aenter__(self): return self - async def __aexit__(self, *exc): return False - def __aiter__(self): - async def _gen(): - return - yield - return _gen() - async def get_final_message(self): - return type("M", (), {"stop_reason": "end_turn", "content": []})() - - 
class _FakeMessages: - def stream(self, **kwargs): - captured.update(kwargs) - return _FakeStreamCtx() - - class _FakeClient: - def __init__(self): - self.messages = _FakeMessages() - + captured: dict = {} handle = _AnthropicHandle( - client=_FakeClient(), + client=_make_fake_client(captured), system_prompt="", model="claude-mythos-5", max_tokens=100, @@ -340,30 +340,9 @@ def test_prompt_caching_1h_ttl_passes_ttl_field(): from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle - captured = {} - - class _FakeStreamCtx: - async def __aenter__(self): return self - async def __aexit__(self, *exc): return False - def __aiter__(self): - async def _gen(): - return - yield - return _gen() - async def get_final_message(self): - return type("M", (), {"stop_reason": "end_turn", "content": []})() - - class _FakeMessages: - def stream(self, **kwargs): - captured.update(kwargs) - return _FakeStreamCtx() - - class _FakeClient: - def __init__(self): - self.messages = _FakeMessages() - + captured: dict = {} handle = _AnthropicHandle( - client=_FakeClient(), + client=_make_fake_client(captured), system_prompt="", model="claude-mythos-5", max_tokens=100, @@ -490,3 +469,138 @@ def copy(self): assert handle.tools == [], ( f"blocked namespaced name should filter out the tool; got: {handle.tools}" ) + + +# -- token validation -- + + +def test_build_raises_bad_request_when_no_token_available(monkeypatch): + """build() must fail loudly when no API token can be resolved. + + Otherwise the Anthropic client gets created with an empty 'Bearer ' + header and the failure surfaces later as an opaque 401 mid-stream + instead of a clear BackendBadRequestError at build time. 
+ """ + import asyncio + + # Clear every token-source env var the standard chain consults + for var in ("AI_API_TOKEN", "OPENAI_API_KEY", "AZURE_OPENAI_API_KEY", + "ANTHROPIC_API_KEY", "GITHUB_TOKEN", "GH_TOKEN"): + monkeypatch.delenv(var, raising=False) + + spec = AgentSpec( + name="t", + instructions="", + model="claude-mythos-preview", + endpoint="https://api.githubcopilot.com", + ) + backend = AnthropicSDKBackend() + with pytest.raises(BackendBadRequestError, match="no API token"): + asyncio.run(backend.build(spec)) + + +# -- exception mapping (4xx -> BackendBadRequestError) -- + + +@pytest.mark.parametrize("status_code", [400, 401, 403, 404, 409, 422]) +def test_4xx_api_status_errors_map_to_bad_request(monkeypatch, status_code): + """Any 4xx APIStatusError must surface as BackendBadRequestError so the + runner logs it as a request error rather than an internal exception. + Previously only BadRequestError (400) was mapped, leaving auth/permission/ + not-found errors (401/403/404) to surface as BackendUnexpectedError.""" + import asyncio + import anthropic + import httpx + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + + response = httpx.Response( + status_code=status_code, + request=httpx.Request("POST", "https://test.example/v1/messages"), + ) + + class _RaisingStreamCtx: + async def __aenter__(self): + raise anthropic.APIStatusError( + f"http {status_code}", response=response, body=None + ) + + async def __aexit__(self, *exc): + return False + + class _FakeMessages: + def stream(self, **kwargs): # noqa: ARG002 + return _RaisingStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + handle = _AnthropicHandle( + client=_FakeClient(), + system_prompt="", + model="claude-mythos-5", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={"prompt_caching": False}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", 
max_turns=1): + pass + + with pytest.raises(BackendBadRequestError): + asyncio.run(_run()) + + +def test_5xx_api_status_errors_map_to_unexpected(monkeypatch): + """5xx APIStatusError must still surface as BackendUnexpectedError (not + BackendBadRequestError); the request itself was well-formed.""" + import asyncio + import anthropic + import httpx + + from seclab_taskflow_agent.sdk.anthropic_sdk.backend import _AnthropicHandle + from seclab_taskflow_agent.sdk.errors import BackendUnexpectedError + + response = httpx.Response( + status_code=503, + request=httpx.Request("POST", "https://test.example/v1/messages"), + ) + + class _RaisingStreamCtx: + async def __aenter__(self): + raise anthropic.InternalServerError( + "service unavailable", response=response, body=None + ) + + async def __aexit__(self, *exc): + return False + + class _FakeMessages: + def stream(self, **kwargs): # noqa: ARG002 + return _RaisingStreamCtx() + + class _FakeClient: + def __init__(self): + self.messages = _FakeMessages() + + handle = _AnthropicHandle( + client=_FakeClient(), + system_prompt="", + model="claude-mythos-5", + max_tokens=100, + tools=[], + mcp_server_map={}, + model_settings={"prompt_caching": False}, + ) + backend = AnthropicSDKBackend() + + async def _run(): + async for _ in backend.run_streamed(handle, "hi", max_turns=1): + pass + + with pytest.raises(BackendUnexpectedError): + asyncio.run(_run()) From 4f1f4401ecfe3a8331aa0cf3c4dc0a8a6acef152 Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Mon, 15 Jun 2026 12:31:25 -0400 Subject: [PATCH 23/23] fix(anthropic_sdk): preserve empty tool output + harden token test Two more reviewer-flagged issues: 1. _call_tool_result_to_text() dropped empty TextContent The truthy check 'if text:' treated TextContent(text='') the same as text=None and skipped it. 
With an only-empty content list, parts would be [] and the helper fell through to the str(result) fallback (which is a noisy repr of the result object) instead of returning the actual empty result the tool reported. Fix: 'if text is not None:' preserves explicit empty strings; the str(result) fallback now only fires when there are no text-bearing blocks at all. 2. test_build_raises_bad_request_when_no_token_available was flaky The test cleared a long list of API key env vars (defensive cargo cult) but missed COPILOT_TOKEN, which is the second variable that capi.get_AI_token() consults. On runners with COPILOT_TOKEN set (e.g. CI envs authed to Copilot), the test would unexpectedly find a token and the assertion would fail. Simplified to clear only the two vars the chain actually consults: AI_API_TOKEN and COPILOT_TOKEN. +2 regression tests for empty-string preservation; 291 passing. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sdk/anthropic_sdk/backend.py | 10 ++++-- tests/test_sdk_anthropic_adapter.py | 32 ++++++++++++++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py index f76ac70e..21dab7e0 100644 --- a/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py +++ b/src/seclab_taskflow_agent/sdk/anthropic_sdk/backend.py @@ -54,12 +54,18 @@ def _mcp_tools_to_anthropic(tools: list[Any]) -> list[dict[str, Any]]: def _call_tool_result_to_text(result: Any) -> str: - """Extract text from an MCP CallToolResult.""" + """Extract text from an MCP CallToolResult. + + Preserves empty strings: a tool that returns ``TextContent(text="")`` + is returning an explicit empty result, not "no content". Only fall + back to ``str(result)`` (a noisy repr) when there are genuinely no + text-bearing content blocks at all. 
+ """ content = getattr(result, "content", []) parts = [] for c in content: text = getattr(c, "text", None) - if text: + if text is not None: parts.append(text) return "\n".join(parts) if parts else str(result) diff --git a/tests/test_sdk_anthropic_adapter.py b/tests/test_sdk_anthropic_adapter.py index e072fa7b..bb8bf64d 100644 --- a/tests/test_sdk_anthropic_adapter.py +++ b/tests/test_sdk_anthropic_adapter.py @@ -193,6 +193,25 @@ def test_call_tool_result_to_text_empty(): assert isinstance(text, str) +def test_call_tool_result_to_text_preserves_empty_string(): + """A tool returning TextContent(text='') is reporting an explicit + empty result. The helper must return '' verbatim, not fall back to + str(result) (which is a noisy repr of the result object). + + Regression for the truthy-check bug: ``if text:`` was treating '' + the same as None and dropping it, causing the empty content list + branch to fire and emit ``str(result)`` to the model. + """ + result = type("R", (), {"content": [_FakeContent("")]})() + assert _call_tool_result_to_text(result) == "" + + +def test_call_tool_result_to_text_preserves_empty_among_nonempty(): + """Empty TextContent should join with neighbors as ''.""" + result = type("R", (), {"content": [_FakeContent("a"), _FakeContent(""), _FakeContent("b")]})() + assert _call_tool_result_to_text(result) == "a\n\nb" + + # -- bearer_auth via provider registry -- @@ -480,13 +499,18 @@ def test_build_raises_bad_request_when_no_token_available(monkeypatch): Otherwise the Anthropic client gets created with an empty 'Bearer ' header and the failure surfaces later as an opaque 401 mid-stream instead of a clear BackendBadRequestError at build time. + + Clears every variable consulted by ``capi.get_AI_token`` + (``AI_API_TOKEN`` then ``COPILOT_TOKEN``) to keep the test + deterministic regardless of the runner's ambient environment. 
""" import asyncio - # Clear every token-source env var the standard chain consults - for var in ("AI_API_TOKEN", "OPENAI_API_KEY", "AZURE_OPENAI_API_KEY", - "ANTHROPIC_API_KEY", "GITHUB_TOKEN", "GH_TOKEN"): - monkeypatch.delenv(var, raising=False) + # Must clear *every* env var the token chain consults; missing + # COPILOT_TOKEN here would make the test flaky on runners that + # happen to have it set (e.g. CI machines authed to copilot). + monkeypatch.delenv("AI_API_TOKEN", raising=False) + monkeypatch.delenv("COPILOT_TOKEN", raising=False) spec = AgentSpec( name="t",