diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py
index b03179c231..df2079b727 100644
--- a/contributing/samples/gepa/experiment.py
+++ b/contributing/samples/gepa/experiment.py
@@ -44,6 +44,7 @@
 from tau_bench.types import EnvRunResult
 from tau_bench.types import RunConfig
 import tau_bench_agent as tau_bench_agent_lib
+import utils
 
 
 def run_tau_bench_rollouts(
diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py
index ca3f5e7696..92299bcc12 100644
--- a/contributing/samples/gepa/run_experiment.py
+++ b/contributing/samples/gepa/run_experiment.py
@@ -26,6 +26,7 @@
 import experiment
 import gepa_utils
 from google.genai import types
+import utils
 
 _OUTPUT_DIR = flags.DEFINE_string(
     'output_dir',
diff --git a/src/google/adk/flows/llm_flows/base_llm_flow.py b/src/google/adk/flows/llm_flows/base_llm_flow.py
index 31f998a588..60ceaa8dcf 100644
--- a/src/google/adk/flows/llm_flows/base_llm_flow.py
+++ b/src/google/adk/flows/llm_flows/base_llm_flow.py
@@ -24,6 +24,7 @@
 
 from google.adk.platform import time as platform_time
 from google.genai import types
+from opentelemetry import context as otel_context
 from opentelemetry import trace
 from websockets.exceptions import ConnectionClosed
 from websockets.exceptions import ConnectionClosedOK
@@ -1169,7 +1170,17 @@ async def _call_llm_async(
   ) -> AsyncGenerator[LlmResponse, None]:
 
     async def _call_llm_with_tracing() -> AsyncGenerator[LlmResponse, None]:
-      with tracer.start_as_current_span('call_llm') as span:
+      # Manage the span explicitly rather than via the start_as_current_span
+      # context manager so that span.end() is always called. In multi-agent
+      # scenarios with transfer_to_agent, the async generator may receive
+      # GeneratorExit after an async context switch (sub-agent execution).
+      # context.detach() then raises ValueError (stale contextvars token), so
+      # the context manager never reaches span.end().
+      # See: https://github.com/google/adk-python/issues/4715
+      span = tracer.start_span('call_llm')
+      ctx = trace.set_span_in_context(span)
+      token = otel_context.attach(ctx)
+      try:
         # Runs before_model_callback inside the call_llm span so
         # plugins observe the same span as after/error callbacks.
         if response := await self._handle_before_model_callback(
@@ -1262,6 +1273,12 @@ async def _call_llm_with_tracing() -> AsyncGenerator[LlmResponse, None]:
                   llm_response = altered
 
               yield llm_response
+      finally:
+        try:
+          otel_context.detach(token)
+        except ValueError:
+          pass
+        span.end()
 
     async with Aclosing(_call_llm_with_tracing()) as agen:
       async for event in agen:
diff --git a/tests/unittests/telemetry/test_functional.py b/tests/unittests/telemetry/test_functional.py
index 879bfa0198..601173396e 100644
--- a/tests/unittests/telemetry/test_functional.py
+++ b/tests/unittests/telemetry/test_functional.py
@@ -82,6 +82,7 @@ def do_replace(tracer):
     monkeypatch.setattr(
         tracer, "start_as_current_span", real_tracer.start_as_current_span
     )
+    monkeypatch.setattr(tracer, 'start_span', real_tracer.start_span)
 
   do_replace(tracing.tracer)