From 57ca85a536b55cc3a91b3132a94109e9983a7e76 Mon Sep 17 00:00:00 2001
From: YanhuiDua
Date: Thu, 25 Jun 2026 10:00:42 +0000
Subject: [PATCH] [Fix] ensure RL trainers close experiment tracker on fit exit

---
 xtuner/v1/train/rl_trainer.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/xtuner/v1/train/rl_trainer.py b/xtuner/v1/train/rl_trainer.py
index da1b9eaf8..154f14c81 100644
--- a/xtuner/v1/train/rl_trainer.py
+++ b/xtuner/v1/train/rl_trainer.py
@@ -1573,6 +1573,12 @@ def _sync_weights_from_train_workers(self) -> None:
         self.logger.info("Rollout workers updated weights from train workers.")
 
     def fit(self):
+        try:
+            self._fit()
+        finally:
+            self._exp_tracker.close()
+
+    def _fit(self):
         self.logger.info("Start RL training")
         if self._cur_step >= self._total_train_steps:
             self.logger.info(f"Train steps {self._total_train_steps} reached, stop training")
@@ -1583,7 +1589,8 @@ def fit(self):
             return
 
         ray.get(
-            self.rollout_controller.validate_registered_workers_to_proxy.remote(), timeout=RL_TRAINER_RAY_GET_TIMEOUT
+            self.rollout_controller.validate_registered_workers_to_proxy.remote(),
+            timeout=RL_TRAINER_RAY_GET_TIMEOUT,
         )
 
         if self._enable_initial_evaluate and not self._debug_rollout:
@@ -1788,7 +1795,10 @@ def _resume_from_checkpoint(self, checkpoint_path: Path | str) -> None:
 
     def fit(self):
         # 对外同步 fit;内部用 async loop 组织 producer/consumer。
-        return asyncio_run(self._fit())
+        try:
+            return asyncio_run(self._fit())
+        finally:
+            self._exp_tracker.close()
 
     async def _get_batch_or_raise_producer_failure(
         self,