Invalidate trace cache @ step 0 and module 23400: cache has only 0 modules
Invalidate trace cache @ step 11: expected module 23412, but got module 23423
[rank3]: Traceback (most recent call last):
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
[rank3]: launch()
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
[rank3]: run_exp()
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/train/tuner.py", line 107, in run_exp
[rank3]: _training_function(config={"args": args, "callbacks": callbacks})
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/train/tuner.py", line 73, in _training_function
[rank3]: run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/train/ppo/workflow.py", line 72, in run_ppo
[rank3]: ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint)
[rank3]: File "/root/LLaMA-Factory/src/llamafactory/train/ppo/trainer.py", line 259, in ppo_train
[rank3]: stats = self.step(queries, responses, rewards)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/contextlib.py", line 79, in inner
[rank3]: return func(*args, **kwds)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/trl/trainer/ppo_trainer.py", line 814, in step
[rank3]: with self.accelerator.accumulate(self.model):
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/contextlib.py", line 135, in __enter__
[rank3]: return next(self.gen)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/accelerate/accelerator.py", line 1157, in accumulate
[rank3]: cm_stack.enter_context(contextlib.nullcontext() if allow_gradient_sync else self.no_sync(m))
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/contextlib.py", line 492, in enter_context
[rank3]: result = _cm_type.__enter__(cm)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/contextlib.py", line 135, in __enter__
[rank3]: return next(self.gen)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/accelerate/accelerator.py", line 1038, in no_sync
[rank3]: with context():
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/contextlib.py", line 135, in __enter__
[rank3]: return next(self.gen)
[rank3]: File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2000, in no_sync
[rank3]: assert not self.zero_optimization_partition_gradients(), \
[rank3]: AssertionError: no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage 3
0%| | 0/900 [01:48<?, ?it/s]
[rank0], [rank1], and [rank2] abort with the identical traceback and AssertionError as [rank3] above (only the rank prefix differs).
W0526 19:48:03.057000 3420008 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3420075 closing signal SIGTERM
W0526 19:48:03.057000 3420008 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3420076 closing signal SIGTERM
W0526 19:48:03.057000 3420008 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 3420077 closing signal SIGTERM
E0526 19:48:04.536000 3420008 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 3 (pid: 3420078) of binary: /root/anaconda3/envs/llama_factory_transformers/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/llama_factory_transformers/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/torch/distributed/run.py", line 918, in main
run(args)
File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/torch/distributed/run.py", line 909, in run
elastic_launch(
File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/llama_factory_transformers/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/root/LLaMA-Factory/src/llamafactory/launcher.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2025-05-26_19:48:03
host : localhost.localdomain
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 3420078)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
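A side note on the "error_file: <N/A>" line above: torchrun can capture a failing rank's full traceback to an error file when the entrypoint is wrapped with the @record decorator from torch.distributed.elastic, per the URL in the summary. A minimal sketch, assuming the decorator would be applied in launcher.py (whose actual contents differ):

    from torch.distributed.elastic.multiprocessing.errors import record

    @record  # records the failing rank's traceback so torchrun can report it
    def launch():
        ...  # the real LLaMA-Factory entrypoint body goes here

    if __name__ == "__main__":
        launch()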
Reminder
System Info
YAML file
DeepSpeed ZeRO-3 (ds3) config file
Reproduction
Others
Error message
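Note for anyone hitting the same assertion: the chain visible in the traceback is PPOTrainer.step -> Accelerator.accumulate -> Accelerator.no_sync -> DeepSpeedEngine.no_sync, and DeepSpeed refuses to enter no_sync whenever the active ZeRO stage partitions gradients (stages 2 and 3 do). Below is a minimal sketch of the interaction, assuming TRL drives gradient accumulation through accelerate as the traceback shows; the stage and accumulation values are illustrative, since the actual YAML and ds3 config were not included above.

    import contextlib

    GRAD_ACCUM_STEPS = 4   # hypothetical value; anything > 1 triggers the no_sync branch
    ZERO_STAGE = 3         # matches the ds3 config referenced in this issue

    def partition_gradients(zero_stage: int) -> bool:
        # DeepSpeed partitions gradients for ZeRO stages 2 and 3.
        return zero_stage >= 2

    def accumulate_context(step: int):
        # Simplified view of Accelerator.accumulate: gradients are synchronized
        # only every GRAD_ACCUM_STEPS steps; on all other steps accelerate enters
        # the wrapped engine's no_sync() to skip the gradient reduction.
        sync_gradients = (step + 1) % GRAD_ACCUM_STEPS == 0
        if sync_gradients:
            return contextlib.nullcontext()
        # DeepSpeedEngine.no_sync asserts here, producing the error in the log.
        assert not partition_gradients(ZERO_STAGE), (
            "no_sync context manager is incompatible with gradient "
            f"partitioning logic of ZeRO stage {ZERO_STAGE}"
        )
        return contextlib.nullcontext()

    # Step 0 with GRAD_ACCUM_STEPS > 1 is a non-sync step, so the assertion
    # fires immediately -- matching the crash at 0/900 in the log above.
    accumulate_context(step=0)

Two knobs are commonly reported to sidestep the assert (assumptions on my part, since the config files are missing from this report): set gradient_accumulation_steps to 1 for the PPO run so every step is a sync step, or use a ZeRO stage that does not partition gradients (stage 0 or 1).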