Hello,
I encountered an issue during VQA fine-tuning. The processed dataset works well in the zero-shot setting, but when I attempt fine-tuning, validation fails at `assert step < max_len, f"{step} < {max_len}"` with `AssertionError: 200 < 200`.
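For reference, here is a minimal sketch of how I understand the failure mode, assuming the generator behaves like a standard fairseq-style decoding loop (this is my own simplification, not the actual BiomedGPT code): the loop decodes for at most `max_len` steps and asserts that generation finishes (EOS emitted) before `step` reaches `max_len`, so a model that never produces EOS within the budget ends with `step == max_len`.

```python
# Minimal sketch of the failure mode (my simplification, not the
# BiomedGPT source). The decoding loop runs for at most max_len steps
# and asserts that generation finishes (EOS emitted) before step
# reaches max_len; if EOS never appears, step hits max_len and the
# assertion fails with a message like "200 < 200".

def generate(decode_step, bos=0, eos=2, max_len=200):
    tokens = [bos]
    for step in range(max_len + 1):
        assert step < max_len, f"{step} < {max_len}"
        next_tok = decode_step(tokens)  # pick the next token id
        tokens.append(next_tok)
        if next_tok == eos:  # generation finished normally
            break
    return tokens

# A decoder that never emits EOS reproduces the error:
try:
    generate(lambda toks: 5)
except AssertionError as err:
    print("AssertionError:", err)  # AssertionError: 200 < 200
```

The complete error output from my run follows: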
2025-01-14 22:38:57 - train.py[line:448] - INFO: begin validation on "valid" subset
slice_id 0 seek offset 0
slice_id 2 seek offset 117
slice_id 0 seek offset 0
Traceback (most recent call last):
File "../../train.py", line 540, in
cli_main()
File "../../train.py", line 533, in cli_main
distributed_utils.call_main(cfg, main)
File "/raid/zsun5/EcoNET/BiomedGPT/fairseq/fairseq/distributed/utils.py", line 374, in call_main
distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
File "/raid/zsun5/EcoNET/BiomedGPT/fairseq/fairseq/distributed/utils.py", line 348, in distributed_main
main(cfg, **kwargs)
File "../../train.py", line 202, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "../../train.py", line 328, in train
cfg, trainer, task, epoch_itr, valid_subsets, end_of_epoch
File "../../train.py", line 414, in validate_and_save
valid_losses = validate(cfg, trainer, task, epoch_itr, valid_subsets)
File "../../train.py", line 484, in validate
trainer.valid_step(sample)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "/raid/zsun5/EcoNET/BiomedGPT/trainer.py", line 1059, in valid_step
sample, self.model, self.criterion, **extra_kwargs
File "/raid/zsun5/EcoNET/BiomedGPT/tasks/mm_tasks/vqa_gen.py", line 260, in valid_step
raw_hyps = self.inference_step(self.generator, [eval_model], sample, prefix_tokens=sample['prefix_tokens'])
File "/raid/zsun5/EcoNET/BiomedGPT/fairseq/fairseq/tasks/fairseq_task.py", line 518, in inference_step
models, sample, prefix_tokens=prefix_tokens, constraints=constraints
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/raid/zsun5/EcoNET/BiomedGPT/models/sequence_generator.py", line 207, in generate
return self._generate(models, sample, **kwargs)
File "/raid/zsun5/EcoNET/BiomedGPT/models/sequence_generator.py", line 480, in _generate
assert step < max_len, f"{step} < {max_len}"
AssertionError: 200 < 200
[The identical traceback is printed three more times, once per distributed worker.]
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1549730 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1549728) of binary: /raid/zsun5/anaconda3/envs/biomedgpt/bin/python3
Traceback (most recent call last):
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/launch.py", line 195, in
main()
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/run.py", line 756, in run
)(*cmd_args)
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/raid/zsun5/anaconda3/envs/biomedgpt/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 248, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
../../train.py FAILED