You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hello, I encountered an error during the reproduction process. Could you help me?
Training with my own dataset reports an error:
(ID2) ubuntu@s414g1:~/cssegmentation$ bash scripts/dist_train.sh 4 /home/ubuntu/cssegmentation/csseg/co
nfigs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
Killing subprocess 1686028
Killing subprocess 1686029
Killing subprocess 1686030
Killing subprocess 1686031
Traceback (most recent call last):
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 340, in <module>
main()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 326, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/miniconda3/envs/ID2/bin/python', '-u', 'csseg/train.py', '--local_rank=3', '--nproc_per_node', '4', '--cfgfilepath', '/home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py']' returned non-zero exit status 1.
The text was updated successfully, but these errors were encountered:
Hello, I encountered an error during the reproduction process. Could you help me?
Training with my own dataset reports an error:
(ID2) ubuntu@s414g1:~/cssegmentation$ bash scripts/dist_train.sh 4 /home/ubuntu/cssegmentation/csseg/co
nfigs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Filtering Images: 100%|█████████████████████████████████████████████| 274/274 [00:00<00:00, 391.28it/s]
Filtering Images: 100%|███████████████████████████████████████████████| 39/39 [00:00<00:00, 411.89it/s]
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in <module>
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in __init__
super(MIBRunner, self).__init__(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in __init__
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in __init__
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
Killing subprocess 1686028
Killing subprocess 1686029
Killing subprocess 1686030
Killing subprocess 1686031
Traceback (most recent call last):
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 340, in <module>
main()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 326, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/miniconda3/envs/ID2/bin/python', '-u', 'csseg/train.py', '--local_rank=3', '--nproc_per_node', '4', '--cfgfilepath', '/home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py']' returned non-zero exit status 1.
The text was updated successfully, but these errors were encountered: