
fp16 compatibility #1129

Open
ianferreira opened this issue Oct 24, 2022 · 0 comments

🐛 Describe the bug

Training GPT with two RX 6800 GPUs (Navi 21):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=model_path,              # output directory where model checkpoints are saved
    evaluation_strategy="steps",        # evaluate every eval_steps steps
    overwrite_output_dir=True,
    num_train_epochs=10,                # number of training epochs, feel free to tweak
    per_device_train_batch_size=4,      # training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,      # accumulate gradients before updating the weights
    per_device_eval_batch_size=2,       # evaluation batch size
    logging_steps=100,                  # log every 100 steps
    save_steps=1000,                    # save a checkpoint every 1000 steps
    eval_steps=1000,                    # evaluate every 1000 steps
    # load_best_model_at_end=True,      # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=3,                 # keep only the 3 most recent checkpoints on disk
    fp16=True,
)
```

and

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
```
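For context, when more than one GPU is visible, Trainer wraps the model in torch.nn.DataParallel, which is why DataParallel shows up in the traceback below even though it is never requested explicitly. A minimal sketch of that wrapping (nn.Linear is a hypothetical stand-in for the GPT model, not the reporter's setup):

```python
import torch
from torch import nn

# Roughly what Trainer does internally when it sees more than one GPU (sketch, not the exact Trainer code)
model = nn.Linear(8, 8).cuda()            # hypothetical stand-in for the GPT model
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)        # forward() replicates the module onto every visible GPU

out = model(torch.randn(4, 8).cuda())     # each replica runs forward() on its own device
```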

Running trainer.train() then yields this error with the Hugging Face openai-gpt model:

```
***** Running training *****
Num examples = 637416
Num Epochs = 10
Instantaneous batch size per device = 4
Total train batch size (w. parallel, distributed & accumulation) = 64
Gradient Accumulation steps = 8
Total optimization steps = 99590

StopIteration Traceback (most recent call last)
Cell In [198], line 1
----> 1 trainer.train()

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:1500, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1495 self.model_wrapped = self.model
1497 inner_training_loop = find_executable_batch_size(
1498 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1499 )
-> 1500 return inner_training_loop(
1501 args=args,
1502 resume_from_checkpoint=resume_from_checkpoint,
1503 trial=trial,
1504 ignore_keys_for_eval=ignore_keys_for_eval,
1505 )

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:1742, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1740 tr_loss_step = self.training_step(model, inputs)
1741 else:
-> 1742 tr_loss_step = self.training_step(model, inputs)
1744 if (
1745 args.logging_nan_inf_filter
1746 and not is_torch_tpu_available()
1747 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1748 ):
1749 # if loss is nan or inf simply add the average of previous logged losses
1750 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:2486, in Trainer.training_step(self, model, inputs)
2483 return loss_mb.reduce_mean().detach().to(self.args.device)
2485 with self.compute_loss_context_manager():
-> 2486 loss = self.compute_loss(model, inputs)
2488 if self.args.n_gpu > 1:
2489 loss = loss.mean() # mean() to average on multi-gpu parallel training

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:2518, in Trainer.compute_loss(self, model, inputs, return_outputs)
2516 else:
2517 labels = None
-> 2518 outputs = model(**inputs)
2519 # Save past state if it exists
2520 # TODO: this needs to be fixed and made cleaner later.
2521 if self.args.past_index >= 0:

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:168, in DataParallel.forward(self, *inputs, **kwargs)
166 return self.module(*inputs[0], **kwargs[0])
167 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 168 outputs = self.parallel_apply(replicas, inputs, kwargs)
169 return self.gather(outputs, self.output_device)

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:178, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
177 def parallel_apply(self, replicas, inputs, kwargs):
--> 178 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py:86, in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs

File ~/.venvs/pt/lib/python3.8/site-packages/torch/_utils.py:461, in ExceptionWrapper.reraise(self)
457 except TypeError:
458 # If the exception takes multiple arguments, don't try to
459 # instantiate since we don't know how to
460 raise RuntimeError(msg) from None
--> 461 raise exception

StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ian/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/ian/.venvs/pt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ian/.venvs/pt/lib/python3.8/site-packages/transformers/models/openai/modeling_openai.py", line 481, in forward
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
```
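The StopIteration appears to come from next(self.parameters()) in modeling_openai.py line 481: on recent PyTorch versions (1.5+), the replicas created by nn.DataParallel do not expose registered parameters, so the parameter iterator inside each replica is empty and the first next() call raises. A minimal sketch of that failure mode (ParamlessModule is a hypothetical stand-in for a DataParallel replica, not part of transformers):

```python
import torch
from torch import nn

class ParamlessModule(nn.Module):
    """Stand-in for a DataParallel replica whose .parameters() iterator is empty."""
    def forward(self, attention_mask):
        # same pattern as modeling_openai.py line 481
        return attention_mask.to(dtype=next(self.parameters()).dtype)

ParamlessModule()(torch.ones(2))   # raises StopIteration, as in the traceback above
```

If this is the same DataParallel interaction seen in similar reports, restricting the run to a single visible GPU (so DataParallel is never engaged) avoids the replication path entirely.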

Versions

```
Collecting environment information...
PyTorch version: 1.12.1+rocm5.1.1
Is debug build: False
CUDA used to build PyTorch: N/A
ROCM used to build PyTorch: 5.1.20531-cacfa990

OS: Ubuntu 22.04.1 LTS (x86_64)
GCC version: (Ubuntu 11.2.0-19ubuntu1) 11.2.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: glibc-2.35

Python version: 3.8.15 (default, Oct 12 2022, 19:15:16) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.15.0-52-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to:
GPU models and configuration: AMD Radeon RX 6800 XT
Nvidia driver version: Could not collect
cuDNN version: Could not collect
HIP runtime version: 5.3.22061
MIOpen runtime version: 2.16.0
Is XNNPACK available: True

Versions of relevant libraries:
[pip3] numpy==1.23.4
[pip3] torch==1.12.1+rocm5.1.1
[pip3] torchaudio==0.12.1+rocm5.1.1
[pip3] torchvision==0.13.1+rocm5.1.1
[conda] Could not collect
```
