aws · SuhitK · Apr 23, 2024 · May 1, 2024 · viclzhu · Apr 23, 2024
diff --git a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py
@@ -273,7 +273,7 @@ def train(
  throughput = sample_processed / step_time
  throughputs.append(throughput)
 
- tflops_per_gpu = compute_tflops(throughput, num_params, world_size, batch_seqlen)
+ tflops_per_gpu = compute_tflops(args, sample_processed, step_time, dp_size)
 
  if not total_steps % args.logging_freq and args.log_reduced_training_loss > 0:
  loss_scalar = reduce_loss(loss)

diff --git a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py
@@ -37,6 +37,8 @@ def compute_tflops(args, global_batch_size, step_time, world_size):
  # Based on 
  # https://github.com/NVIDIA/Megatron-LM/blob/ba773259dbe5735fbd91ca41e7f4ded60b335c52/megatron/training/training.py#L65
  num_experts_routed_to = 1 if args.moe > 1 else args.num_experts_per_tok
+ if args.num_key_value_heads is None:
+ args.num_key_value_heads = args.num_heads
  num_flops = (
  12
  * global_batch_size