
Commit

Merge pull request #86 from laekov/v0.3.0-rc
v0.3.0 Release
laekov committed Nov 8, 2021
2 parents 3397bc1 + a461be6 commit acf8bec
Showing 5 changed files with 53 additions and 25 deletions.
20 changes: 10 additions & 10 deletions cuda/utils/cublas_wrapper.h
@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
const c10::Half *beta,
c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
return rocblas_hgemm(handle, transa, transb, m, n, k,
(const rocblas_half*)alpha,
(const rocblas_half*)A, lda,
(const rocblas_half*)B, ldb,
(const rocblas_half*)beta,
(rocblas_half*)C, ldc);
#else
return cublasHgemm(handle, transa, transb, m, n, k,
(const __half*)alpha,
(const __half*)A, lda,
(const __half*)B, ldb,
(const __half*)beta,
(__half*)C, ldc);
#endif
}
21 changes: 11 additions & 10 deletions cuda/utils/helper_cuda.h
@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {

case rocblas_status_success:
return "rocblas_status_success";

case rocblas_status_invalid_handle:
return "rocblas_status_invalid_handle";

case rocblas_status_not_implemented:
return "rocblas_status_not_implemented";

case rocblas_status_invalid_pointer:
return "rocblas_status_invalid_pointer:";

case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";

case rocblas_status_memory_error:
return "rocblas_status_memory_error";

case rocblas_status_internal_error:
return "rocblas_status_internal_error";

case rocblas_status_perf_degraded:
return "rocblas_status_perf_degraded";

@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {

case rocblas_status_size_increased:
return "rocblas_status_size_increased";

case rocblas_status_size_unchanged:
return "rocblas_status_size_unchanged";

case rocblas_status_invalid_value:
return "rocblas_status_invalid_value";

case rocblas_status_continue:
return "rocblas_status_continue";
}
@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

#endif // HELPER_CUDA_H

31 changes: 29 additions & 2 deletions doc/release-note.md
@@ -1,18 +1,45 @@
## v0.3.0

### FMoE core

* The previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch and each processes a slice of it (see the sketch below). `mp_group` will be deprecated in our next release.
* ROCm is now supported.
* `FMoELinear` is moved to a stand-alone file.
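
Below is a minimal usage sketch (not part of this commit) of constructing an `FMoE` layer with the renamed `slice_group` argument. The surrounding argument names (`num_expert`, `d_model`, `world_size`, `top_k`) are assumed from the usual constructor and may differ between versions.

```python
import torch.distributed as dist
from fmoe import FMoE

# Workers 0 and 1 receive the same input batch and each computes one slice of it.
slice_group = dist.new_group(ranks=[0, 1])

moe = FMoE(
    num_expert=4,
    d_model=1024,
    world_size=dist.get_world_size(),
    slice_group=slice_group,  # replaces the soon-to-be-deprecated `mp_group`
    top_k=2,
)
```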

### Grouped data parallel

* Support arbitrary group names, referenced by their relative tag names (see the sketch below).
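
A heavily hedged sketch of the idea: each parameter carries a tag (assumed here to be the `dp_comm` attribute), and gradients for a tag are all-reduced over the process group registered under that tag via a `<tag>_group` keyword. The attribute name and keyword convention are assumptions, not confirmed by this diff.

```python
import torch.nn as nn
import torch.distributed as dist
from fmoe.distributed import DistributedGroupedDataParallel

model = nn.Linear(16, 16)
gate_group = dist.new_group(ranks=[0, 1])

for p in model.parameters():
    p.dp_comm = "gate"  # tag: reduce these gradients over the "gate" group

model = DistributedGroupedDataParallel(model, gate_group=gate_group)
```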

### Load balancing

* A brand-new balancing strategy, SWIPE, contributed by the authors of a (currently unpublished) paper.
* A `has_loss` property is added to each gate to indicate whether its balance loss should be collected (see the sketch below).
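
As an illustration, here is a hypothetical helper (not from this release) that uses the new `has_loss` flag to gather balance losses after a forward pass; the `gate` attribute name and the `get_loss()` call follow the usual FMoE gate interface and are assumptions here.

```python
import torch

def collect_balance_loss(model):
    """Sum balance losses from every gate that reports `has_loss`."""
    total = torch.zeros(())
    for module in model.modules():
        gate = getattr(module, "gate", None)
        if gate is not None and getattr(gate, "has_loss", False):
            loss = gate.get_loss()
            if loss is not None:
                total = total + loss
    return total
```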

### Megatron-LM support

* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gates in `MegatronMLP` (see the sketch below).
* Move the patches to a stand-alone file.
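
A hypothetical sketch of passing a customized gate through the Megatron patch; the exact keyword names accepted by `fmoefy` (here `num_experts` and `gate`) are assumptions based on this diff and may not match the actual signature.

```python
from fmoe.megatron import fmoefy
from fmoe.gates import NaiveGate  # any customized gate class could go here

def provide_moe_model(build_megatron_model, num_experts=8):
    """Wrap a Megatron model provider so its MLPs become MoE layers."""
    model = build_megatron_model()
    return fmoefy(model, num_experts=num_experts, gate=NaiveGate)
```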

### Tests

* Move util functions into `test_ddp.py`.

## v0.2.1

### Load balancing

* Fix gradient for balance loss.

### Misc

* Fix typos.
* Update benchmark interface.
* Remove some redundant code for performance improvement.
* Enable `USE_NCCL` by default.
* Compatibility with PyTorch `<1.8.0` and `>=1.8.0`.

### Megatron adaption

* Patch for numerical correctness of gradient clipping.
* Support for pipeline parallelism.
Expand Down
2 changes: 1 addition & 1 deletion fmoe/layers.py
@@ -72,7 +72,7 @@ class FMoE(nn.Module):
group hold the same copy of input feature, and requires the same copy of
the output. For each worker, FMoE only computes the output of a certain
slice of the input batch, and will all-gather the outputs after
computation.
* `top_k` stands for the number of experts each token is going to.
* `gate` is a gate class which can be found in `fmoe.gates`.
* `expert` can be specified as a module class, it is used to generate
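
The docstring above describes the slice-then-all-gather behaviour of the slice group. Below is a minimal, self-contained sketch of that pattern in plain `torch.distributed`; it illustrates the idea only and is not FMoE's actual implementation. It assumes the batch size is divisible by the group size.

```python
import torch
import torch.distributed as dist

def sliced_forward(module, inp, slice_group):
    """Each worker computes one slice of the batch, then outputs are all-gathered."""
    rank = dist.get_rank(slice_group)
    size = dist.get_world_size(slice_group)
    local_out = module(inp.chunk(size, dim=0)[rank])  # compute only our slice
    gathered = [torch.empty_like(local_out) for _ in range(size)]
    dist.all_gather(gathered, local_out, group=slice_group)
    return torch.cat(gathered, dim=0)                 # full-batch output
```
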
4 changes: 2 additions & 2 deletions fmoe/megatron/patch.py
@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
def fmoefied_model_provider():
from .layers import fmoefy
args = get_args()
hhs = args.hidden_size * 4
assert hhs % args.top_k == 0
hhs = hhs // args.top_k
assert hhs % args.tensor_model_parallel_size == 0
hhs = hhs // args.tensor_model_parallel_size
return fmoefy(
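
For reference, the hidden-size arithmetic in the patch above works out as follows for one illustrative configuration (the values are made up; only the formula comes from the diff).

```python
# Per-expert hidden size: the 4x FFN size divided by top_k, then by the
# tensor-model-parallel size.
hidden_size = 1024
top_k = 2
tensor_model_parallel_size = 4

hhs = hidden_size * 4                         # 4096
assert hhs % top_k == 0
hhs = hhs // top_k                            # 2048
assert hhs % tensor_model_parallel_size == 0
hhs = hhs // tensor_model_parallel_size       # 512
```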
