Fix for Issue #412 [AMD official] #419

Merged: 1 commit, Jun 30, 2024
csrc/selective_scan/selective_scan_bwd_kernel.cuh (0 additions, 6 deletions)

@@ -536,12 +536,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_bwd_cuda(SSMParamsBwd &params, cudaStream_t stream) {
 
-#ifndef USE_ROCM
-#define warp_size 32
-#else
-#define warp_size ROCM_WARP_SIZE
-#endif
-
 #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_bwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
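For context: the deleted preprocessor block was duplicated verbatim in both kernel files and depended on the build passing -DROCM_WARP_SIZE (removed from setup.py below). One way such a definition could be centralized without any build-time probe is to let the HIP compiler report the wavefront width; the following is a hedged sketch of that pattern, not necessarily what this PR actually ships:

// Hypothetical shared header centralizing the warp-size definition.
// NVIDIA warps are always 32 wide; AMD wavefronts are 64 on Instinct
// (CDNA) parts and 32 on Radeon (RDNA, gfx10/gfx11) parts.
#ifndef USE_ROCM
    #define WARP_SIZE 32
#elif defined(__AMDGCN_WAVEFRONT_SIZE)
    // HIP/Clang defines this macro when compiling for amdgcn targets,
    // so no GPU has to be visible at build time (unlike the removed
    // setup.py probe shown further down).
    #define WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
#else
    #define WARP_SIZE 64  // conservative fallback for ROCm builds
#endif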
csrc/selective_scan/selective_scan_fwd_kernel.cuh (0 additions, 6 deletions)

@@ -351,12 +351,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
-#ifndef USE_ROCM
-#define warp_size 32
-#else
-#define warp_size ROCM_WARP_SIZE
-#endif
-
 #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
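The surviving code dispatches on params.seqlen to different template instantiations of the launch function; the call sites visible in the hunk suggest the shape sketched below. Everything past the first branch is an illustrative assumption, since the hunk is truncated after the `<= 256` line:

// Sketch of the seqlen-based dispatch visible in the hunk above. Each
// instantiation covers kNThreads * kNItems elements per chunk, so the
// pair is picked to span params.seqlen. Branches after `<= 128` use
// assumed values; the real file continues past the visible hunk.
template<typename input_t, typename weight_t>
void dispatch_fwd_sketch(SSMParamsBase &params, cudaStream_t stream) {
    if (params.seqlen <= 128) {
        selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);   // 32 * 4 = 128
    } else if (params.seqlen <= 256) {
        selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);   // assumed: 32 * 8 = 256
    } else {
        selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); // assumed larger tile
    }
}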
setup.py (0 additions, 18 deletions)

@@ -199,23 +199,6 @@ def append_nvcc_threads(nvcc_extra_args):
 
 if HIP_BUILD:
 
-    try:
-        # set warp size based on gcn architecture
-        gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
-        if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
-            # radeon
-            warp_size = 32
-        else:
-            # instinct
-            warp_size = 64
-    except AttributeError as e:
-        # fall back to crude method to set warp size
-        device_name = torch.cuda.get_device_properties(0).name
-        if 'instinct' in device_name.lower():
-            warp_size = 64
-        else:
-            warp_size = 32
-
     extra_compile_args = {
         "cxx": ["-O3", "-std=c++17"],
         "nvcc": [
@@ -226,7 +209,6 @@ def append_nvcc_threads(nvcc_extra_args):
             "-U__CUDA_NO_HALF_CONVERSIONS__",
             "-DCK_FMHA_FWD_FAST_EXP2=1",
             "-fgpu-flush-denormals-to-zero",
-            f"-DROCM_WARP_SIZE={warp_size}"
         ]
         + cc_flag,
     }
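The removed block called torch.cuda.get_device_properties(0) while setup.py ran, so building the wheel required a visible, initialized GPU, and the detected warp size was then frozen into the binary via -DROCM_WARP_SIZE. If the width is ever wanted at runtime instead of compile time, both CUDA and HIP expose it through the device properties; a minimal sketch (the device_warp_size helper is hypothetical):

// Hypothetical runtime query: the warp/wavefront width is a field of the
// device properties, so no architecture-name parsing is required.
#include <cuda_runtime.h>  // a ROCm build would hipify this to <hip/hip_runtime.h>

int device_warp_size(int device_id = 0) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device_id);
    return prop.warpSize;  // 32 on NVIDIA and RDNA, 64 on CDNA (Instinct)
}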