From 4daf976c0e1c8cc1a3192da9d5faf51cdd10e01d Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 24 Jun 2024 22:08:05 +0000
Subject: [PATCH] radeon fix - compile with same launch params as instinct

---
 .../selective_scan_bwd_kernel.cuh |  6 ------
 .../selective_scan_fwd_kernel.cuh |  6 ------
 setup.py                          | 18 ------------------
 3 files changed, 30 deletions(-)

diff --git a/csrc/selective_scan/selective_scan_bwd_kernel.cuh b/csrc/selective_scan/selective_scan_bwd_kernel.cuh
index 737420f0..c720ba28 100755
--- a/csrc/selective_scan/selective_scan_bwd_kernel.cuh
+++ b/csrc/selective_scan/selective_scan_bwd_kernel.cuh
@@ -536,12 +536,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_bwd_cuda(SSMParamsBwd &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        #define warp_size 32
-    #else
-        #define warp_size ROCM_WARP_SIZE
-    #endif
-
-    #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_bwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
diff --git a/csrc/selective_scan/selective_scan_fwd_kernel.cuh b/csrc/selective_scan/selective_scan_fwd_kernel.cuh
index e15ab81b..80e9e37e 100755
--- a/csrc/selective_scan/selective_scan_fwd_kernel.cuh
+++ b/csrc/selective_scan/selective_scan_fwd_kernel.cuh
@@ -351,12 +351,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        #define warp_size 32
-    #else
-        #define warp_size ROCM_WARP_SIZE
-    #endif
-
-    #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
diff --git a/setup.py b/setup.py
index a1663fff..e162e205 100755
--- a/setup.py
+++ b/setup.py
@@ -199,23 +199,6 @@ def append_nvcc_threads(nvcc_extra_args):
 
 if HIP_BUILD:
 
-    try:
-        # set warp size based on gcn architecure
-        gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
-        if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
-            # radeon
-            warp_size = 32
-        else:
-            # instinct
-            warp_size = 64
-    except AttributeError as e:
-        # fall back to crude method to set warp size
-        device_name = torch.cuda.get_device_properties(0).name
-        if 'instinct' in device_name.lower():
-            warp_size = 64
-        else:
-            warp_size = 32
-
     extra_compile_args = {
         "cxx": ["-O3", "-std=c++17"],
         "nvcc": [
@@ -226,7 +209,6 @@ def append_nvcc_threads(nvcc_extra_args):
             "-U__CUDA_NO_HALF_CONVERSIONS__",
             "-DCK_FMHA_FWD_FAST_EXP2=1",
             "-fgpu-flush-denormals-to-zero",
-            f"-DROCM_WARP_SIZE={warp_size}"
         ] + cc_flag,
     }