From 4daf976c0e1c8cc1a3192da9d5faf51cdd10e01d Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 24 Jun 2024 22:08:05 +0000
Subject: [PATCH] radeon fix - compile with same launch params as instinct

---
 .../selective_scan_bwd_kernel.cuh |  6 ------
 .../selective_scan_fwd_kernel.cuh |  6 ------
 setup.py                          | 18 ------------------
 3 files changed, 30 deletions(-)

diff --git a/csrc/selective_scan/selective_scan_bwd_kernel.cuh b/csrc/selective_scan/selective_scan_bwd_kernel.cuh
index 737420f0..c720ba28 100755
--- a/csrc/selective_scan/selective_scan_bwd_kernel.cuh
+++ b/csrc/selective_scan/selective_scan_bwd_kernel.cuh
@@ -536,12 +536,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_bwd_cuda(SSMParamsBwd &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        #define warp_size 32
-    #else
-        #define warp_size ROCM_WARP_SIZE
-    #endif
-
-    #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_bwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
diff --git a/csrc/selective_scan/selective_scan_fwd_kernel.cuh b/csrc/selective_scan/selective_scan_fwd_kernel.cuh
index e15ab81b..80e9e37e 100755
--- a/csrc/selective_scan/selective_scan_fwd_kernel.cuh
+++ b/csrc/selective_scan/selective_scan_fwd_kernel.cuh
@@ -351,12 +351,6 @@ template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        #define warp_size 32
-    #else
-        #define warp_size ROCM_WARP_SIZE
-    #endif
-
-    #if warp_size == 32
     if (params.seqlen <= 128) {
         selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
     } else if (params.seqlen <= 256) {
diff --git a/setup.py b/setup.py
index a1663fff..e162e205 100755
--- a/setup.py
+++ b/setup.py
@@ -199,23 +199,6 @@ def append_nvcc_threads(nvcc_extra_args):
 
 if HIP_BUILD:
 
-    try:
-        # set warp size based on gcn architecure
-        gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
-        if "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name:
-            # radeon
-            warp_size = 32
-        else:
-            # instinct
-            warp_size = 64
-    except AttributeError as e:
-        # fall back to crude method to set warp size
-        device_name = torch.cuda.get_device_properties(0).name
-        if 'instinct' in device_name.lower():
-            warp_size = 64
-        else:
-            warp_size = 32
-
     extra_compile_args = {
         "cxx": ["-O3", "-std=c++17"],
         "nvcc": [
@@ -226,7 +209,6 @@ def append_nvcc_threads(nvcc_extra_args):
             "-U__CUDA_NO_HALF_CONVERSIONS__",
             "-DCK_FMHA_FWD_FAST_EXP2=1",
             "-fgpu-flush-denormals-to-zero",
-            f"-DROCM_WARP_SIZE={warp_size}"
         ] + cc_flag,
     }