-
Notifications
You must be signed in to change notification settings - Fork 19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[TODO] 开发memory_reserved算子 和 性能优化 #122
Labels
Comments
结合NVTX注释上手nsys
nvtx 教程: https://nvtx.readthedocs.io/en/latest/index.html Python Demoimport numpy as np
import cupy as cp
import nvtx
@nvtx.annotate("fft function", color="blue")
def fast_fft(input_array):
with nvtx.annotate("Copy input array to GPU and CuPy", color="red"):
gpu_array = cp.array(input_array)
with nvtx.annotate("GPU FFT operation", color="yellow"):
result = cp.fft.fft(gpu_array)
with nvtx.annotate("Copy back to CPU and Numpy", color="green"):
cpu_result = cp.asnumpy(result)
return cpu_result
for i in range(5):
print(fast_fft(np.random.random(10))) 启动指令:
C++ Demo:

#include <cuda_runtime.h>
#include "nvToolsExt.h"
#include <iostream>
// 定义向量加法的 CUDA 核函数
// Kernel: writes A[i] + 1.0f into C[i] for every i in [0, N).
// Expects a 1-D launch; each thread handles exactly one element.
// NOTE(review): despite the name "vectorAdd", this adds a scalar constant,
// not a second vector — presumably intentional for this NVTX demo; confirm.
__global__ void vectorAdd(const float *A, float *C, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) {
        return;  // tail guard: the grid may be larger than N
    }
    C[idx] = A[idx] + 1.0f;
}
// 启动 CUDA 核函数
// Host wrapper: launches vectorAdd four times, wrapping the whole function
// and each individual launch in NVTX ranges so they appear in nsys.
// NOTE(review): the literal "_FUNCTION_" looks like a mangled __FUNCTION__
// macro — confirm against the original demo before relying on the range name.
void launch_kernel(const float *A, float *C, int N) {
    nvtxRangePushA("_FUNCTION_"); // open the outer range for this function
    int threadsPerBlock = 256;
    // Ceil-divide so the grid covers all N elements.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    for (int i = 0; i < 4; i++) {
        nvtxRangePushA("vectorAdd"); // open a range around one launch
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(A, C, N);
        // Kernel launches return no status directly; a bad launch
        // configuration only surfaces through cudaGetLastError().
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            std::cerr << "vectorAdd launch failed: "
                      << cudaGetErrorString(err) << std::endl;
        }
        nvtxRangePop(); // close the vectorAdd range
    }
    nvtxRangePop(); // close the outer range
}
// Demo driver: allocates managed buffers, initializes them on the host,
// runs the NVTX-annotated launches, then synchronizes and cleans up.
int main() {
    const int N = 100;
    float *A = nullptr;
    float *C = nullptr;
    // Managed (unified) memory is accessible from both host and device,
    // so the host init loop below needs no explicit cudaMemcpy.
    if (cudaMallocManaged(&A, N * sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&C, N * sizeof(float)) != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }
    for (int i = 0; i < N; i++) {
        A[i] = static_cast<float>(i);
        C[i] = 0.0f;
    }
    std::cout << "Launching kernel..." << std::endl;
    launch_kernel(A, C, N);
    // Kernel launches are asynchronous: wait for them to finish (and
    // surface any execution errors) before freeing the buffers.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "Kernel execution failed: "
                  << cudaGetErrorString(err) << std::endl;
    }
    cudaFree(A);
    cudaFree(C);
    return 0;
}
// 完成程序 (end of program)

Reference
memory_reserved算子目前打算不使用直接调CUDA API的方式,需要更改oneflow BInAllocator部分,我把他跟lazy_init放一起了,这周PR
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
profile工具上手
one-yolov5项目
如果执行nsys产生报错
请将 train.py中 check_git_status() 这一行注释
glm 项目
The text was updated successfully, but these errors were encountered: