以前写cuda:初始化环境,申请显存,初始化显存,launch kernel,拷贝数据,释放显存。一个页面大部分都是这些繁杂但又必须的操作,有时还会忘掉释放部分显存。
今天用C++11封装了这些CUDA操作,然后就可以专注于写kernel代码了。.cu文件就像glsl shader文件一样简洁明了。
例如:./kernel.cu文件,里面只有一个fill函数用于填充数组A。
// Fills A[0..cnt) using a grid-stride loop, so any grid/block configuration
// covers the whole array (including a 1-block debug launch).
// NOTE(review): the multiplier literal was lost in extraction; restored as
// `id * id` — confirm against the original kernel source.
extern "C" __global__ void fill(int * A, int cnt){
	const int gap = blockDim.x*gridDim.x;
	for (int id = blockDim.x*blockIdx.x + threadIdx.x; id < cnt; id += gap)
		A[id] = id * id;
}
下面的main.cpp演示了Cuder类的使用。
#include "Cuder.h"
// Number of demo-array elements. NOTE(review): literal lost in extraction;
// any positive size works with the grid-stride kernel.
const int N = 1024;
std::string get_ptx_path(const char*);
// Demonstrates the Cuder wrapper: load a module, allocate + upload a device
// array, launch a kernel, and copy the result back.
int main(){
	int A[N]; for (int i = 0; i < N; ++i) A[i] = i;
	// To forbid arbitrary CUcontext creation the constructor is private; the
	// copy constructor and copy assignment are deleted for safety.
	redips::Cuder cuder = redips::Cuder::getInstance();
	// Add and compile a .cu file (much like a glsl shader file), or add a
	// pre-built .ptx file directly.
	//std::string module_file = "kernel.cu";
	std::string module_file = get_ptx_path("kernel.cu");
	cuder.addModule(module_file);
	// Allocate a device array of [sizeof(int)*N] bytes and name it ["a_dev"];
	// the name identifies this array in later calls. When the third argument
	// is non-null, the data is also copied cpu->gpu.
	cuder.applyArray("a_dev", sizeof(int)*N, A);
	// Run the ["fill"] kernel from the module; the first two arguments set
	// gridSize and blockSize. NOTE(review): launch-dim literals lost in
	// extraction; restored to a typical 32x256 configuration.
	// { "a_dev", N } is a C++11 initializer_list: a string names a previously
	// allocated device array, any other value is passed by value.
	cuder.launch(dim3(32, 1, 1), dim3(256, 1, 1), module_file, "fill", { "a_dev", N });
	// Copy the device array ["a_dev"] back into [A].
	cuder.fetchArray("a_dev", sizeof(int)*N, A);
	return 0;
}
// Maps a .cu source file name to the path of its pre-compiled .ptx file,
// which the build places under ./ptx/<platform>/<configuration>/.
std::string get_ptx_path(const char* cuFile){
	std::string dir("./ptx/");
#ifdef WIN32
	dir.append("Win32/");
#else
	dir.append("x64/");
#endif
#ifdef _DEBUG
	dir.append("Debug/");
#else
	dir.append("Release/");
#endif
	dir.append(cuFile);
	return dir + ".ptx";
}
cuder.addModule(…)函数的参数是一个.cu文件或者.ptx文件。
1. 如果是.cu文件,该函数负责将函数编译成ptx代码。然后封装到CUmodule里。
2. 如果是.ptx文件,该函数只是将ptx封装到CUmodule里。
建议使用第二种方式,nvidia的optix就是这么做的。好处是代码里的错误能在编译阶段就被发现并提示,而不是等到运行时编译才报错。这时需要两点配置:
2.a 在生成依赖项里添加cuda 编译器,然后相应的.cu文件设定为用该编译器编译。
2.b 设定将.cu文件生成到指定路径下的ptx文件,然后在程序中指定该ptx文件的路径。
下面贴上Cuder.h的代码
#pragma once
#include
namespace redips{
class Cuder{
CUcontext context;
std::map
std::map
Cuder(){
checkCudaErrors(cuCtxCreate(&context, , cuDevice));
}
void release(){
//for (auto module : modules) delete module.second;
for (auto dptr : devptrs) cuMemFree(dptr.second);
devptrs.clear();
modules.clear();
cuCtxDestroy(context);
}
public:
class ValueHolder{
public:
void \* value = nullptr;
bool is\_string = false;
ValueHolder(const char\* str){
value = (void\*)str;
is\_string = true;
}
template <typename T>
ValueHolder(const T& data){
value = new T(data);
}
};
static Cuder getInstance(){
if (!cuda\_enviroment\_initialized) initialize();
return Cuder();
}
//forbidden copy-constructor and assignment function
Cuder(const Cuder&) = delete;
Cuder& operator= (const Cuder& another) = delete;
Cuder(Cuder&& another){
this->context = another.context;
another.context = nullptr;
this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));
this->modules = std::map<std::string, CUmodule>(std::move(another.modules));
}
Cuder& operator= (Cuder&& another) {
if (this->context == another.context) return \*this;
release();
this->context = another.context;
another.context = nullptr;
this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));
this->modules = std::map<std::string, CUmodule>(std::move(another.modules));
return \*this;
}
virtual ~Cuder(){ release(); };
public:
bool launch(dim3 gridDim, dim3 blockDim, std::string module, std::string kernel\_function, std::initializer\_list<ValueHolder> params){
//get kernel address
if (!modules.count(module)){
std::cerr << "\[Cuder\] : error: doesn't exists an module named " << module << std::endl; return false;
}
CUfunction kernel\_addr;
if (CUDA\_SUCCESS != cuModuleGetFunction(&kernel\_addr, modules\[module\], kernel\_function.c\_str())){
std::cerr << "\[Cuder\] : error: doesn't exists an kernel named " << kernel\_function << " in module " << module << std::endl; return false;
}
//setup params
std::vector<void\*> pamary;
for (auto v : params){
if (v.is\_string){
if (devptrs.count((const char\*)(v.value))) pamary.push\_back((void\*)(&(devptrs\[(const char\*)(v.value)\])));
else{
std::cerr << "\[Cuder\] : error: launch failed. doesn't exists an array named " << (const char\*)(v.value) << std::endl;;
return false;
}
}
else pamary.push\_back(v.value);
}
cudaEvent\_t start, stop;
float elapsedTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, );
bool result = (CUDA\_SUCCESS == cuLaunchKernel(kernel\_addr,/\* grid dim \*/gridDim.x, gridDim.y, gridDim.z, /\* block dim \*/blockDim.x, blockDim.y, blockDim.z, /\* shared mem, stream \*/ , , &pamary\[\], /\* arguments \*/));
cuCtxSynchronize();
cudaEventRecord(stop, );
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
std::cout << "\[Cuder\] : launch finish. cost " << elapsedTime << "ms" << std::endl;
return result;
}
bool addModule(std::string cufile){
if (modules.count(cufile)){
std::cerr << "\[Cuder\] : error: already has an modules named " << cufile << std::endl;;
return false;
}
std::string ptx = get\_ptx(cufile);
if (ptx.length() > ){
CUmodule module;
checkCudaErrors(cuModuleLoadDataEx(&module, ptx.c\_str(), , , ));
modules\[cufile\] = module;
return true;
}
else{
std::cerr << "\[Cuder\] : error: add module " << cufile << " failed!\\n";
return false;
}
}
void applyArray(const char\* name, size\_t size, void\* h\_ptr=nullptr){
if (devptrs.count(name)){
std::cerr << "\[Cuder\] : error: already has an array named " << name << std::endl;;
return;
}
CUdeviceptr d\_ptr;
checkCudaErrors(cuMemAlloc(&d\_ptr, size));
if (h\_ptr)
checkCudaErrors(cuMemcpyHtoD(d\_ptr, h\_ptr, size));
devptrs\[name\] = d\_ptr;
}
void fetchArray(const char\* name, size\_t size,void \* h\_ptr){
if (!devptrs.count(name)){
std::cerr << "\[Cuder\] : error: doesn't exists an array named " << name << std::endl;;
return;
}
checkCudaErrors(cuMemcpyDtoH(h\_ptr, devptrs\[name\], size));
}
private:
static int devID;
static CUdevice cuDevice;
static bool cuda\_enviroment\_initialized;
static void initialize(){
// picks the best CUDA device \[with highest Gflops/s\] available
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
// print device information
{
char name\[\]; int major = , minor = ;
checkCudaErrors(cuDeviceGetName(name, , cuDevice));
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice));
printf("\[Cuder\] : Using CUDA Device \[%d\]: %s, %d.%d compute capability\\n", devID, name, major, minor);
}
//initialize
checkCudaErrors(cuInit());
cuda\_enviroment\_initialized = true;
}
//如果是ptx文件则直接返回文件内容,如果是cu文件则编译后返回ptx
std::string get\_ptx(std::string filename){
std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
if (!inputFile.is\_open()) {
std::cerr << "\[Cuder\] : error: unable to open " << filename << " for reading!\\n";
return "";
}
std::streampos pos = inputFile.tellg();
size\_t inputSize = (size\_t)pos;
char \* memBlock = new char\[inputSize + \];
inputFile.seekg(, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock\[inputSize\] = '\\x0';
if (filename.find(".ptx") != std::string::npos)
return std::string(std::move(memBlock));
// compile
nvrtcProgram prog;
if (nvrtcCreateProgram(&prog, memBlock, filename.c\_str(), , NULL, NULL) == NVRTC\_SUCCESS){
delete memBlock;
if (nvrtcCompileProgram(prog, , nullptr) == NVRTC\_SUCCESS){
// dump log
size\_t logSize;
nvrtcGetProgramLogSize(prog, &logSize);
if (logSize>){
char \*log = new char\[logSize + \];
nvrtcGetProgramLog(prog, log);
log\[logSize\] = '\\x0';
std::cout << "\[Cuder\] : compile \[" << filename << "\] " << log << std::endl;
delete(log);
}
else std::cout << "\[Cuder\] : compile \[" << filename << "\] finish" << std::endl;
// fetch PTX
size\_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char \*ptx = new char\[ptxSize+\];
nvrtcGetPTX(prog, ptx);
nvrtcDestroyProgram(&prog);
return std::string(std::move(ptx));
}
}
delete memBlock;
return "";
}
};
bool Cuder::cuda\_enviroment\_initialized = false;
int Cuder::devID = ;
CUdevice Cuder::cuDevice = ;
};
下面贴一下VS里面需要的配置
//include
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.5\common\inc
//lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64
cuda.lib
cudart.lib
nvrtc.lib
手机扫一扫
移动阅读更方便
你可能感兴趣的文章