Cuder - 用C++11封装的CUDA类
阅读原文时间:2023年07月11日阅读:1

以前写cuda:初始化环境,申请显存,初始化显存,launch kernel,拷贝数据,释放显存。一个页面大部分都是这些繁杂但又必须的操作,有时还会忘掉释放部分显存。

今天用C++11封装了这些CUDA操作,然后就可以专注于写kernel代码了。.cu文件就像glsl shader文件一样简洁明了。

例如:./kernel.cu文件,里面只有一个fill函数用于填充数组A。

// Fills A[0..cnt) using a grid-stride loop, so any grid/block configuration
// covers the whole array (including a 1-block debug launch).
// Declared extern "C" so the driver API / NVRTC can resolve the kernel by
// its unmangled name ("fill").
extern "C" __global__ void fill(int * A, int cnt){
    const int gap = blockDim.x * gridDim.x;  // total number of threads in the grid
    for (int id = blockDim.x * blockIdx.x + threadIdx.x; id < cnt; id += gap)
        A[id] = id * 2;  // NOTE(review): the multiplier literal was lost in extraction; 2 is a placeholder — confirm against the original
}

下面的main.cpp演示了Cuder类的使用。

#include "Cuder.h"

// Demo array size. NOTE(review): the original literal was lost in
// extraction; 1024 is a reconstruction — confirm.
const int N = 1024;
std::string get_ptx_path(const char*);

int main(){
    // Host-side input: A[i] = i.
    int A[N];
    for (int i = 0; i < N; ++i) A[i] = i;

    // The constructor is private to forbid arbitrary CUcontext creation;
    // for safety the copy constructor and copy assignment are deleted.
    redips::Cuder cuder = redips::Cuder::getInstance();

    // Add and compile a .cu file (much like a glsl shader file), or add a
    // pre-built .ptx file directly.
    //std::string module_file = "kernel.cu";
    std::string module_file = get_ptx_path("kernel.cu");
    cuder.addModule(module_file);

    // Allocate a device array of [sizeof(int)*N] bytes named "a_dev" (the
    // name identifies this array in later calls); because the third argument
    // is non-null, the host data is also copied cpu->gpu.
    cuder.applyArray("a_dev", sizeof(int) * N, A);

    // Run the ["fill"] kernel of ["./kernel.cu"]; the first two arguments set
    // gridSize and blockSize. { "a_dev", N } is a C++11 initializer_list: a
    // string refers to a previously allocated device array, anything else is
    // passed by value. NOTE(review): the original grid/block literals were
    // lost in extraction — 32 blocks of 256 threads is a placeholder.
    cuder.launch(dim3(32, 1, 1), dim3(256, 1, 1), module_file, "fill", { "a_dev", N });

    // Copy the device array ["a_dev"] back into [A].
    cuder.fetchArray("a_dev", sizeof(int) * N, A);
    return 0;
}

// Maps a .cu source name to the path of its pre-built PTX file.
// Layout: ./ptx/<platform>/<configuration>/<cuFile>.ptx, where platform and
// configuration are chosen at compile time from WIN32 / _DEBUG.
std::string get_ptx_path(const char* cuFile){
    std::string result("./ptx/");

#ifdef WIN32
    result.append("Win32/");
#else
    result.append("x64/");
#endif

#ifdef _DEBUG
    result.append("Debug/");
#else
    result.append("Release/");
#endif

    result.append(cuFile);
    result.append(".ptx");
    return result;
}

cuder.addModule(…)函数的参数是一个.cu文件或者.ptx文件。

1. 如果是.cu文件,该函数负责将函数编译成ptx代码。然后封装到CUmodule里。
2. 如果是.ptx文件,该函数只是将ptx封装到CUmodule里。
建议使用第二种方式,nvidia的optix就是这么做的。好处是:在编译阶段编译总比运行时编译好,如果代码有错误,在编译时就会得到提示。这种方式需要两点配置:
2.a 在生成依赖项里添加cuda 编译器,然后相应的.cu文件设定为用该编译器编译。
2.b 设定将.cu文件生成到指定路径下的ptx文件,然后在程序中指定该ptx文件的路径。

下面贴上Cuder.h的代码

#pragma once
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace redips{
class Cuder{
CUcontext context;
std::map modules;
std::map devptrs;

    Cuder(){  
        checkCudaErrors(cuCtxCreate(&context, , cuDevice));  
    }  
    void release(){  
        //for (auto module : modules) delete module.second;  
        for (auto dptr : devptrs)    cuMemFree(dptr.second);  
        devptrs.clear();  
        modules.clear();  
        cuCtxDestroy(context);  
    }  
public:  
    class ValueHolder{  
    public:  
        void \* value = nullptr;  
        bool is\_string = false;  
        ValueHolder(const char\* str){  
            value = (void\*)str;  
            is\_string = true;  
        }  
        template <typename T>  
        ValueHolder(const T& data){  
            value = new T(data);  
        }  
    };

    static Cuder getInstance(){  
        if (!cuda\_enviroment\_initialized) initialize();  
        return Cuder();  
    }

    //forbidden copy-constructor and assignment function  
    Cuder(const Cuder&) = delete;  
    Cuder& operator= (const Cuder& another) = delete;

    Cuder(Cuder&& another){  
        this->context = another.context;  
        another.context = nullptr;  
        this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));  
        this->modules = std::map<std::string, CUmodule>(std::move(another.modules));  
    }  
    Cuder& operator= (Cuder&& another) {  
        if (this->context == another.context) return \*this;  
        release();  
        this->context = another.context;  
        another.context = nullptr;  
        this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));  
        this->modules = std::map<std::string, CUmodule>(std::move(another.modules));  
        return \*this;  
    }

    virtual ~Cuder(){ release();    };

public:  
    bool launch(dim3 gridDim, dim3 blockDim, std::string module, std::string kernel\_function, std::initializer\_list<ValueHolder> params){  
        //get kernel address  
        if (!modules.count(module)){  
            std::cerr << "\[Cuder\] : error: doesn't exists an module named " << module << std::endl; return false;  
        }  
        CUfunction kernel\_addr;  
        if (CUDA\_SUCCESS != cuModuleGetFunction(&kernel\_addr, modules\[module\], kernel\_function.c\_str())){  
            std::cerr << "\[Cuder\] : error: doesn't exists an kernel named " << kernel\_function << " in module " << module << std::endl; return false;  
        }  
        //setup params  
        std::vector<void\*> pamary;  
        for (auto v : params){  
            if (v.is\_string){  
                if (devptrs.count((const char\*)(v.value))) pamary.push\_back((void\*)(&(devptrs\[(const char\*)(v.value)\])));  
                else{  
                    std::cerr << "\[Cuder\] : error: launch failed. doesn't exists an array named " << (const char\*)(v.value) << std::endl;;  
                    return false;  
                }  
            }  
            else pamary.push\_back(v.value);  
        }

        cudaEvent\_t start, stop;  
        float elapsedTime = 0.0;  
        cudaEventCreate(&start);  
        cudaEventCreate(&stop);  
        cudaEventRecord(start, );

        bool result = (CUDA\_SUCCESS == cuLaunchKernel(kernel\_addr,/\* grid dim \*/gridDim.x, gridDim.y, gridDim.z, /\* block dim \*/blockDim.x, blockDim.y, blockDim.z, /\* shared mem, stream \*/ , , &pamary\[\], /\* arguments \*/));  
        cuCtxSynchronize();

        cudaEventRecord(stop, );  
        cudaEventSynchronize(stop);  
        cudaEventElapsedTime(&elapsedTime, start, stop);  
        std::cout << "\[Cuder\] : launch finish. cost " << elapsedTime << "ms" << std::endl;  
        return result;  
    }  
    bool addModule(std::string cufile){  
        if (modules.count(cufile)){  
            std::cerr << "\[Cuder\] : error: already has an modules named " << cufile << std::endl;;  
            return false;  
        }

        std::string ptx = get\_ptx(cufile);

        if (ptx.length() > ){  
            CUmodule module;  
            checkCudaErrors(cuModuleLoadDataEx(&module, ptx.c\_str(), , , ));  
            modules\[cufile\] = module;  
            return true;  
        }  
        else{  
            std::cerr << "\[Cuder\] : error: add module " << cufile << " failed!\\n";  
            return false;  
        }  
    }  
    void applyArray(const char\* name, size\_t size, void\* h\_ptr=nullptr){  
        if (devptrs.count(name)){  
            std::cerr << "\[Cuder\] : error: already has an array named " << name << std::endl;;  
            return;  
        }  
        CUdeviceptr d\_ptr;  
        checkCudaErrors(cuMemAlloc(&d\_ptr, size));  
        if (h\_ptr)  
            checkCudaErrors(cuMemcpyHtoD(d\_ptr, h\_ptr, size));  
        devptrs\[name\] = d\_ptr;  
    }  
    void fetchArray(const char\* name, size\_t size,void \* h\_ptr){  
        if (!devptrs.count(name)){  
            std::cerr << "\[Cuder\] : error: doesn't exists an array named " << name << std::endl;;  
            return;  
        }  
        checkCudaErrors(cuMemcpyDtoH(h\_ptr, devptrs\[name\], size));  
    }

private:  
    static int devID;  
    static CUdevice cuDevice;  
    static bool cuda\_enviroment\_initialized;  
    static void initialize(){  
        // picks the best CUDA device \[with highest Gflops/s\] available  
        devID = gpuGetMaxGflopsDeviceIdDRV();  
        checkCudaErrors(cuDeviceGet(&cuDevice, devID));  
        // print device information  
        {  
            char name\[\]; int major = , minor = ;  
            checkCudaErrors(cuDeviceGetName(name, , cuDevice));  
            checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice));  
            printf("\[Cuder\] : Using CUDA Device \[%d\]: %s, %d.%d compute capability\\n", devID, name, major, minor);  
        }  
        //initialize  
        checkCudaErrors(cuInit());

        cuda\_enviroment\_initialized = true;  
    }  
    //如果是ptx文件则直接返回文件内容,如果是cu文件则编译后返回ptx  
    std::string get\_ptx(std::string filename){  
        std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);  
        if (!inputFile.is\_open()) {  
            std::cerr << "\[Cuder\] : error: unable to open " << filename << " for reading!\\n";  
            return "";  
        }

        std::streampos pos = inputFile.tellg();  
        size\_t inputSize = (size\_t)pos;  
        char \* memBlock = new char\[inputSize + \];

        inputFile.seekg(, std::ios::beg);  
        inputFile.read(memBlock, inputSize);  
        inputFile.close();  
        memBlock\[inputSize\] = '\\x0';

        if (filename.find(".ptx") != std::string::npos)  
            return std::string(std::move(memBlock));  
        // compile  
        nvrtcProgram prog;  
        if (nvrtcCreateProgram(&prog, memBlock, filename.c\_str(), , NULL, NULL) == NVRTC\_SUCCESS){  
            delete memBlock;  
            if (nvrtcCompileProgram(prog, , nullptr) == NVRTC\_SUCCESS){  
                // dump log  
                size\_t logSize;  
                nvrtcGetProgramLogSize(prog, &logSize);  
                if (logSize>){  
                    char \*log = new char\[logSize + \];  
                    nvrtcGetProgramLog(prog, log);  
                    log\[logSize\] = '\\x0';  
                    std::cout << "\[Cuder\] : compile \[" << filename << "\] " << log << std::endl;  
                    delete(log);  
                }  
                else std::cout << "\[Cuder\] : compile \[" << filename << "\] finish" << std::endl;

                // fetch PTX  
                size\_t ptxSize;  
                nvrtcGetPTXSize(prog, &ptxSize);  
                char \*ptx = new char\[ptxSize+\];  
                nvrtcGetPTX(prog, ptx);  
                nvrtcDestroyProgram(&prog);  
                return std::string(std::move(ptx));  
            }  
        }  
        delete memBlock;  
        return "";  
    }  
};  
bool Cuder::cuda\_enviroment\_initialized = false;  
int Cuder::devID = ;  
CUdevice Cuder::cuDevice = ;  

};

下面贴一下VS里面需要的配置

//include
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.5\common\inc
//lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64

cuda.lib
cudart.lib
nvrtc.lib

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章