CUDA-GPU编程
阅读原文时间:2023年07月14日阅读:1

参考:http://blog.csdn.net/augusdi/article/details/12833235  第二节

新建NVIDIA项目:

新建项目即会生成一个简单的代码demo,计算两个向量的加法,如下(main中另外加了一些打印显卡属性信息的代码):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise integer addition: each thread writes c[t] = a[t] + b[t],
// where t is the thread's x index inside its block.
//
// Only threadIdx.x is used for indexing and there is no bounds check, so the
// caller must size the block so that every thread index is a valid element
// of all three arrays.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;
    const int sum = a[idx] + b[idx];
    c[idx] = sum;
}

// Entry point.
//
// 1. Adds two 5-element host vectors on the GPU through addWithCuda and
//    prints the result.
// 2. Enumerates every CUDA device reported by the runtime and prints a
//    selection of its properties.
// 3. Resets the device, waits for a key press, and exits.
//
// Returns 0 on success and 1 as soon as any CUDA call reports an error.
int main()
{
    // Input values match the caption printed below:
    // {1,2,3,4,5} + {10,20,30,40,50}.
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // Query how many devices the runtime can see; a failure here means the
    // property dump below would be meaningless.
    int deviceCount = 0;
    cudaStatus = cudaGetDeviceCount(&deviceCount);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }

    for (int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaStatus = cudaGetDeviceProperties(&deviceProp, dev);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(cudaStatus));
            return 1;
        }

        if (dev == 0)
        {
            // Comparison only: the previous form used '=' on deviceProp.minor,
            // which overwrote the field instead of testing it.
            // NOTE(review): 9999 is taken from the commented-out condition in
            // the original article; confirm it is the intended sentinel.
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                printf("\n");
        }

        // The clock rate is read through the attribute API rather than the
        // cudaDeviceProp::clockRate field so the code does not depend on that
        // struct member being present. The value is reported in kHz.
        int clockRateKHz = 0;
        cudaStatus = cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, dev);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(cudaStatus));
            return 1;
        }

        // The memory-size fields are size_t, so they are printed with %zu;
        // %u would truncate or misread them on 64-bit builds.
        printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
        printf("Total amount of global memory                   %zu bytes\n", deviceProp.totalGlobalMem);
        printf("Number of multiprocessors                       %d\n", deviceProp.multiProcessorCount);
        printf("Total amount of constant memory:                %zu bytes\n", deviceProp.totalConstMem);
        printf("Total amount of shared memory per block         %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("Total number of registers available per block:  %d\n", deviceProp.regsPerBlock);
        printf("Warp size                                       %d\n", deviceProp.warpSize);
        printf("Maximum number of threads per block:            %d\n", deviceProp.maxThreadsPerBlock);
        printf("Maximum sizes of each dimension of a block:     %d x %d x %d\n", deviceProp.maxThreadsDim[0],
            deviceProp.maxThreadsDim[1],
            deviceProp.maxThreadsDim[2]);
        printf("Maximum size of each dimension of a grid:       %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("Maximum memory pitch :                          %zu bytes\n", deviceProp.memPitch);
        printf("Texture alignment                               %zu bytes\n", deviceProp.texturePitchAlignment);
        printf("Clock rate                                      %.2f GHz\n", clockRateKHz * 1e-6f);
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It is done after the device queries so that it is the last CUDA call.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    printf("\nTest PASSED\n");

    // Keep the console window open until the user presses a key.
    getchar();
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output array, receives a[i] + b[i] for i in [0, size)
//   a, b - host input arrays of `size` ints each
//   size - number of elements to add
//
// Returns cudaSuccess on success, otherwise the first CUDA error encountered
// (a short message is also written to stderr). Device buffers are always
// released before returning.
//
// The kernel is launched as a single block of `size` threads, so `size` must
// not exceed the device's per-block thread limit; a larger value is reported
// as a launch error. An empty input (size == 0) is treated as a successful
// no-op instead of being passed on as a zero-thread launch.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    // Byte count shared by every allocation and copy; computed once in size_t
    // so the multiplication is not done in 32-bit arithmetic.
    const size_t bytes = (size_t)size * sizeof(int);
    cudaError_t cudaStatus;

    // Nothing to compute for an empty vector.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element
    // (one block, `size` threads).
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on both success and failure; pointers that were never allocated
    // are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章