Introduction to CUDA

1. Introduction

  • GPUs devote more transistors to data processing rather than to data caching and flow control, which is why they can deliver a high degree of parallelism.
  • GPUs hide memory-access latency with computation, instead of relying on large data caches and complex flow control to avoid long memory-access latencies.

CUDA is a general-purpose parallel computing platform and programming model introduced by NVIDIA. It leverages the parallel compute engines inside NVIDIA GPUs.

2. Programming Model

  • A kernel is just a C++ function, with one key difference: when a kernel is invoked, it is executed in parallel by N different CUDA threads, i.e., it runs N times in total.
  • A kernel is declared with the __global__ qualifier.
  • The number of CUDA threads that execute the kernel is specified inside <<<…>>>: kernelName<<<blocksPerGrid, threadsPerBlock>>>(params…)

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

int main(void)
{
    // …
    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // …
}

  • A grid contains multiple blocks, and a block contains multiple threads.
  • The built-in variable blockIdx (the index of a block within the grid) has three components, and so does threadIdx (the index of a thread within its block). This means the blocks of a grid can be organized in up to three dimensions, and so can the threads of a block. For example, blockIdx.x is the block's x coordinate within the grid (which block it is along the x axis), and threadIdx.x is the thread's x coordinate within its block.
  • The size of each block dimension is available through the built-in variable blockDim; e.g., blockDim.x is the number of threads per block along the x axis.
  • Thread index and thread ID map to each other as follows (a short kernel sketch follows the table):

Block dimensionality    Block size       Thread index    Thread ID
1-D                     (Dx)             (x)             x
2-D                     (Dx, Dy)         (x, y)          y * Dx + x
3-D                     (Dx, Dy, Dz)     (x, y, z)       z * Dx * Dy + y * Dx + x
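
As a minimal sketch of this mapping (the kernel and variable names are illustrative, not from the original samples), the following kernel computes each thread's linear thread ID within its block:

// Minimal sketch: compute the linear thread ID within a (possibly 3-D) block.
// Matches the table above: id = z * Dx * Dy + y * Dx + x.
__global__ void ShowThreadId(int *out)
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    int Dx = blockDim.x, Dy = blockDim.y;

    int threadId = z * Dx * Dy + y * Dx + x;

    // One slot per thread of this block (assumes a single-block launch).
    out[threadId] = threadId;
}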

__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    // blockIdx.x * blockDim.x is the x coordinate, within the grid, of the first thread of this block,
    // so blockIdx.x * blockDim.x + threadIdx.x is the x coordinate of the current thread within the grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Likewise, blockIdx.y * blockDim.y + threadIdx.y is the y coordinate of the current thread within the grid.
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<blocksPerGrid, threadsPerBlock>>>(A, B, C);
}

  • Per-thread local memory: each thread has local memory that only it can access;
  • Per-block shared memory: each block has shared memory accessible only by the threads of that block;
  • Global memory: accessible by all threads;
  • Constant memory and texture memory: read-only, globally accessible.

The global, constant, and texture memory spaces are persistent across kernels launched by the same application.
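
A minimal sketch of how these memory spaces appear in code (all names here are illustrative; the qualifiers themselves are described in section 4):

// Illustrative declarations only; names are hypothetical.
__device__   float globalBuf[256];    // global memory, visible to all threads
__constant__ float coeffs[16];        // constant memory, read-only on the device

__global__ void MemorySpaces(void)
{
    __shared__ float tile[256];       // shared memory, one copy per block
    float local = 0.0f;               // per-thread local variable

    int i = threadIdx.x;              // assumes blockDim.x <= 256
    tile[i]      = coeffs[i % 16] + local;
    globalBuf[i] = tile[i];
}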

The kernel (device code) executes on the GPU (the device); the rest of the program (host code) executes on the CPU (the host).

3. Programming Interface

3.1.1 One-dimensional

  • Allocate memory: cudaMalloc()
  • Free memory: cudaFree()
  • Copy memory between host and device: cudaMemcpy()

int N = 1024;
size_t size = N * sizeof(float);

// Allocate host memory
float* h_A = (float*)malloc(size);
float* h_B = (float*)malloc(size);
float* h_C = (float*)malloc(size);

// Initialize the input vectors
// ...

// Allocate device memory
float* d_A;
cudaMalloc(&d_A, size);
float* d_B;
cudaMalloc(&d_B, size);
float* d_C;
cudaMalloc(&d_C, size);

// Copy memory: host -> device
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// Launch the kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

// Fetch the result: copy memory, device -> host
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

// Free host memory
free(h_A);
free(h_B);
free(h_C);

For two- and three-dimensional arrays you can also use cudaMalloc() and cudaMemcpy(), but the functions below pad the allocation appropriately so that rows stay aligned.

3.1.2 Two-dimensional

  • Allocate memory: cudaMallocPitch()
  • Copy memory between host and device: cudaMemcpy2D() (a usage sketch follows the example below)

// Host code
int width = 64, height = 64;
float* devPtr;
size_t pitch;

cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);

MyKernel<<<100, 512>>>(devPtr, pitch, width, height);

// Device code
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    for (int r = 0; r < height; ++r) {
        // pitch is the padded row size in bytes, so rows are addressed in bytes
        float* row = (float*)((char*)devPtr + r * pitch);
        for (int c = 0; c < width; ++c) {
            float element = row[c];
        }
    }
}
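
The example above only allocates the pitched buffer. A minimal sketch of filling it from the host with cudaMemcpy2D(), assuming a hypothetical host buffer h_data of width * height floats:

// Copy a tightly packed host array into the pitched device allocation.
// The source pitch is the unpadded row size; the destination pitch comes from cudaMallocPitch().
float* h_data = (float*)malloc(width * height * sizeof(float));
cudaMemcpy2D(devPtr, pitch,
             h_data, width * sizeof(float),   // source and its pitch in bytes
             width * sizeof(float), height,   // width in bytes, height in rows
             cudaMemcpyHostToDevice);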

3.1.3 Three-dimensional

  • Allocate memory: cudaMalloc3D()
  • Copy memory between host and device: cudaMemcpy3D() (a usage sketch follows the example below)

// Host code
int width = 64, height = 64, depth = 64;
cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);

cudaPitchedPtr devPitchedPtr;
cudaMalloc3D(&devPitchedPtr, extent);

MyKernel<<<100, 512>>>(devPitchedPtr, width, height, depth);

// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, int width, int height, int depth)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * height;
    for (int z = 0; z < depth; ++z) {
        char* slice = devPtr + z * slicePitch;
        for (int y = 0; y < height; ++y) {
            float* row = (float*)(slice + y * pitch);
            for (int x = 0; x < width; ++x) {
                float element = row[x];
            }
        }
    }
}
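
Likewise, a minimal sketch of copying host data into the 3-D allocation with cudaMemcpy3D(), assuming a hypothetical host buffer h_data of width * height * depth floats:

// Fill a cudaMemcpy3DParms struct describing source, destination, and extent.
float* h_data = (float*)malloc(width * height * depth * sizeof(float));

cudaMemcpy3DParms params = {0};
params.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
params.dstPtr = devPitchedPtr;
params.extent = extent;
params.kind   = cudaMemcpyHostToDevice;
cudaMemcpy3D(&params);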

3.2 Shared Memory

Shared memory is declared with the __shared__ qualifier.

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;       // width of the matrix/sub-matrix
    int height;      // height of the matrix/sub-matrix
    int stride;      // meaningful for sub-matrices: the width of the enclosing matrix
    float* elements; // pointer to the start of the data
} Matrix;

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZE x BLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;

    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {

        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);

        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();

        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}

4. C++ Extensions

__global__

  • Executes on the device;
  • Callable from the host;
  • Marks a function as a kernel;
  • Asynchronous: the call returns before the device has finished the computation (see the sketch after this list);
  • Must return void and cannot be a class member function;
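
Because a kernel launch is asynchronous, the host usually has to wait explicitly before reading results. A minimal sketch (someKernel, d_data, h_data, size, blocksPerGrid, and threadsPerBlock are assumed to exist):

// Launch is asynchronous: control returns to the host immediately.
someKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data);

// Block the host until all previously launched device work has finished.
cudaDeviceSynchronize();

// Note: cudaMemcpy() on the default stream also waits for preceding kernels
// before copying, which is why the earlier examples work without an explicit wait.
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);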

__device__

  • Executes on the device;
  • Callable only from the device;

__host__

  • Executes on the host;
  • Callable only from the host;

__device__

  • Resides on the device;
  • Lives in global memory (if not combined with one of the qualifiers below);
  • Has the lifetime of the CUDA context;
  • Can be combined with the following qualifiers;

__constant__

  • Lives in constant memory;
  • Has the lifetime of the CUDA context;

__shared__

  • Lives in shared memory;
  • Has the lifetime of its block;
  • Accessible only by the threads of that block;

__managed__

  • Accessible from both the device and the host;

  • Has the lifetime of the application;

Built-in variables (see the sketch after this list):

  • gridDim: the dimensions of the grid, of type dim3;

  • blockDim: the dimensions of the block, of type dim3;

  • blockIdx: the index of the block within the grid, of type uint3;

  • threadIdx: the index of the thread within the block, of type uint3;
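
A minimal sketch combining __managed__ with the built-in variables (the kernel and variable names are illustrative; compile with nvcc):

#include <cstdio>

// Unified-memory variable: the same symbol is visible to host and device code.
__managed__ int counter = 0;

__global__ void CountThreads(void)
{
    // Global thread index built from the built-in variables.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i == 0)
        counter = gridDim.x * blockDim.x;   // total number of launched threads
}

int main(void)
{
    CountThreads<<<4, 64>>>();
    cudaDeviceSynchronize();                // wait before the host reads counter
    printf("threads launched: %d\n", counter);
    return 0;
}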

5. Example

Note: this example is a simplified version of one of the samples that ship with CUDA.

vectorAdd.cu:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

// C = A + B
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

int main(void) {
    cudaError_t err = cudaSuccess;

    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    // Allocate host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate device memory
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy memory: host -> device
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Fetch the result: device -> host
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify the result
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Free device memory
    err = cudaFree(d_A);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

Makefile:

CUDA_PATH ?= "/usr/local/cuda-10.0"

# Architecture

HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
TARGET_SIZE := 64

# Operating system

HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)

# Compiler

HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=

NVCCFLAGS += -g -G
BUILD_TYPE := debug

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

INCLUDES := -I../../common/inc
LIBRARIES :=

SMS ?= 30 35 37 50 52 60 61 70 75
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
HIGHEST_SM := $(lastword $(sort $(SMS)))
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)

all: build
build: vectorAdd

vectorAdd.o: vectorAdd.cu
	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAdd: vectorAdd.o
	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	./vectorAdd

clean:
	rm -f vectorAdd vectorAdd.o
	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/vectorAdd

vectorAdd$ make
"/usr/local/cuda-10.0"/bin/nvcc -ccbin g++ -I../../common/inc -m64 -g -G -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd.o -c vectorAdd.cu
"/usr/local/cuda-10.0"/bin/nvcc -ccbin g++ -m64 -g -G -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd vectorAdd.o
mkdir -p ../../bin/x86_64/linux/debug
cp vectorAdd ../../bin/x86_64/linux/debug

References

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html