CUDA 是 NVIDIA 推出的一种通用的并行计算平台和编程模型,它利用了 NVIDIA GPU 中的并行计算引擎。
// C = A + B: element-wise vector addition, one thread per element.
// Expects a 1D grid of 1D blocks covering at least numElements threads.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // Guard the tail block: the grid may contain more threads than elements.
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}
int main(void)
{
    // …
    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    // Ceil-divide so every element is covered even when numElements is
    // not a multiple of threadsPerBlock (the kernel bounds-checks).
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // …
}
| 线程块 | 线程块大小 | 线程索引 | 线程 ID |
| --- | --- | --- | --- |
| 一维 | (Dx) | (x) | x |
| 二维 | (Dx, Dy) | (x, y) | y·Dx + x |
| 三维 | (Dx, Dy, Dz) | (x, y, z) | z·Dx·Dy + y·Dx + x |
// Element-wise addition of two N x N matrices: C = A + B.
// Expects a 2D grid of 2D blocks; one thread per matrix element.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    // blockIdx.x * blockDim.x is the grid-space x coordinate of this
    // block's first thread, so adding threadIdx.x gives this thread's x.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Same reasoning for the y coordinate.
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds check: the grid may cover more threads than elements.
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}
int main()
{
    …
    dim3 threadsPerBlock(16, 16);
    // Ceil-divide in each dimension so the grid covers the whole matrix
    // even when N is not a multiple of 16 (MatAdd bounds-checks).
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    MatAdd<<<blocksPerGrid, threadsPerBlock>>>(A, B, C);
    …
}
全局内存、常量内存和纹理内存空间在由同一应用程序启动的多个 kernel 之间是持久的。
kernel (device code)在 GPU (device)上执行,程序的其他部分(host code)在 CPU (host)上执行。
int N = 1024;
size_t size = N * sizeof(float);
// Allocate host memory
float* h_A = (float*)malloc(size);
float* h_B = (float*)malloc(size);
float* h_C = (float*)malloc(size);  // result buffer (was missing, but copied into below)
// Initialize input
…
// Allocate device memory
float* d_A;
cudaMalloc(&d_A, size);
float* d_B;
cudaMalloc(&d_B, size);
float* d_C;
cudaMalloc(&d_C, size);
// Copy memory: host -> device
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// Launch the kernel
int threadsPerBlock = 256;
// Ceil-divide so all N elements are covered
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
// Fetch results: device -> host (blocking copy also waits for the kernel)
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
对于二维、三维数组来说,也可以使用 cudaMalloc() 和 cudaMemcpy(),但是使用下列函数可以对每一行进行适当填充,实现内存对齐。
// Host code
int width = 64, height = 64;
size_t pitch;          // row stride in bytes, filled in by cudaMallocPitch
float* devPtr;         // device base address of the pitched 2D allocation
// Each row is padded so that rows start on an alignment boundary;
// the x-size argument is in bytes.
cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
MyKernel<<<100, 512>>>(devPtr, pitch, width, height);
// Device code
// Walks a pitched 2D array row by row; offsets are computed in bytes
// because consecutive rows are `pitch` bytes apart, not `width` floats.
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    for (int y = 0; y < height; ++y) {
        float* rowStart = (float*)((char*)devPtr + y * pitch);
        for (int x = 0; x < width; ++x) {
            float element = rowStart[x];
        }
    }
}
// Host code
int width = 64, height = 64, depth = 64;
// Note: the extent's x dimension is given in bytes, y and z in elements.
cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);
cudaPitchedPtr devPitchedPtr;  // holds base pointer plus the chosen pitch
cudaMalloc3D(&devPitchedPtr, extent);
MyKernel<<<100, 512>>>(devPitchedPtr, width, height, depth);
// Device code
// Traverses a pitched 3D allocation slice by slice, row by row.
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, int width, int height, int depth)
{
    // .ptr is void*; CUDA sources compile as C++, so an explicit cast is required.
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;  // bytes between consecutive rows
    size_t slicePitch = pitch * height;  // bytes between consecutive z-slices
    for (int z = 0; z < depth; ++z) {
        char* slice = devPtr + z * slicePitch;
        for (int y = 0; y < height; ++y) {
            float* row = (float*)(slice + y * pitch);
            for (int x = 0; x < width; ++x) {
                float element = row[x];
            }
        }
    }
}
共享内存使用标识符 __shared__ 表示。
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width; // width of this matrix / sub-matrix
int height; // height of this matrix / sub-matrix
int stride; // meaningful for sub-matrices: the row width of the enclosing matrix
float* elements; // pointer to the start of the data
} Matrix;
// Get a matrix element
// Reads A(row, col) through the stride-aware linear offset, so it works
// both on full matrices and on sub-matrix views from GetSubMatrix.
__device__ float GetElement(const Matrix A, int row, int col)
{
    int offset = row * A.stride + col;
    return A.elements[offset];
}
// Set a matrix element
// Writes value into A(row, col) using the same stride-aware addressing
// as GetElement.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    int offset = row * A.stride + col;
    A.elements[offset] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix sub;
    sub.width = BLOCK_SIZE;
    sub.height = BLOCK_SIZE;
    // Keep the parent's row stride so element addressing still lands
    // inside the enclosing matrix.
    sub.stride = A.stride;
    sub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return sub;
}
// Thread block size
// NOTE(review): the host/kernel code below assumes every matrix dimension
// is an exact multiple of BLOCK_SIZE — confirm before using on odd sizes.
#define BLOCK_SIZE 16
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Computes C = A * B on the device.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel: one BLOCK_SIZE x BLOCK_SIZE block per tile of C
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Read C from device memory (blocking copy also waits for the kernel)
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
// Tiled algorithm: each block computes one BLOCK_SIZE x BLOCK_SIZE
// sub-matrix Csub of C, staging tiles of A and B through shared memory.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();
        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}
可以在 device 和 host 上访问;
拥有和应用程序相同的生命周期;
gridDim:grid 的维度,dim3 类型;
blockDim:block 的维度,dim3 类型;
blockIdx:block 在 grid 内的索引,uint3 类型;
threadIdx:thread 在 block 内的索引,uint3 类型;
ps:本例子是在 cuda 自带的 samples 的基础上做的简化。
vectorAdd.cu:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
// C = A + B
// One thread per element; the guard handles the tail block when
// numElements is not a multiple of the block size.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
int main(void) {
    cudaError_t err = cudaSuccess;
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    // Allocate host memory
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Initialize the input vectors with values in [0, 1]
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }
    // Allocate device memory
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy memory: host -> device
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Launch the kernel; ceil-divide so all elements are covered
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // Launch errors (bad configuration) only surface via cudaGetLastError()
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the result back: device -> host
    // (a blocking cudaMemcpy also waits for the kernel to finish)
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Verify the result against the CPU-computed sum (tolerance, not
    // bit-exact equality, since float addition order may differ)
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");
    // Free device memory
    err = cudaFree(d_A);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_B);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_C);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    printf("Done\n");
    return 0;
}
Makefile:
# Location of the CUDA toolkit
CUDA_PATH ?= "/usr/local/cuda-10.0"
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
TARGET_SIZE := 64
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# Debug build: -g = host debug info, -G = device debug info
NVCCFLAGS += -g -G
BUILD_TYPE := debug
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
INCLUDES := -I../../common/inc
LIBRARIES :=
# Generate SASS for each listed SM, plus PTX for the highest SM so the
# binary can JIT-compile on future architectures.
SMS ?= 30 35 37 50 52 60 61 70 75
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
HIGHEST_SM := $(lastword $(sort $(SMS)))
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)

all: build

build: vectorAdd

# NOTE: recipe lines below must begin with a TAB character.
vectorAdd.o: vectorAdd.cu
	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAdd: vectorAdd.o
	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	./vectorAdd

clean:
	rm -f vectorAdd vectorAdd.o
	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/vectorAdd
vectorAdd$ make
"/usr/local/cuda-10.0"/bin/nvcc -ccbin g++ -I../../common/inc -m64 -g -G -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd.o -c vectorAdd.cu
"/usr/local/cuda-10.0"/bin/nvcc -ccbin g++ -m64 -g -G -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o vectorAdd vectorAdd.o
mkdir -p ../../bin/x86_64/linux/debug
cp vectorAdd ../../bin/x86_64/linux/debug
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
手机扫一扫
移动阅读更方便
你可能感兴趣的文章