使用cublas 矩阵库函数实现矩阵相乘
阅读原文时间:2023年07月15日阅读:1

2014-08-10

cublas中执行矩阵乘法运算的函数

首先要注意的是cublas使用的是以列为主的存储方式,和c/c++中的以行为主的方式是不一样的。处理方法可参考下面的注释代码

// SOME PRECAUTIONS:
// IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
// WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSgemm(B, A)!
// The reason is explained as follows:

// CUBLAS library uses column-major storage, but C/C++ use row-major storage.
// When passing the matrix pointer to CUBLAS, the memory layout alters from
// row-major to column-major, which is equivalent to an implicit transpose.

// In the case of row-major C/C++ matrices A, B, and a simple matrix multiplication
// C = A * B, we can't use the input order like cublasSgemm(A, B) because of the
// implicit transpose. The actual result of cublasSgemm(A, B) is A(T) * B(T).
// If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
// multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
// is a column-major cublas matrix, which means C(T) in C/C++, so we need extra
// transpose code to convert it to a row-major C/C++ matrix.

// To solve the problem, let's consider our desired result C, a row-major matrix.
// In cublas format, it is C(T) actually (because of the implicit transpose).
// C = A * B, so C(T) = (A * B)(T) = B(T) * A(T). Cublas matrices B(T) and A(T)
// happen to be C/C++ matrices B and A (still because of the implicit transpose)!
// We don't need extra transpose code, we only need to alter the input order!
//
// CUBLAS provides high-performance matrix multiplication.
// See also:
// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
// in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
// Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
//
//

小例子C++中:

A矩阵:0   3   5                      B矩阵:1   1   1

0   0   4                             1   1   1

1   0   0                             1   1   1

现在要求C = A*B

C++中的结果

C矩阵:8   8   8

4   4   4

1   1   1

在cublas中:数据被解释为以列为主,等价于下面的矩阵

A矩阵:0   0   1                      B矩阵:1   1   1

3   0   0                             1   1   1

5   4   0                             1   1   1

在cublas中求C2=B*A

结果如下:C2在cublas中以列为主

惯性思维,先把结果用行为主存储好理解:

C2矩阵:8   4   1

8   4   1

8   4   1

在cublas中实际是按列存储的,内存布局按行打印结果如下:

C2矩阵:8   8   8

4   4   4

1   1   1

此时在cublas中B*A的结果与C++中A*B结果一样,使用cublas时只需改变下参数的位置即可得到想要的结果。

cublasgemm()

// cuBLAS GEMM prototypes (C = alpha * op(A) * op(B) + beta * C).
// m, n, k: dimensions of op(A) (m x k), op(B) (k x n), C (m x n);
// lda/ldb/ldc: leading dimensions of the column-major operand arrays.
cublasStatus_t cublasSgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const float *alpha,
                           const float *A, int lda,
                           const float *B, int ldb,
                           const float *beta,
                           float *C, int ldc);
cublasStatus_t cublasDgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const double *alpha,
                           const double *A, int lda,
                           const double *B, int ldb,
                           const double *beta,
                           double *C, int ldc);
cublasStatus_t cublasCgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const cuComplex *alpha,
                           const cuComplex *A, int lda,
                           const cuComplex *B, int ldb,
                           const cuComplex *beta,
                           cuComplex *C, int ldc);
cublasStatus_t cublasZgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const cuDoubleComplex *alpha,
                           const cuDoubleComplex *A, int lda,
                           const cuDoubleComplex *B, int ldb,
                           const cuDoubleComplex *beta,
                           cuDoubleComplex *C, int ldc);

参数含义可参考下面的信息:

使用cublas中cublasSgemm实现简单的矩阵相乘代码如下:

头文件:matrix.h

// SOME PRECAUTIONS:
// IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
// WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSgemm(B, A)!
// The reason is explained as follows:

// CUBLAS library uses column-major storage, but C/C++ use row-major storage.
// When passing the matrix pointer to CUBLAS, the memory layout alters from
// row-major to column-major, which is equivalent to an implicit transpose.

// In the case of row-major C/C++ matrices A, B, and a simple matrix multiplication
// C = A * B, we can't use the input order like cublasSgemm(A, B) because of the
// implicit transpose. The actual result of cublasSgemm(A, B) is A(T) * B(T).
// If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
// multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
// is a column-major cublas matrix, which means C(T) in C/C++, so we need extra
// transpose code to convert it to a row-major C/C++ matrix.

// To solve the problem, let's consider our desired result C, a row-major matrix.
// In cublas format, it is C(T) actually (because of the implicit transpose).
// C = A * B, so C(T) = (A * B)(T) = B(T) * A(T). Cublas matrices B(T) and A(T)
// happen to be C/C++ matrices B and A (still because of the implicit transpose)!
// We don't need extra transpose code, we only need to alter the input order!
//
// CUBLAS provides high-performance matrix multiplication.
// See also:
// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
// in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
// Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
//

#include <stdio.h>
#include <stdlib.h>

// CUDA runtime and cuBLAS
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Link against the CUDA runtime and cuBLAS libraries (MSVC-specific pragma).
#pragma comment (lib,"cudart")
#pragma comment (lib,"cublas")

// Convenience macro mapping a row-major (i,j) index to the column-major
// offset cuBLAS expects (leading = number of rows).
//#define IDX2C(i,j,leading) (((j)*(leading))+(i))

// Matrix dimensions: uiW* = width (columns), uiH* = height (rows)
// of A, B and the result C.
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

// Computes the row-major product h_C = h_A * h_B on CUDA device devID
// using cublasSgemm. Returns cudaSuccess on success.
cudaError_t matrixMultiply(float *h_C, const float *h_A, const float *h_B, int devID, sMatrixSize &matrix_size);

CPP文件:matrix.cpp

#include "matrix.h"

// Computes the row-major matrix product h_C = h_A * h_B on the GPU with
// cublasSgemm, and prints the result plus the GEMM time.
//
// Because cuBLAS is column-major while the host buffers are row-major, the
// operands are passed in reverse order (B first, then A): cuBLAS then
// computes B(T) * A(T) = (A * B)(T), whose column-major layout is exactly
// the row-major layout of A * B (see the explanation in matrix.h).
//
// Parameters:
//   h_C         - [out] host buffer for the uiHC x uiWC result (row-major)
//   h_A         - [in]  host buffer, uiHA x uiWA (row-major)
//   h_B         - [in]  host buffer, uiHB x uiWB (row-major)
//   devID       - CUDA device ordinal to run on
//   matrix_size - dimensions; requires uiWA == uiHB, uiWC == uiWB, uiHC == uiHA
//
// Returns cudaSuccess on success. cuBLAS failures are mapped to
// cudaErrorUnknown because the return type is cudaError_t.
cudaError_t matrixMultiply(float *h_C, const float *h_A, const float *h_B, int devID, sMatrixSize &matrix_size){
    float *dev_A = NULL;
    float *dev_B = NULL;
    float *dev_C = NULL;
    cublasHandle_t handle = NULL;   // destroyed in cleanup (was leaked before)
    cudaEvent_t start = NULL;       // timing events, destroyed in cleanup
    cudaEvent_t stop = NULL;
    cublasStatus_t ret;
    float alpha = 1.0f;             // GEMM scalars: C = alpha*A*B + beta*C
    float beta = 0.0f;
    float msecTotal = 0.0f;

    cudaDeviceProp devicePro;
    cudaError_t cudaStatus;

    // All sizes are computed before the first `goto Error` so that no
    // variable initialization is jumped over (that is ill-formed in C++).
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    cudaStatus = cudaGetDeviceProperties(&devicePro, devID);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    // Allocate device memory for matrices dev_A, dev_B and dev_C.
    cudaStatus = cudaMalloc((void**)&dev_A, mem_size_A);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc dev_A returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_B, mem_size_B);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc dev_B returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_C, mem_size_C);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc dev_C returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    // Copy the host input matrices h_A, h_B to device memory.
    cudaStatus = cudaMemcpy(dev_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy h_A to dev_A returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy h_B to dev_B returned error code %d, line(%d)\n", cudaStatus, __LINE__);
        goto Error;
    }

    // CUBLAS version 2.0
    ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
        cudaStatus = cudaErrorUnknown;  // was: returned cudaSuccess on failure
        goto Error;
    }

    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Record start event.
    cudaStatus = cudaEventRecord(start, NULL);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Row-major C = A * B. Since cuBLAS is column-major, pass the operands
    // in reverse order (B first): cuBLAS computes the uiWC x uiHC
    // column-major matrix B(T) * A(T) = C(T), whose memory layout is the
    // row-major uiHC x uiWC matrix C. m = uiWB, n = uiHA, k = uiWA.
    ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
                      &alpha,
                      dev_B, matrix_size.uiWB,
                      dev_A, matrix_size.uiWA,
                      &beta,
                      dev_C, matrix_size.uiWB);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
        cudaStatus = cudaErrorUnknown;
        goto Error;
    }

    printf("cublasSgemm done.\n");

    // Record stop event.
    cudaStatus = cudaEventRecord(stop, NULL);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the stop event to complete.
    cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    printf("cublasSgemm time: %.3f ms\n", msecTotal);  // was computed but never reported

    // Copy result from device straight into the caller's buffer: thanks to
    // the reversed GEMM call it is already in row-major order, so no
    // transpose pass is needed (the original never filled h_C at all).
    cudaStatus = cudaMemcpy(h_C, dev_C, mem_size_C, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy dev_C to h_C error code %d, line(%d)!\n", cudaStatus, __LINE__);
        goto Error;
    }

    // Print the result as uiHC rows of uiWC columns
    // (the original loop swapped the roles of uiWC and uiHC).
    for (unsigned int i = 0; i < matrix_size.uiHC; i++) {
        for (unsigned int j = 0; j < matrix_size.uiWC; j++) {
            printf("%f    ", h_C[i * matrix_size.uiWC + j]);
        }
        printf("\n");
    }

Error:
    // Shared cleanup for both the success and failure paths.
    if (stop)   cudaEventDestroy(stop);
    if (start)  cudaEventDestroy(start);
    if (handle) cublasDestroy(handle);
    cudaFree(dev_A);   // cudaFree(NULL) is a no-op, so unconditional frees are safe
    cudaFree(dev_B);
    cudaFree(dev_C);
    dev_A = NULL;
    dev_B = NULL;
    dev_C = NULL;
    return cudaStatus;
}

// Stub: placeholder for edge-reduction logic. Currently a no-op that
// reports success. (The unused `Error:` label from the original was
// removed — it only triggered -Wunused-label.)
cudaError_t reduceEdge(){
    cudaError_t cudaStatus = cudaSuccess;
    return cudaStatus;
}