cudaThreadSynchronize()
阅读原文时间:2023年07月15日阅读:1

// 调用CUDA kernel 是非阻塞的,调用kernel语句后面的语句不等待kernel执行完,立即执行。所以在 call_kernel(see kernel.cu) 中执行 m5op.dump 是错误的!!!

// REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime

// cudaThreadSynchronize() 暂停调用者的执行,直到前面的 stream operation 执行完毕。

// REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize

// C++ thread join 问题,在 kernel.cpp 中也有 join,那么是在 kernel.cpp 中 dump 还是在main.cpp中join后面dump?

// REF: http://en.cppreference.com/w/cpp/algorithm/for_each

// 若 GPU 先执行完毕,在 main.cpp 中join后 dump 似乎合理; 若 CPU 先执行完毕,岂不是要阻塞在 cudaThreadSynchronize 处?

// 暂且在 kernel.cpp 中 dump!

kernel.cpp

// CPU threads--------------------------------------------------------------------------------------
void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad, int n_threads, int ldim, int n_tasks, float alpha
#ifdef CUDA_8_0
, std::atomic_int *worklist
#endif
) {
std::cout<<"run_cpu_threads start."<<std::endl;

const int                REGS\_CPU = REGS \* ldim;  
std::vector<std::thread> cpu\_threads;  
for(int i = ; i < n\_threads; i++) {

    cpu\_threads.push\_back(std::thread(\[=\]() {

#ifdef CUDA_8_0
Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
#else
Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
#endif

        const int matrix\_size       = m \* (n + pad);  
        const int matrix\_size\_align = (matrix\_size + ldim \* REGS - ) / (ldim \* REGS) \* (ldim \* REGS);

        for(int my\_s = cpu\_first(&p); cpu\_more(&p); my\_s = cpu\_next(&p)) {

            // Declare on-chip memory  
            T   reg\[REGS\_CPU\];  
            int pos      = matrix\_size\_align -  - (my\_s \* REGS\_CPU);  
            int my\_s\_row = pos / (n + pad);  
            int my\_x     = pos % (n + pad);  
            int pos2     = my\_s\_row \* n + my\_x;  

// Load in on-chip memory
#pragma unroll
for(int j = ; j < REGS_CPU; j++) { if(pos2 >= && my_x < n && pos2 < matrix_size)
reg[j] = matrix[pos2];
else
reg[j] = ;
pos--;
my_s_row = pos / (n + pad);
my_x = pos % (n + pad);
pos2 = my_s_row * n + my_x;
}

            // Set global synch  
            while((&flags\[my\_s\])->load() == ) {  
            }  
            (&flags\[my\_s + \])->fetch\_add();

            // Store to global memory  
            pos = matrix\_size\_align -  - (my\_s \* REGS\_CPU);  

#pragma unroll
for(int j = ; j < REGS_CPU; j++) { if(pos >= && pos < matrix_size)
matrix_out[pos] = reg[j];
pos--;
}
}
}));
}
std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
std::cout<<"dump.. after run_cpu_threads end."<<std::endl;
m5_dump_stats(,);
}

kernel.cu

// Host wrapper that launches Padding_kernel with `blocks` x `threads` and
// returns the launch status from cudaGetLastError().
// NOTE(review): the launch is asynchronous — the dump below runs as soon as the
// kernel is *queued*, not when it finishes (this is exactly the pitfall the
// article discusses). A cudaDeviceSynchronize() before the dump would be needed
// to account the kernel's execution; kept as-is to preserve behavior.
cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks, float alpha,
    T *matrix_out, T *matrix, int *flags
#ifdef CUDA_8_0
    , int l_mem_size, int *worklist
#endif
    ){
    std::cout<<"call_pad start."<<std::endl;
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock
#ifdef CUDA_8_0
        , l_mem_size
#endif
        >>>(n, m, pad, n_tasks, alpha,
        matrix_out, matrix, flags
#ifdef CUDA_8_0
        , worklist
#endif
        );
    // Catches launch-configuration errors only; in-kernel faults surface at the
    // next synchronizing call.
    cudaError_t err = cudaGetLastError();
    std::cout<<"dump.. after call_pad end."<<std::endl;
    m5_dump_stats(0, 0);
    return err;
}

main.cpp

// Main benchmark loop: one iteration per warmup/measured repetition.
// Each iteration resets the histogram, launches the GPU kernel (async),
// launches the CPU worker thread, then synchronizes both sides.
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

    // Reset

#ifdef CUDA_8_0
    // Unified/atomic histogram: zero each bin in place.
    for(int i = 0; i < p.n_bins; i++) {
        h_histo[i].store(0);
    }
#else
    // Separate host/device histograms: zero on host, copy to device.
    memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
    cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaThreadSynchronize();
    CUDA_ERR();
#endif

    std::cout<<"m5 work begin."<<std::endl;

    // Launch GPU threads
    // Kernel launch (asynchronous: returns as soon as the kernel is queued)
    if(p.n_gpu_blocks > 0) {
        std::cout<<"launch gpu."<<std::endl;
        cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins,
            d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
        CUDA_ERR();
    }

    // Launch CPU threads (runs concurrently with the GPU kernel)
    std::cout<<"launch cpu."<<std::endl;
    std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
        p.n_gpu_threads, n_cpu_bins);
    std::cout<<"cuda sync."<<std::endl;

    // Block until the queued GPU work finishes.
    // NOTE(review): cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize()
    // is the modern equivalent.
    cudaThreadSynchronize();
    std::cout<<"cpu join after cuda sync."<<std::endl;
    main_thread.join();

    //m5_work_end(0, 0);
    std::cout<<"m5 work end."<<std::endl;
}