其他:
CUDA内存(一) 寄存器
共享内存实际上是可受用户控制的一级缓存. [1]
只有当数据重复利用, 全局内存合并, 或者线程之间有共享数据时, 使用共享内存才合适.
SortArray.h
#pragma once
#include "Global.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#define MAX_NUM_LISTS 32
#define NUM_ELEM 4096
//************************************
// Method: cpu_sort
// Brief: CPU reference radix sort (LSD, one bit per pass); sorts the
//        array in place. Used to validate/time the GPU version.
// Access: public
// Returns: void
// Qualifier:
// Param(I/O): u32 * h_puData -- array to sort, modified in place
// Param(I): u32 uArrayLen -- number of elements
//************************************
__host__ void cpu_sort(u32* h_puData, u32 uArrayLen);
//************************************
// Method: find_min
// Brief: Find the minimum among the head elements of num_lists sorted,
//        interleaved lists and consume it (advances that list's cursor).
// Access: public
// Returns: u32 -- the minimum head value (0xFFFFFFFF if all exhausted)
// Qualifier:
// Param(I): const u32 * const src_array -- interleaved sorted lists
// Param(I/O): u32 * const list_indexes -- per-list read cursors
// Param(I): const u32 num_lists
// Param(I): const u32 num_elements_per_list
//************************************
u32 find_min(const u32 * const src_array,
u32 * const list_indexes,
const u32 num_lists,
const u32 num_elements_per_list);
//************************************
// Method: merge_array
// Brief: Merge num_lists sorted interleaved lists into one sorted array.
// Access: public
// Returns: void
// Qualifier:
// Param(I): const u32 * const src_array
// Param(O): u32 * const dest_array
// Param(I): const u32 num_lists -- must be <= MAX_NUM_LISTS
// Param(I): const u32 num_elements
//************************************
void merge_array(const u32 * const src_array,
u32 * const dest_array,
const u32 num_lists,
const u32 num_elements);
//************************************
// Method: gpu_sort_array
// Brief: Host wrapper that launches the GPU radix-sort kernel.
// Access: public
// Returns: void
// Qualifier:
// Param(I/O): u32 * const data -- device pointer, sorted in place
// Param(I): const u32 num_lists
// Param(I): const u32 num_elements
//************************************
void gpu_sort_array(u32 * const data,
const u32 num_lists,
const u32 num_elements);
SortArray.cu
#include "SortArray.h"
#include <assert.h>
#include <stdio.h>
// CPU reference implementation: LSD radix sort, one bit per pass.
// Sorts puData in place; a single scratch buffer holds the "one" bucket
// while the "zero" bucket is compacted at the front of puData.
//
// Fixes vs. the original: the buffers were `static` yet freed on exit,
// so any second call freed/used dangling pointers (double free / UAF);
// puCpuTemp0 was allocated but never used; malloc was unchecked; and
// (1 << 31) on a signed int is undefined behaviour (now 1u << bit).
__host__ void cpu_sort(u32* puData, u32 uArrayLen)
{
    // Scratch for elements whose current bit is 1.
    u32* puOnes = (u32*)malloc(uArrayLen * sizeof(u32));
    if (puOnes == NULL)
    {
        return; // allocation failed; leave the input untouched
    }
    for (u32 bit = 0; bit < sizeof(u32) * 8; bit++)
    {
        const u32 bit_mask = (1u << bit); // unsigned: safe at bit 31
        u32 uZeroCnt = 0;
        u32 uOneCnt = 0;
        for (u32 i = 0; i < uArrayLen; i++)
        {
            const u32 d = puData[i];
            if ((d & bit_mask) != 0)
            {
                puOnes[uOneCnt++] = d;
            }
            else
            {
                // In-place compaction is safe: uZeroCnt <= i always,
                // so we only overwrite slots already read.
                puData[uZeroCnt++] = d;
            }
        }
        // Append the "one" bucket after the compacted zeros; both
        // buckets keep their order, so each pass is stable.
        for (u32 i = 0; i < uOneCnt; i++)
        {
            puData[uZeroCnt + i] = puOnes[i];
        }
    }
    free((void*)puOnes);
}
//************************************
// Method: radix_sort
// Brief: GPU radix sort (LSD, one bit per pass). Each thread sorts its
//        own interleaved list: thread tid owns elements tid,
//        tid+num_lists, tid+2*num_lists, ... Uses two scratch buffers
//        (0-bucket and 1-bucket) and copies back after each pass.
// Access: public
// Returns: __device__ void
// Qualifier:
// Param(I/O): u32 * const sort_tmp -- data, sorted per-list in place
// Param(I): const u32 num_lists -- number of interleaved lists
// Param(I): const u32 num_elements -- total element count
// Param(I): const u32 tid -- this thread's list index
// Param(I/O): u32 * const sort_tmp_0 -- scratch for the 0-bucket
// Param(I/O): u32 * const sort_tmp_1 -- scratch for the 1-bucket
//************************************
__device__ void radix_sort(u32 * const sort_tmp,
                           const u32 num_lists,
                           const u32 num_elements,
                           const u32 tid,
                           u32 * const sort_tmp_0,
                           u32 * const sort_tmp_1)
{
    // Sort into num_lists independent lists
    // Apply radix sort on all 32 bits of the data
    for (u32 bit=0; bit < 32; bit++)
    {
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;
        for (u32 i=0; i < num_elements; i+=num_lists)
        {
            const u32 elem = sort_tmp[i+tid];
            const u32 bit_mask =(1 << bit);
            // Partition this thread's stripe by the current bit
            if ((elem & bit_mask) > 0)
            {
                sort_tmp_1[base_cnt_1+tid] = elem;
                base_cnt_1+=num_lists;
            }
            else
            {
                sort_tmp_0[base_cnt_0+tid] = elem;
                base_cnt_0+=num_lists;
            }
        }
        // Copy data back to source - first the zero list
        for (u32 i=0; i < base_cnt_0; i+=num_lists)
        {
            sort_tmp[i+tid] = sort_tmp_0[i+tid];
        }
        // Copy data back to source - then the one list, appended after
        // the zeros; order within buckets is kept, so the pass is stable
        for (u32 i=0; i < base_cnt_1; i+=num_lists)
        {
            sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
        }
    }
    // Barrier so every per-thread list is fully sorted before the merge
    // stage reads other threads' stripes
    __syncthreads();
}
// Scan the head element of each of the num_lists interleaved lists and
// return the smallest one, consuming it (the winning list's cursor is
// advanced). Lists that are already exhausted are skipped. If every
// list is exhausted, returns 0xFFFFFFFF and advances list 0's cursor.
u32 find_min(const u32 * const src_array,
    u32 * const list_indexes,
    const u32 num_lists,
    const u32 num_elements_per_list)
{
    u32 best_val = 0xFFFFFFFF;
    u32 best_list = 0;
    for (u32 list = 0; list < num_lists; list++)
    {
        const u32 cursor = list_indexes[list];
        // Skip lists that have already been fully consumed
        if (cursor >= num_elements_per_list)
        {
            continue;
        }
        // Element k of list L lives at linear index L + k*num_lists
        const u32 candidate = src_array[list + (cursor * num_lists)];
        // <= keeps the original tie-breaking: the last tying list wins
        if (candidate <= best_val)
        {
            best_val = candidate;
            best_list = list;
        }
    }
    // Consume the winning element
    list_indexes[best_list]++;
    return best_val;
}
// Merge num_lists sorted interleaved lists from src_array into a single
// fully sorted dest_array by repeatedly extracting the global minimum.
// Precondition: num_lists <= MAX_NUM_LISTS (the cursor array below is
// fixed-size); the original wrote past it unchecked, silently smashing
// the stack for larger num_lists -- now asserted.
void merge_array(const u32 * const src_array,
    u32 * const dest_array,
    const u32 num_lists,
    const u32 num_elements)
{
    const u32 num_elements_per_list = (num_elements / num_lists);
    // Guard the fixed-size per-list cursor array against overflow
    assert(num_lists <= MAX_NUM_LISTS);
    u32 list_indexes[MAX_NUM_LISTS];
    for (u32 list = 0; list < num_lists; list++)
    {
        list_indexes[list] = 0;
    }
    // Each find_min call consumes exactly one element, so num_elements
    // iterations drain every list exactly once
    for (u32 i = 0; i < num_elements; i++)
    {
        dest_array[i] = find_min(src_array,
                                 list_indexes,
                                 num_lists,
                                 num_elements_per_list);
    }
}
//************************************
// Method: radix_sort2
// Brief: GPU radix sort, optimized variant: the 0-bucket is compacted
//        in place inside sort_tmp, so only one scratch buffer (for the
//        1-bucket) is needed and the 0-bucket copy-back disappears.
// Access: public
// Returns: __device__ void
// Qualifier:
// Param(I/O): u32 * const sort_tmp -- data, sorted per-list in place
// Param(I): const u32 num_lists -- number of interleaved lists
// Param(I): const u32 num_elements -- total element count
// Param(I): const u32 tid -- this thread's list index
// Param(I/O): u32 * const sort_tmp_1 -- scratch for the 1-bucket
//************************************
__device__ void radix_sort2(u32 * const sort_tmp,
                            const u32 num_lists,
                            const u32 num_elements,
                            const u32 tid,
                            u32 * const sort_tmp_1)
{
    for (u32 bit = 0; bit < 32;bit++)
    {
        // Mask is loop-invariant per pass (hoisted, unlike radix_sort)
        const u32 bit_mask = (1 << bit);
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;
        for (u32 i = 0; i < num_elements;i+=num_lists)
        {
            const u32 elem = sort_tmp[i + tid];
            if ((elem&bit_mask) > 0)
            {
                sort_tmp_1[base_cnt_1 + tid] = elem;
                base_cnt_1 += num_lists;
            }
            else
            {
                // In-place write is safe: base_cnt_0 <= i always, so
                // only already-consumed slots are overwritten
                sort_tmp[base_cnt_0 + tid] = elem;
                base_cnt_0 += num_lists;
            }
        }
        // copy back: append the 1-bucket after the compacted 0-bucket
        for (u32 i = 0; i < base_cnt_1;i+=num_lists)
        {
            sort_tmp[base_cnt_0 + i + tid] = sort_tmp_1[i + tid];
        }
        // NOTE(review): no __syncthreads() here -- safe only because each
        // thread touches its own interleaved stripe; the caller barriers
        // before any cross-thread reads. Confirm if reused elsewhere.
    }
}
//************************************
// Method: copy_data_to_shared
// Brief: Stage the input from global memory into the shared-memory
//        buffer. Thread tid copies the interleaved stripe tid,
//        tid+num_lists, tid+2*num_lists, ...
// Access: public
// Returns: __device__ void
// Qualifier:
// Param(I): const u32 * const data -- source in global memory
// Param(O): u32 * const sort_tmp -- destination in shared memory
// Param(I): const u32 num_lists -- stride between a thread's elements
// Param(I): const u32 num_elements -- total element count
// Param(I): const u32 tid -- this thread's stripe offset
//************************************
__device__ void copy_data_to_shared(const u32 * const data,
                                    u32 * const sort_tmp,
                                    const u32 num_lists,
                                    const u32 num_elements,
                                    const u32 tid)
{
    // One strided pass: each iteration moves one element of this
    // thread's stripe
    for (u32 offset = 0; offset < num_elements; offset += num_lists)
    {
        const u32 idx = offset + tid;
        sort_tmp[idx] = data[idx];
    }
    // Every stripe must be staged before any thread starts sorting
    __syncthreads();
}
//************************************
// Method: merge_array6
// Brief: Parallel merge: num_lists threads cooperatively merge the
//        num_lists sorted interleaved sub-lists into dest_array, one
//        output element per iteration, using shared-memory atomics.
// Access: public
// Returns: __device__ void
// Qualifier:
// Param(I): const u32 * const src_array -- sorted interleaved lists
// Param(O): u32 * const dest_array -- fully sorted output
// Param(I): const u32 num_lists
// Param(I): const u32 num_elements
// Param(I): const u32 tid -- this thread's list index
//************************************
__device__ void merge_array6(const u32 * const src_array,
                             u32 * const dest_array,
                             const u32 num_lists,
                             const u32 num_elements,
                             const u32 tid)
{
    const u32 num_elements_per_list =(num_elements / num_lists);
    // One read cursor per list, shared by the whole block.
    // NOTE(review): assumes num_lists <= MAX_NUM_LISTS and one thread
    // per list (tid < num_lists) -- confirm at the call site.
    __shared__ u32 list_indexes[MAX_NUM_LISTS];
    list_indexes[tid] = 0;
    // Wait for every thread to clear its cursor
    __syncthreads();
    // Produce one output element per iteration
    for (u32 i=0; i < num_elements; i++)
    {
        // Values shared with the other threads in the block
        __shared__ u32 min_val;
        __shared__ u32 min_tid;
        // Per-thread register holding this thread's candidate
        u32 data;
        // If this thread's list is not yet exhausted, read its head
        // element; otherwise contribute a sentinel that cannot win
        if (list_indexes[tid] < num_elements_per_list)
        {
            // Map (list, cursor) to the interleaved linear index
            const u32 src_idx = tid +(list_indexes[tid] * num_lists);
            // Read the head element of this thread's list
            data = src_array[src_idx];
        }
        else
        {
            data = 0xFFFFFFFF;
        }
        // Have thread zero reset the shared minimums
        if (tid == 0)
        {
            // Largest possible values so any real candidate wins
            min_val = 0xFFFFFFFF;
            min_tid = 0xFFFFFFFF;
        }
        // Wait until the reset is visible to all threads
        __syncthreads();
        // Every thread races to lower min_val; only the smallest
        // candidate in the block survives
        atomicMin(&min_val, data);
        // Make sure all threads have taken their turn
        __syncthreads();
        // If this thread held the minimum value
        if (min_val == data)
        {
            // Several threads may tie on the value; the lowest tid
            // wins the right to write
            atomicMin(&min_tid, tid);
        }
        // Make sure all threads have taken their turn
        __syncthreads();
        // The single winning thread commits the result
        if (tid == min_tid)
        {
            // Increment the list cursor for this thread
            list_indexes[tid]++;
            // Store the winning value
            dest_array[i] = data;
        }
    }
}
//************************************
// Method: gpu_sort_array_array
// Brief: GPU radix-sort kernel: stage data into shared memory, sort
//        each thread's interleaved list, then merge back into global
//        memory.
// Access: public
// Returns: __global__ void
// Qualifier:
// Param(I/O): u32 * const data -- device array, sorted in place
// Param(I): const u32 num_lists
// Param(I): const u32 num_elements -- must be <= NUM_ELEM
//************************************
__global__ void gpu_sort_array_array(
    u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    const u32 tid =(blockIdx.x * blockDim.x) + threadIdx.x;
    // Two NUM_ELEM-word staging buffers: 2 * 4096 * sizeof(u32) = 32 KB
    // of shared memory per block -- at or near the per-block limit on
    // many devices, which is why larger NUM_ELEM fails to build/launch
    __shared__ u32 sort_tmp[NUM_ELEM];
    __shared__ u32 sort_tmp_1[NUM_ELEM];
    // NOTE(review): correct only for a single-block launch with
    // blockDim.x == num_lists and num_elements a multiple of num_lists
    // -- the buffers and the merge are block-local; confirm at launch
    copy_data_to_shared(data, sort_tmp, num_lists,
                        num_elements, tid);
    radix_sort2(sort_tmp, num_lists, num_elements,
                tid, sort_tmp_1);
    merge_array6(sort_tmp, data, num_lists,
                 num_elements, tid);
}
// Host wrapper: launch the sort kernel on `data` (a device pointer).
// The original launched and returned with no error checking -- a kernel
// launch is asynchronous and reports nothing by itself, so failures were
// silent. We now surface launch-configuration errors via
// cudaGetLastError() and in-kernel faults via cudaDeviceSynchronize().
// NOTE(review): the launch hard-codes 32 threads; the kernel assumes
// blockDim.x == num_lists, so callers should pass num_lists == 32.
void gpu_sort_array(u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    gpu_sort_array_array<<<1, 32>>>(data, num_lists, num_elements);
    // Catch bad launch configuration (too much shared memory, etc.)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "gpu_sort_array: launch failed: %s\n",
                cudaGetErrorString(err));
        return;
    }
    // Block until the kernel finishes so execution faults surface here
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "gpu_sort_array: kernel failed: %s\n",
                cudaGetErrorString(err));
    }
}
GPU效率还不如CPU.
而且, 序列长度过大时, nvcc编译会报错: 内核中两个 NUM_ELEM 大小的 __shared__ 数组会超出每个线程块的共享内存上限.
可以继续优化!
参考文献
[1] Shane Cook. CUDA Programming: A developer’s guide to parallel computing with GPUs.
手机扫一扫
移动阅读更方便
你可能感兴趣的文章