CUDA Memory (II): Shared Memory

See also:
CUDA Memory (I): Registers


Shared Memory

Shared memory is effectively a user-controllable L1 cache. [1]
It is only worth using when data is reused, when it helps coalesce global-memory accesses, or when threads need to share data with each other.
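As a minimal sketch of that last case (not part of the sorting example below; the kernel name and the fixed block size of 256 are assumptions made here), the kernel stages one element per thread in shared memory, synchronizes, and then reverses the block, so every thread reads a value that a different thread wrote:

__global__ void reverse_in_block(unsigned int * const data)
{
    __shared__ unsigned int tile[256];      // one element per thread of the block
    const unsigned int tid = threadIdx.x;
    const unsigned int idx = blockIdx.x * blockDim.x + tid;

    tile[tid] = data[idx];                  // coalesced load from global memory
    __syncthreads();                        // all writes visible before any cross-thread read

    data[idx] = tile[blockDim.x - 1 - tid]; // read an element written by another thread
}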

Sorting with shared memory:

SortArray.h

#pragma once

#include "Global.h"
#include "device_launch_parameters.h"
#include <stdlib.h>

#define MAX_NUM_LISTS 32
#define NUM_ELEM 4096
//************************************
// Method:     cpu_sort
// Brief:       Radix sort on the CPU
// Access:     public 
// Returns:    void
// Qualifier: 
// Param(I/O): u32 * h_puData
// Param(I/O): u32 uArrayLen
//************************************
__host__ void cpu_sort(u32* h_puData, u32 uArrayLen);

//************************************
// Method:     find_min
// Brief:       Find the minimum value across the num_lists lists.
// Access:     public 
// Returns:    u32
// Qualifier: 
// Param(I/O): const u32 * const src_array
// Param(I/O): u32 * const list_indexes
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements_per_list
//************************************
u32 find_min(const u32 * const src_array,
    u32 * const list_indexes,
    const u32 num_lists,
    const u32 num_elements_per_list);

//************************************
// Method:     merge_array
// Brief:       Merge the num_lists sorted lists into one.
// Access:     public 
// Returns:    void
// Qualifier: 
// Param(I/O): const u32 * const src_array
// Param(I/O): u32 * const dest_array
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
//************************************
void merge_array(const u32 * const src_array,
    u32 * const dest_array,
    const u32 num_lists,
    const u32 num_elements);

//************************************
// Method:     gpu_sort_array
// Brief:       Host-side wrapper for the GPU radix sort
// Access:     public 
// Returns:    void
// Qualifier: 
// Param(I/O): u32 * const data
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
//************************************
void gpu_sort_array(u32 * const data,
    const u32 num_lists,
    const u32 num_elements);

SortArray.cu

#include "SortArray.h"
#include <stdio.h>

__host__ void cpu_sort(u32* puData, u32 uArrayLen)
{
    // Scratch buffer for the one-bit bucket; the zero-bit bucket is written
    // back into puData in place, so a second scratch buffer is not needed.
    u32* puCpuTemp1 = (u32*)malloc(uArrayLen*sizeof(u32));

    for (u32 bit = 0; bit < sizeof(u32)*8; bit++)
    {
        u32 uBaseCnt0 = 0;
        u32 uBaseCnt1 = 0;
        u32 bit_mask = (1u << bit);
        for (u32 i = 0; i < uArrayLen; i++)
        {
            u32 d = puData[i];
            if ((d&bit_mask) > 0)
            {
                puCpuTemp1[uBaseCnt1] = d;
                uBaseCnt1++;
            }
            else
            {
                puData[uBaseCnt0] = d;
                uBaseCnt0++;
            }
        }
        // Copy data back to source
        for (u32 i = 0; i < uBaseCnt1; i++)
        {
            puData[uBaseCnt0 + i] = puCpuTemp1[i];
        }
    }
    // Free the temporary buffer.
    free((void*)puCpuTemp1);
}

//************************************
// Method:     radix_sort
// Brief:       Radix sort on the GPU, one interleaved list per thread
// Access:     public 
// Returns:    __device__ void
// Qualifier: 
// Param(I/O): u32 * const sort_tmp
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
// Param(I/O): const u32 tid
// Param(I/O): u32 * const sort_tmp_0
// Param(I/O): u32 * const sort_tmp_1
//************************************
__device__ void radix_sort(u32 * const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid,
    u32 * const sort_tmp_0,
    u32 * const sort_tmp_1)
{
    // Sort into num_lists interleaved lists (thread tid owns elements tid, tid+num_lists, ...)
    // Apply radix sort on all 32 bits of the data
    for (u32 bit=0; bit < 32; bit++)
    {
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;
        for (u32 i=0; i < num_elements; i+=num_lists)
        {
            const u32 elem = sort_tmp[i+tid];
            const u32 bit_mask = (1u << bit);
            if ((elem & bit_mask) > 0)
            {
                sort_tmp_1[base_cnt_1+tid] = elem;
                base_cnt_1+=num_lists;
            }
            else
            {
                sort_tmp_0[base_cnt_0+tid] = elem;
                base_cnt_0+=num_lists;
            }
        }
        // Copy data back to source - first the zero list
        for (u32 i=0; i < base_cnt_0; i+=num_lists)
        {
            sort_tmp[i+tid] = sort_tmp_0[i+tid];
        }
        // Copy data back to source - then the one list
        for (u32 i=0; i < base_cnt_1; i+=num_lists)
        {
            sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
        }
    }
    __syncthreads();
}
u32 find_min(const u32 * const src_array,
    u32 * const list_indexes,
    const u32 num_lists,
    const u32 num_elements_per_list)
{
    u32 min_val = 0xFFFFFFFF;
    u32 min_idx = 0;
    // Iterate over each of the lists
    for (u32 i=0; i < num_lists; i++)
    {
        // If the current list has already been emptied
        // then ignore it
        if (list_indexes[i] < num_elements_per_list)
        {
            const u32 src_idx = i +(list_indexes[i] * num_lists);
            const u32 data = src_array[src_idx];
            if (data <= min_val)
            {
                min_val = data;
                min_idx = i;
            }
        }
    }
    list_indexes[min_idx]++;
    return min_val;
}

void merge_array(const u32 * const src_array,
    u32 * const dest_array,
    const u32 num_lists,
    const u32 num_elements)
{
    const u32 num_elements_per_list =(num_elements / num_lists);
    u32 list_indexes[MAX_NUM_LISTS];
    for (u32 list=0; list < num_lists; list++)
    {
        list_indexes[list] = 0;
    }
    for (u32 i=0; i < num_elements; i++)
    {
        dest_array[i] = find_min(src_array,
            list_indexes,
            num_lists,
            num_elements_per_list);
    }
}

//************************************
// Method:     radix_sort2
// Brief:       Radix sort on the GPU, optimization 2: the zero bucket stays in place, so only one temporary buffer is needed
// Access:     public 
// Returns:    __device__ void
// Qualifier: 
// Param(I/O): u32 * const sort_tmp
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
// Param(I/O): const u32 tid
// Param(I/O): u32 * const sort_tmp_1
//************************************
__device__ void radix_sort2(u32 * const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid,
    u32 * const sort_tmp_1)
{
    for (u32 bit = 0; bit < 32;bit++)
    {
        const u32 bit_mask = (1u << bit);
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;
        for (u32 i = 0; i < num_elements;i+=num_lists)
        {
            const u32 elem = sort_tmp[i + tid];
            if ((elem&bit_mask) > 0)
            {
                sort_tmp_1[base_cnt_1 + tid] = elem;
                base_cnt_1 += num_lists;
            }
            else
            {
                sort_tmp[base_cnt_0 + tid] = elem;
                base_cnt_0 += num_lists;
            }
        }
        // Copy the one-bit bucket back after the zeros, which stayed in place
        for (u32 i = 0; i < base_cnt_1;i+=num_lists)
        {
            sort_tmp[base_cnt_0 + i + tid] = sort_tmp_1[i + tid];
        }
    }
}

//************************************
// Method:     copy_data_to_shared
// Brief:       Copy data from global memory into shared memory
// Access:     public 
// Returns:    __device__ void
// Qualifier: 
// Param(I/O): const u32 * const data
// Param(I/O): u32 * const sort_tmp
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
// Param(I/O): const u32 tid
//************************************
__device__ void copy_data_to_shared(const u32 * const data,
    u32 * const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid)
{
    // Copy data into temp store
    for (u32 i=0; i < num_elements; i+=num_lists)
    {
        sort_tmp[i+tid] = data[i+tid];
    }
    __syncthreads();
}


//************************************
// Method:     merge_array6
// Brief:       Merge the per-thread lists cooperatively, using all threads
// Access:     public 
// Returns:    __device__ void
// Qualifier: 
// Param(I/O): const u32 * const src_array
// Param(I/O): u32 * const dest_array
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
// Param(I/O): const u32 tid
//************************************
__device__ void merge_array6(const u32 * const src_array,
    u32 * const dest_array,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid)
{
    const u32 num_elements_per_list =(num_elements / num_lists);
    __shared__ u32 list_indexes[MAX_NUM_LISTS];
    list_indexes[tid] = 0;
    // Wait for list_indexes[tid] to be cleared
    __syncthreads();
    // Iterate over all elements
    for (u32 i=0; i < num_elements; i++)
    {
        // Create a value shared with the other threads
        __shared__ u32 min_val;
        __shared__ u32 min_tid;
        // Use a temp register for work purposes
        u32 data;
        // If the current list has not already been
        // emptied then read from it, else ignore it
        if (list_indexes[tid] < num_elements_per_list)
        {
            // Work out from the list_index, the index into
            // the linear array
            const u32 src_idx = tid +(list_indexes[tid] * num_lists);
            // Read the data from the list for the given
            // thread
            data = src_array[src_idx];
        }
        else
        {
            data = 0xFFFFFFFF;
        }
        // Have thread zero clear the min values
        if (tid == 0)
        {
            // Initialize to the largest possible value so that
            // any thread's data can win the atomicMin below
            min_val = 0xFFFFFFFF;
            min_tid = 0xFFFFFFFF;
        }
        // Wait for all threads
        __syncthreads();
        // Have every thread try to store its value into
        // min_val. Only the thread with the lowest value
        // will win
        atomicMin(&min_val, data);
        // Make sure all threads have taken their turn.
        __syncthreads();
        // If this thread was the one with the minimum
        if (min_val == data)
        {
            // Check for equal values
            // Lowest tid wins and does the write
            atomicMin(&min_tid, tid);
        }
        // Make sure all threads have taken their turn.
        __syncthreads();
        // If this thread has the lowest tid
        if (tid == min_tid)
        {
            // Increment the list pointer for this thread
            list_indexes[tid]++;
            // Store the winning value
            dest_array[i] = data;
        }
    }
}
//************************************
// Method:     gpu_sort_array_array
// Brief:       Kernel: GPU radix sort followed by a cooperative merge
// Access:     public 
// Returns:    __global__ void
// Qualifier: 
// Param(I/O): u32 * const data
// Param(I/O): const u32 num_lists
// Param(I/O): const u32 num_elements
//************************************
__global__ void gpu_sort_array_array(
    u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    const u32 tid =(blockIdx.x * blockDim.x) + threadIdx.x;
    __shared__ u32 sort_tmp[NUM_ELEM];
    __shared__ u32 sort_tmp_1[NUM_ELEM];
    copy_data_to_shared(data, sort_tmp, num_lists,
        num_elements, tid);
    radix_sort2(sort_tmp, num_lists, num_elements,
        tid, sort_tmp_1);
    merge_array6(sort_tmp, data, num_lists,
        num_elements, tid);
}

void gpu_sort_array(u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    gpu_sort_array_array<<<1, 32>>>(data, num_lists, num_elements);
}
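For completeness, a host-side driver could look roughly like the sketch below. This is an assumption rather than code from the original post; it presumes that u32 is a typedef for unsigned int in "Global.h" and omits error checking:

#include "SortArray.h"
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void)
{
    u32* h_data = (u32*)malloc(NUM_ELEM * sizeof(u32));
    for (u32 i = 0; i < NUM_ELEM; i++)
    {
        h_data[i] = (u32)rand();            // fill with unsorted test data
    }

    u32* d_data = NULL;
    cudaMalloc((void**)&d_data, NUM_ELEM * sizeof(u32));
    cudaMemcpy(d_data, h_data, NUM_ELEM * sizeof(u32), cudaMemcpyHostToDevice);

    // num_lists = 32 must match the 32-thread launch inside gpu_sort_array.
    gpu_sort_array(d_data, 32, NUM_ELEM);
    cudaDeviceSynchronize();

    cudaMemcpy(h_data, d_data, NUM_ELEM * sizeof(u32), cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    free(h_data);
    return 0;
}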

Test results

On this test the GPU version is actually slower than the CPU version.
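One way such a comparison can be timed is sketched below; this is not the measurement code from the original. The helper name time_both and the assumption that d_data (device) and h_data (host) already hold copies of the same unsorted values are mine:

#include "SortArray.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>

// Times one call to gpu_sort_array with CUDA events and one call to
// cpu_sort with the C clock.
void time_both(u32* d_data, u32* h_data)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    gpu_sort_array(d_data, 32, NUM_ELEM);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float gpu_ms = 0.0f;
    cudaEventElapsedTime(&gpu_ms, start, stop);

    clock_t t0 = clock();
    cpu_sort(h_data, NUM_ELEM);
    clock_t t1 = clock();
    double cpu_ms = 1000.0 * (double)(t1 - t0) / CLOCKS_PER_SEC;

    printf("GPU: %.3f ms, CPU: %.3f ms\n", gpu_ms, cpu_ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}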

Moreover, when the sequence length gets too large, nvcc fails to compile the kernel, presumably because the two statically sized __shared__ arrays then exceed the per-block shared-memory limit (at NUM_ELEM = 4096 they already use 2 × 4096 × 4 B = 32 KB).
There is still room for optimization!
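One possible direction, sketched below rather than taken from the original: declare the scratch space as dynamically sized shared memory with extern __shared__ and pass the size as the third kernel-launch parameter, splitting the single buffer between sort_tmp and sort_tmp_1. The names gpu_sort_array_array_dyn and gpu_sort_array_dyn are hypothetical, and the code assumes it is added to SortArray.cu so the existing __device__ helpers are visible:

__global__ void gpu_sort_array_array_dyn(
    u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    const u32 tid = (blockIdx.x * blockDim.x) + threadIdx.x;

    // One dynamically sized buffer, partitioned into the two scratch arrays.
    extern __shared__ u32 dyn_smem[];
    u32 * const sort_tmp   = dyn_smem;
    u32 * const sort_tmp_1 = dyn_smem + num_elements;

    copy_data_to_shared(data, sort_tmp, num_lists, num_elements, tid);
    radix_sort2(sort_tmp, num_lists, num_elements, tid, sort_tmp_1);
    merge_array6(sort_tmp, data, num_lists, num_elements, tid);
}

void gpu_sort_array_dyn(u32 * const data,
    const u32 num_lists,
    const u32 num_elements)
{
    // Third launch parameter: bytes of dynamic shared memory per block.
    const size_t smem_bytes = 2 * num_elements * sizeof(u32);
    gpu_sort_array_array_dyn<<<1, num_lists, smem_bytes>>>(data, num_lists, num_elements);
}

Note that this only moves the check from compile time to launch time: the block is still bound by the device's shared-memory-per-block limit, so very long sequences would need a different decomposition (for example, sorting in tiles).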

References

[1] Shane Cook. CUDA Programming: A Developer's Guide to Parallel Computing with GPUs. Morgan Kaufmann, 2013.