0. Introduction
It is sometimes also called a reduction algorithm.
Inner product: d = <x, y> = x[0]*y[0] + x[1]*y[1] + … + x[N-1]*y[N-1].
Some problems that can be solved by analogy with the inner-product idea (a small sketch of the shared pattern follows the list):
- Summation
- Product
- Logical operations, e.g. and, or, xor…
- Extrema, e.g. max, min.
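All of these follow the same pattern: repeatedly combine pairs of values with one operator until a single value is left. Below is a minimal CPU sketch of that pairwise (tree) reduction, assuming FLOAT comes from the reduction_aux.h header used by the program further down (reduce_tree is just an illustrative name, not part of the original code):
#include "reduction_aux.h"  /* provides FLOAT, as in the full program below */
/* pairwise (tree) reduction: the active range is halved each round, so log2(n) rounds suffice.
 * n is assumed to be a power of two here; swapping += for *, max, min, ... gives the other
 * problems listed above. */
FLOAT reduce_tree(FLOAT *v, int n)
{
    for (int s = n / 2; s > 0; s /= 2) {
        for (int i = 0; i < s; i++) v[i] += v[i + s];  /* on the GPU, each i becomes a thread */
    }
    return v[0];
}
The CUDA kernels below perform exactly this halving inside each thread block, with shared memory playing the role of v.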
1. Code
The explanation is in the comments:
/* dot product of two vectors: d = <x, y> */
#include "reduction_aux.h"
#include <assert.h>
/* host version: on the CPU a single for loop is enough */
FLOAT dot_host(FLOAT *x, FLOAT *y, int N)
{
int i;
FLOAT t = 0;
assert(x != NULL);
assert(y != NULL);
for (i = 0; i < N; i++) t += x[i] * y[i];
return t;
}
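/* reduce the last 32 partial sums within a single warp: the pointer is volatile so every
 * read/write really touches shared memory, and the classic warp-synchronous assumption is
 * relied on instead of __syncthreads() (on Volta and newer GPUs __syncwarp() would be
 * needed between these steps) */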
__device__ void warpReduce(volatile FLOAT *sdata, int tid)
{
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
/* partial dot product: each block of 256 threads reduces only its own chunk; the full sum is not finished here */
__global__ void dot_stg_1(const FLOAT *x, FLOAT *y, FLOAT *z, int N)
{
__shared__ FLOAT sdata[256];
/********************************************************************************
get thread id: 1D block and 2D grid; blockDim.x is a built-in variable
#define get_tid() (blockDim.x * (blockIdx.x + blockIdx.y * gridDim.x) + threadIdx.x) // global thread ID
get block id: 2D grid
#define get_bid() (blockIdx.x + blockIdx.y * gridDim.x) // ID of the block within the grid
********************************************************************************/
int idx = get_tid(); /* global index; the valid data lives at 0..N-1, the surplus threads fall beyond that */
int tid = threadIdx.x; /* position of the thread within its block, 0..255 */
int bid = get_bid(); /* ID of the block within the grid, 0..198*198-1 here; 198 is the grid dimension (note the grid is 2D) */
/* load data to shared mem: each thread stores one product x[idx]*y[idx] into shared memory */
if (idx < N) {
sdata[tid] = x[idx] * y[idx];
}
else {
sdata[tid] = 0; /* more threads are launched than there are elements; removing this line makes the result wrong (see section 2) */
}
__syncthreads();
/* reduction in shared mem: accumulate the products with a tree reduction */
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0 && bid * blockDim.x < N) z[bid] = sdata[0]; /* guard: the square grid launches a few surplus blocks that must not write past the end of z */
}
/* stage 2: sum the entries of x block by block and assign one partial sum per block to y
* block dim must be 256 */
__global__ void dot_stg_2(const FLOAT *x, FLOAT *y, int N)
{
__shared__ FLOAT sdata[256];
int idx = get_tid();
int tid = threadIdx.x;
int bid = get_bid();
/* load data to shared mem */
if (idx < N) {
sdata[tid] = x[idx];
}
else {
sdata[tid] = 0;
}
__syncthreads();
/* reduction using shared mem */
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0 && bid * blockDim.x < N) y[bid] = sdata[0]; /* same guard as in stage 1 */
}
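/* stage 3: a single block of 128 threads folds the remaining N partial sums into x[0] */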
__global__ void dot_stg_3(FLOAT *x, int N)
{
__shared__ FLOAT sdata[128];
int tid = threadIdx.x;
int i;
sdata[tid] = 0;
/* load data to shared mem */
for (i = 0; i < N; i += 128) {
if (tid + i < N) sdata[tid] += x[i + tid];
}
__syncthreads();
/* reduction using shared mem */
if (tid < 64) sdata[tid] = sdata[tid] + sdata[tid + 64];
__syncthreads();
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) x[0] = sdata[0];
}
/* dz and d serve as cache: result stores in d[0] */
void dot_device(FLOAT *dx, FLOAT *dy, FLOAT *dz, FLOAT *d, int N)
{
/* 1D block */
int bs = 256;
/* 2D grid */
int s = ceil(sqrt((N + bs - 1.) / bs));
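/* an s x s grid of bs-thread blocks gives s*s*bs >= N threads; for the default N = 10000070 this yields s = 198 */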
dim3 grid = dim3(s, s);
int gs = 0;
/* stage 1 */
dot_stg_1<<<grid, bs>>>(dx, dy, dz, N);
/* stage 2 */
{
/* stage 1 produced one partial sum per block; N2 of them hold data */
int N2 = (N + bs - 1) / bs;
/* 2D grid for stage 2 */
int s2 = ceil(sqrt((N2 + bs - 1.) / bs));
dim3 grid2 = dim3(s2, s2);
dot_stg_2<<<grid2, bs>>>(dz, d, N2);
/* record gs: the number of partial sums produced by stage 2, to be reduced in stage 3 */
gs = (N2 + bs - 1.) / bs;
}
/* stage 3 */
dot_stg_3<<<1, 128>>>(d, gs);
}
int main(int argc, char **argv)
{
int N = 10000070;
int nbytes = N * sizeof(FLOAT);
FLOAT *hx = NULL, *hy = NULL;
FLOAT *dx = NULL, *dy = NULL, *dz = NULL, *d = NULL;
int i, itr = 20;
FLOAT asd = 0, ash;
double td, th;
if (argc == 2) {
int an;
an = atoi(argv[1]);
if (an > 0) N = an;
}
/*************** allocate GPU mem ***************/
cudaMalloc((void **)&dx, nbytes);
cudaMalloc((void **)&dy, nbytes);
cudaMalloc((void **)&dz, sizeof(FLOAT) * ((N + 255) / 256)); /* one partial sum per block that holds data */
cudaMalloc((void **)&d, sizeof(FLOAT) * ((N + 255) / 256));
if (dx == NULL || dy == NULL || dz == NULL || d == NULL) {
printf("couldn't allocate GPU memory\n");
return -1;
}
printf("allocated %e MB on GPU\n", nbytes / (1024.f * 1024.f));
/*************** allocate CPU mem ***************/
hx = (FLOAT *) malloc(nbytes);
hy = (FLOAT *) malloc(nbytes);
if (hx == NULL || hy == NULL) {
printf("couldn't allocate CPU memory\n");
return -2;
}
printf("allocated %e MB on CPU\n", nbytes / (1024.f * 1024.f));
/* init */
for (i = 0; i < N; i++) {
hx[i] = 1;
hy[i] = 2;
}
/*************** copy data to GPU ***************/
cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
/*************** make sure the device is idle before timing ***************/
cudaDeviceSynchronize();
td = get_time();
/*************** call GPU ***************/
for (i = 0; i < itr; i++) dot_device(dx, dy, dz, d, N); /* repeated itr times so the measured interval covers many launches and the timing is stable */
/*************** let the GPU finish ***************/
cudaDeviceSynchronize();
td = get_time() - td;
/*************** CPU computation ***************/
th = get_time();
for (i = 0; i < itr; i++) ash = dot_host(hx, hy, N);
th = get_time() - th;
/*************** copy data from GPU ***************/
cudaMemcpy(&asd, d, sizeof(FLOAT), cudaMemcpyDeviceToHost);
printf("dot, answer: %d, calculated by GPU:%f, calculated by CPU:%f\n", 2 * N, asd, ash);
printf("GPU time: %e, CPU time: %e, speedup: %g\n", td, th, th / td);
cudaFree(dx);
cudaFree(dy);
cudaFree(dz);
cudaFree(d);
free(hx);
free(hy);
return 0;
}
The overall idea has three stages:
- Block size 256: the array length shrinks by a factor of 256, but for a large array the result is still long, e.g. 2.56 million shrinks to 10,000;
- Apply the same algorithm again to the partial sums;
- Use a single block to reduce what is left to the final result.
As on a scratch-paper sketch, the array shrinks stage by stage:
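Worked out for the default N = 10000070 and the launch configuration in dot_device (these numbers are my own illustration, derived from the code above):
- Stage 1 (dot_stg_1): 10,000,070 elements, block size 256, a 198 x 198 grid → 39,063 partial sums (one per block that actually holds data);
- Stage 2 (dot_stg_2): 39,063 partial sums, a 13 x 13 grid → 153 partial sums;
- Stage 3 (dot_stg_3): one block of 128 threads reduces the 153 values to the single result in d[0].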
2. Result
Test: comment out the statement mentioned above:
/* load data to shared mem: each thread stores one product x[idx]*y[idx] into shared memory */
if (idx < N) {
sdata[tid] = x[idx] * y[idx];
}
else {
//sdata[tid] = 0; /* more threads are launched than there are elements; removing this line makes the result wrong (see below) */
}
I guessed the output would be garbage. Surprisingly it is not garbage, but the result is still wrong: the blocks execute as a whole, so the surplus threads beyond N still take part in the reduction, and the shared-memory entries of those unused threads must be set to 0, otherwise stale values are added into the sum and the result is wrong!
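The fix can also be written in one line, which makes the zero fill for the surplus threads explicit (just a rephrasing of the if/else above):
sdata[tid] = (idx < N) ? x[idx] * y[idx] : 0; /* surplus threads contribute a neutral 0 to the sum */
__syncthreads();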