CUDA-2

CUDA-2

这次开始进入cuda编程的field.

先看一个两个数组相加的例子,如果用C来写的话,

#include <stdio.h>

#define N 10

/* Element-wise sum on the CPU: c[i] = a[i] + b[i] for i in [0, N). */
void add(int *a, int *b, int *c){
    for(int i = 0; i < N; i++){
        c[i] = a[i] + b[i];
    }
}

/*
 * Fill a[] with 0,-1,...,-(N-1) and b[] with squares 0,1,...,(N-1)^2,
 * add them element-wise on the CPU, and print each sum.
 */
int main(void){
    int a[N], b[N], c[N];

    /* fill the input arrays */
    for(int i = 0; i < N; i++){
        a[i] = -i;
        b[i] = i * i;
    }

    add(a, b, c);

    /* show results */
    for(int i = 0; i < N; i++){
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    return 0;
}  /* fixed: the original listing was missing this closing brace */

下面这个是我根据书上改编的cuda的version

#include <stdio.h>

#define N 10

//our kernel function

// Kernel: one block per element; blockIdx.x selects the index this
// block handles. Launched as add<<<N, 1>>> from the host.
__global__ void add(int *a, int *b, int *c){
    int idx = blockIdx.x;       // this block's element index
    if (idx >= N)
        return;                 // guard: more blocks than elements is harmless
    c[idx] = a[idx] + b[idx];
}

// Abort with a readable message if a CUDA runtime call failed.
// Every CUDA API call returns cudaError_t; ignoring it hides failures.
static void checkCuda(cudaError_t err, const char *what){
    if (err != cudaSuccess){
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Vector addition on the GPU: copy a[] and b[] to the device, launch
 * one block per element, copy the result c[] back, and print it.
 */
int main(void){
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU (fixed: return codes were unchecked)
    checkCuda(cudaMalloc((void**)&dev_a, N*sizeof(int)), "cudaMalloc dev_a");
    checkCuda(cudaMalloc((void**)&dev_b, N*sizeof(int)), "cudaMalloc dev_b");
    checkCuda(cudaMalloc((void**)&dev_c, N*sizeof(int)), "cudaMalloc dev_c");

    // fill the arrays on the cpu
    for(int i = 0; i < N; i++){
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays to the gpu
    checkCuda(cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy a -> dev_a");
    checkCuda(cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy b -> dev_b");

    // run the kernel: N blocks of 1 thread each
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    // kernel launches don't return an error; query it explicitly
    checkCuda(cudaGetLastError(), "add kernel launch");

    // copy the result c back to the cpu (this cudaMemcpy also synchronizes)
    checkCuda(cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy dev_c -> c");

    // display the results
    for(int i = 0; i < N; i++){
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }

    // free the memory on the device
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_b), "cudaFree dev_b");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");

    return 0;
}


然后运行结果是

pengkun@ubuntu:~/exerciseC$ nvcc add.cu 
pengkun@ubuntu:~/exerciseC$ ./a.out add.cu 
0 + 0 = 0 
-1 + 1 = 0 
-2 + 4 = 2 
-3 + 9 = 6 
-4 + 16 = 12 
-5 + 25 = 20 
-6 + 36 = 30 
-7 + 49 = 42 
-8 + 64 = 56 
-9 + 81 = 72 


下面需要先解释下 核函数里的<<<N, 1>>>是什么意思,

其中第一个参数代表我们想要device用多少个 parallel blocks(并行线程块)来执行我们的核函数。可以理解成把核函数创建了N份,然后并行地运行它们。每个并行的invocation就是一个block. <<<256, 1>>> 就代表可以得到256个blocks运行在GPU上。 而grid指的就是这些并行的blocks的集合.

还有一点要说的是其实在上面的代码中调用核函数时<<<N+1, 1>>>也是可以的,但是实际上用N个就可以了,没有必要再多用一个。

其中blockIdx.x是CUDA的内置变量,表示当前block在grid中x方向上的索引(从0到gridDim.x-1),每个block用它来确定自己要处理的数据下标。

第二个参数的意思是我们想让每个block运行多少个threads, 在上面的向量相加的问题上,我们开了N个block,其实严格的说是 N-blocks * 1-thread/per-block=N-parallel-threads, 其实我们也可以把参数变成<<<N/2, 2>>>, <<<1,N>>> 也是可以的。

比如可以把代码修改成下面的形式。

 1 #include <stdio.h>
  2 
  3 #define N 10
  4 
  5 //our kernel function
  6 
  7 __global__ void add(int *a, int *b, int *c){
  8 
  9     //int tid = blockIdx.x;  // handle the data at this index
 10     int tid = threadIdx.x;  // handle the data at this index
 11     if(tid<N)
 12         c[tid] = a[tid] + b[tid];
 13 
 14 
 15 }
 16 
 17 int main(void){
 18     int a[N], b[N], c[N];
 19     int *dev_a, *dev_b, *dev_c;
 20 
 21     //allocate the memory on the GPU
 22     cudaMalloc((void**)&dev_a, N*sizeof(int));
 23     cudaMalloc((void**)&dev_b, N*sizeof(int));
 24     cudaMalloc((void**)&dev_c, N*sizeof(int));
 25 
 26 
 27     // fill the arrays on the cpu
 28 
 29     for(int i=0;i<N;i++){
 30         a[i] = -i;
 31         b[i] = i*i;
 32     }
 33 
 34     // copy the arrays to the gpu
 35     cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
 36     cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
 37 
 38     // run the kernle
 39     add<<<1, N>>>(dev_a, dev_b, dev_c);
 40 
 41 
 42     // copy the result c to cpu
 43     cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
 44 
 45     //display the results
 46     for(int i=0;i<N;i++){
 47         printf("%d + %d = %d \n", a[i], b[i], c[i]);
 48 
 49     }
 50 
 51     //free the memory on the device
 52     cudaFree(dev_a);
 53     cudaFree(dev_b);
 54     cudaFree(dev_c);
 55 
 56     return 0;
 57 


上面的N太小了,再放一个N比较大的

#include "../common/book.h"

#define N   (32 * 1024)

// Kernel: launched with gridDim.x blocks of 1 thread each. Each block
// starts at its own blockIdx.x and strides forward by gridDim.x (the
// number of blocks) until all N elements are covered, so the grid may
// be smaller than N.
__global__ void add( int *a, int *b, int *c ) {
    for (int i = blockIdx.x; i < N; i += gridDim.x) {
        c[i] = a[i] + b[i];
    }
}

/*
 * Large-N vector addition: allocate host and device buffers, copy the
 * inputs over, launch 128 single-thread blocks that stride over all N
 * elements, copy the result back, and verify it against the CPU.
 */
int main( void ) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {  // fixed: malloc was unchecked
        printf( "host malloc failed\n" );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );

    // 128 blocks of 1 thread; the kernel strides by gridDim.x to cover N
    add<<<128,1>>>( dev_a, dev_b, dev_c );
    // launches don't return cudaError_t; query launch errors explicitly
    HANDLE_ERROR( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU (synchronizes)
    HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost ) );

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf( "Error:  %d + %d != %d\n", a[i], b[i], c[i] );
            success = false;
        }
    }
    if (success)    printf( "We did it!\n" );

    // free the memory we allocated on the GPU
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );

    // free the memory we allocated on the CPU
    free( a );
    free( b );
    free( c );

    return 0;
}

注意到这次在核函数里面又出现了内置变量gridDim.x, 它表示grid中x方向上block的数目。因为这时候的N非常大,而根据刚才显示的GPU的信息,能启动的blocks数目可能小于N。比如现在只给了128个block,这时候tid每次加上gridDim.x,就相当于每算完128个元素再向前推进一批,接着算下一批。

打赏,谢谢~~

取消

感谢您的支持,我会继续努力的!

扫码支持
扫码打赏,多谢支持~

打开微信扫一扫,即可进行扫码打赏哦