CUDA-5

# 小郑之家~

• 读x的值

• 在x的值上面加1

• 把加完后的值重新写入x

#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define SIZE (100*1024*1024)

//generate random function

/*
 * Allocate `size` bytes on the host heap and fill each byte with the low
 * byte of successive rand() results.
 *
 * size: number of bytes to allocate and fill.
 *
 * Returns a pointer that the caller must release with free(), or NULL when
 * `size` is negative or the allocation fails.
 */
void* big_random_block(int size){
    // A negative int would turn into a huge size_t inside malloc.
    if(size < 0){
        return NULL;
    }

    unsigned char *data = (unsigned char*)malloc((size_t)size);
    if(data == NULL){
        // Report failure to the caller instead of writing through NULL below.
        return NULL;
    }

    for(int i=0; i<size; i++){
        // rand() returns an int; only its low byte is kept, on purpose.
        data[i] = (unsigned char)rand();
    }
    return data;
}

/*
 * CPU reference: build a 256-bin histogram of the bytes returned by
 * big_random_block(SIZE), print how long the counting took, and print the
 * sum of all bins as a sanity check (it should equal SIZE).
 *
 * Returns 0 on success, 1 if the input buffer could not be allocated.
 */
int main(){
    unsigned char *buffer = (unsigned char*) big_random_block(SIZE);
    if(buffer == NULL){
        // Without this check the counting loop below would read through NULL.
        fprintf(stderr, "failed to allocate %d bytes for the input buffer\n", SIZE);
        return 1;
    }

    clock_t start, stop;
    start = clock();

    // One bin per possible byte value; the initializer zeroes every bin.
    unsigned int histo[256] = {0};

    for(int i=0;i<SIZE; i++){
        histo[buffer[i]]++;
    }

    stop = clock();
    // clock() ticks -> seconds -> milliseconds.
    float elapsedTime = (float)(stop-start)/(float)CLOCKS_PER_SEC*1000.0f;

    printf("Time to generate: %3.1f ms\n", elapsedTime);

    // Check the result: every input byte landed in exactly one bin.
    long histoCount = 0;
    for(int i=0;i<256;i++){
        histoCount += histo[i];
    }

    printf("Histogram sum: %ld\n", histoCount);
    free(buffer);

    return 0;
}



Time to generate: 150.1 ms
Histogram sum: 104857600



/*
 * Debug variant of big_random_block: allocate `size` bytes, fill each byte
 * with the low byte of successive rand() results, and print every generated
 * byte as a decimal number on its own line to stdout.
 *
 * size: number of bytes to allocate and fill.
 *
 * Returns a pointer that the caller must release with free(), or NULL when
 * `size` is negative or the allocation fails.
 */
void* big_random_block(int size){
    // A negative int would turn into a huge size_t inside malloc.
    if(size < 0){
        return NULL;
    }

    unsigned char *data = (unsigned char*)malloc((size_t)size);
    if(data == NULL){
        // Report failure to the caller instead of writing through NULL below.
        return NULL;
    }

    for(int i=0; i<size; i++){
        // rand() returns an int; only its low byte is kept, on purpose.
        data[i] = (unsigned char)rand();
        printf("%d\n", data[i]);
    }
    return data;
}



之所以是 256，是因为每个随机的 8-bit 字节可以取 256 个不同的值（从 0x00 到 0xFF），所以需要 256 个桶来统计每个值出现的次数。

### 接下来是 GPU 版本

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SIZE (100*1024*1024)

//generate random function

/*
 * Allocate `size` bytes on the host heap and fill each byte with the low
 * byte of successive rand() results.
 *
 * size: number of bytes to allocate and fill.
 *
 * Returns a pointer that the caller must release with free(), or NULL when
 * `size` is negative or the allocation fails.
 */
void* big_random_block(int size){
    // A negative int would turn into a huge size_t inside malloc.
    if(size < 0){
        return NULL;
    }

    unsigned char *data = (unsigned char*)malloc((size_t)size);
    if(data == NULL){
        // Report failure to the caller instead of writing through NULL below.
        return NULL;
    }

    for(int i=0; i<size; i++){
        // rand() returns an int; only its low byte is kept, on purpose.
        data[i] = (unsigned char)rand();
    }
    return data;
}

// our kernel function

/*
 * Byte histogram using atomics directly on the output array.
 *
 * buffer: device pointer to `size` input bytes.
 * size:   number of bytes in `buffer`.
 * histo:  device pointer to 256 counters; the kernel only adds to them, so
 *         the caller must zero them before the launch.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Any grid/block size is valid because each thread walks the input in steps
 * of the total number of launched threads.
 */
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){

    // 64-bit index math so that neither the start index nor `i += stride`
    // can overflow when `size` is large.
    long long i = threadIdx.x + (long long)blockIdx.x*blockDim.x;
    long long stride = (long long)blockDim.x*gridDim.x;
    while(i<size){
        // Many threads can hit the same bin at once, so the
        // read-increment-write must be a single atomic operation.
        atomicAdd(&histo[buffer[i]], 1u);
        i += stride;
    }
}

// Abort with a readable message when a CUDA runtime call reports failure.
static void check_cuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * GPU histogram driver: copies SIZE random bytes to the device, launches
 * histo_kernel to count them into 256 bins, copies the bins back, then
 * prints the elapsed time and the sum of all bins (expected to equal SIZE).
 *
 * The timed region covers device allocation, both copies and the kernel.
 *
 * Returns 0 on success, 1 if the host buffer could not be allocated; any
 * CUDA failure terminates the process through check_cuda().
 */
int main(void){
    unsigned char *buffer = (unsigned char*) big_random_block(SIZE);
    if(buffer == NULL){
        fprintf(stderr, "failed to allocate %d bytes for the input buffer\n", SIZE);
        return 1;
    }

    clock_t start, stop;
    start = clock();

    unsigned char *dev_buffer = NULL;
    unsigned int *dev_histo = NULL;

    // Allocate device memory for the input bytes and the 256 counters.
    check_cuda(cudaMalloc((void**)&dev_buffer, SIZE), "cudaMalloc(dev_buffer)");
    check_cuda(cudaMalloc((void**)&dev_histo, 256*sizeof(unsigned int)), "cudaMalloc(dev_histo)");

    // Copy the input from host to device.
    check_cuda(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> dev_buffer)");

    // Zero the bins before the launch.
    check_cuda(cudaMemset(dev_histo, 0, 256*sizeof(unsigned int)), "cudaMemset(dev_histo)");

    // Query device 0 for its multiprocessor count to size the grid.
    cudaDeviceProp prop;
    check_cuda(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties");
    int blocks = prop.multiProcessorCount;
    //printf("%d\n", blocks);

    // Author's note: twice the number of multiprocessors worked best.
    histo_kernel<<<blocks*2, 256>>>(dev_buffer, SIZE, dev_histo);
    // A launch returns no status itself; launch errors are reported here.
    check_cuda(cudaGetLastError(), "histo_kernel launch");

    // Get the result. Faults raised while the kernel runs are reported by
    // this synchronizing call.
    unsigned int histo[256];
    check_cuda(cudaMemcpy(histo, dev_histo, 256*sizeof(unsigned int), cudaMemcpyDeviceToHost),
               "cudaMemcpy(dev_histo -> host)");

    stop = clock();
    // clock() ticks -> seconds -> milliseconds.
    float elapsedTime = (float)(stop-start)/(float)CLOCKS_PER_SEC*1000.0f;
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    // Check the result: every input byte landed in exactly one bin.
    long histoCount = 0;
    for(int i=0;i<256;i++){
        histoCount += histo[i];
    }

    printf("Histogram sum: %ld\n", histoCount);

    check_cuda(cudaFree(dev_histo), "cudaFree(dev_histo)");
    check_cuda(cudaFree(dev_buffer), "cudaFree(dev_buffer)");
    free(buffer);

    return 0;
}



Time to generate: 178.0 ms
Histogram sum: 104857600



/*
 * Byte histogram that accumulates into a per-block shared-memory histogram
 * first and merges it into the global result once per block.
 *
 * buffer: device pointer to `size` input bytes.
 * size:   number of bytes in `buffer`.
 * histo:  device pointer to 256 counters; the kernel only adds to them, so
 *         the caller must zero them before the launch.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Uses 256 * sizeof(unsigned int) bytes of static shared memory per block.
 * The bin loops step by blockDim.x, so any block size is valid.
 */
__global__ void histo_kernel( unsigned char *buffer,
                              long size,
                              unsigned int *histo ) {

    __shared__  unsigned int temp[256];

    // Shared memory starts uninitialized: clear this block's bins.
    for (int bin = threadIdx.x; bin < 256; bin += blockDim.x) {
        temp[bin] = 0;
    }
    // Every bin must be zero before any thread starts counting.
    __syncthreads();

    // 64-bit index math so that neither the start index nor `i += stride`
    // can overflow when `size` is large.
    long long i = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    long long stride = (long long)blockDim.x * gridDim.x;
    while (i < size) {
        // Threads of this block may hit the same bin at once.
        atomicAdd(&temp[buffer[i]], 1u);
        i += stride;
    }
    // All of the block's counts must be in `temp` before it is merged.
    __syncthreads();

    // Merge the block-local counts into the global histogram.
    for (int bin = threadIdx.x; bin < 256; bin += blockDim.x) {
        atomicAdd(&histo[bin], temp[bin]);
    }
}



Time to generate:  26.4 ms
Histogram Sum:  104857600