cuda-计算roc

cuda-计算roc

计算roc以前在看书的时候遇到过,当时就假想过如果数据量非常大的时候该如何算。就做了个demo,试了一下,矩阵的大小是 15W*15W左右,花费在2秒内,感觉比cpu快了太多了。

探索了好久,才探索出一个小的demo, 如果数据量再加很大的话,比如加大到GPU不够了,再加大CPU也不够了,这两种情况下又该去如何处理。 现在的例子是GPU还可以,也不需要用多个GPU。 代码如下, 写的比较差,不够通用。有些地方可以提出来,也有很多地方是重复的,比如读文件和写文件的时候。后面继续修改。

#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include "cuda_runtime.h"

#define QNUM 14536
#define GNUM 14589
#define NT (QNUM*GNUM)
#define NUM 1024

// Histogram kernel for ROC computation over the QNUM x GNUM score matrix.
//
// Launch precondition: blockDim.x == NUM — each thread owns exactly one
// shared-memory bin during the init and flush phases.  The grid size is
// arbitrary thanks to the grid-stride loop.
//
// score   : NT = QNUM*GNUM similarity scores, assumed in [-1, 1]
//           (a value outside that range would index out of bounds —
//           NOTE(review): confirm the producer guarantees this)
// histo1  : global per-bin count of all pairs
// histo2  : global per-bin count of same-label (genuine) pairs
// label_q : QNUM query labels; label_g : GNUM gallery labels
__global__ void roc_kernel(const float *score, int *histo1, int *histo2, int *label_q, int *label_g){

    // Block-local histograms keep atomic contention off global memory.
    __shared__ int temp1[NUM];
    __shared__ int temp2[NUM];

    temp1[threadIdx.x] = 0;
    temp2[threadIdx.x] = 0;
    __syncthreads();

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int stride = blockDim.x * gridDim.x;

    while (idx < NT) {
        // Map a score in [-1, 1] onto a bin in [0, NUM-1].  Float literals
        // (0.5f / 1.0f) avoid silently promoting the whole expression to
        // double; the bin index is computed once instead of twice.
        const int bin = (int)(0.5f * (score[idx] + 1.0f) * (NUM - 1));
        atomicAdd(&temp1[bin], 1);
        // Row idx/GNUM is the query, column idx%GNUM the gallery entry;
        // the comparison contributes 1 only for a same-label pair.
        atomicAdd(&temp2[bin], label_q[idx / GNUM] == label_g[idx % GNUM]);
        idx += stride;
    }

    __syncthreads();
    // Flush the block-local bins into the global histograms.
    atomicAdd(&histo1[threadIdx.x], temp1[threadIdx.x]);
    atomicAdd(&histo2[threadIdx.x], temp2[threadIdx.x]);
}

// Abort with a readable message when a CUDA runtime call fails.  Kernel
// launches do not return an error directly, so they are checked with
// cudaGetLastError() right after the <<<...>>> call.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

// Read `num` elements of type T from binary file `fname` into `dst`.
// Returns false (after logging to stderr) when the file cannot be opened
// or the read comes up short.
template<typename T>
static bool read_blob(const char* fname, int num, T* dst){
    std::ifstream file(fname, std::ios::binary | std::ios::in);
    if(!file){
        fprintf(stderr, "cannot open %s\n", fname);
        return false;
    }
    file.read((char*)dst, sizeof(T) * num);
    if(!file){
        fprintf(stderr, "short read on %s\n", fname);
        return false;
    }
    return true;
}

// Write `num` floats to binary file `fname`; returns false after logging
// when the file cannot be opened or the write fails.
static bool write_blob(const char* fname, const float* src, int num){
    std::ofstream out(fname, std::ios::binary | std::ios::out);
    if(!out){
        fprintf(stderr, "cannot open %s for writing\n", fname);
        return false;
    }
    out.write((const char*)src, sizeof(float) * num);
    return out.good();
}

// Load scores and labels, histogram them on the GPU, then derive and save
// the tpr/fpr arrays.  Exit code 0 on success, 1 on any I/O or CUDA error.
int main(void){

    // ---- read score matrix and labels -----------------------------------
    float *score = new float[NT];
    int *ql = new int[QNUM];
    int *gl = new int[GNUM];

    if(!read_blob("score.bin", NT, score) ||
       !read_blob("labelQ.bin", QNUM, ql) ||
       !read_blob("labelG.bin", GNUM, gl)){
        delete []score;
        delete []ql;
        delete []gl;
        return 1;
    }

    // ---- allocate device buffers -----------------------------------------
    float *dev_score;
    int *dev_ql, *dev_gl, *dev_histo1, *dev_histo2;
    int histo1[NUM];
    int histo2[NUM];

    CUDA_CHECK(cudaMalloc((void**)&dev_score, NT*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_ql, QNUM*sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_gl, GNUM*sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_histo1, NUM*sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_histo2, NUM*sizeof(int)));

    CUDA_CHECK(cudaMemcpy(dev_score, score, NT*sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_ql, ql, QNUM*sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_gl, gl, GNUM*sizeof(int), cudaMemcpyHostToDevice));
    // Zero the device histograms directly instead of copying zeroed host
    // arrays over (byte-wise cudaMemset is correct for int zero).
    CUDA_CHECK(cudaMemset(dev_histo1, 0, NUM*sizeof(int)));
    CUDA_CHECK(cudaMemset(dev_histo2, 0, NUM*sizeof(int)));

    // ---- launch ----------------------------------------------------------
    // NUM threads per block is required by roc_kernel (one shared-memory
    // bin per thread); the grid size of NUM blocks is a tuning choice.
    roc_kernel<<<NUM, NUM>>>(dev_score, dev_histo1, dev_histo2, dev_ql, dev_gl);
    CUDA_CHECK(cudaGetLastError());

    // ---- copy result back (a blocking D2H memcpy also synchronizes) -----
    CUDA_CHECK(cudaMemcpy(histo1, dev_histo1, NUM*sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(histo2, dev_histo2, NUM*sizeof(int), cudaMemcpyDeviceToHost));

    // ---- totals ----------------------------------------------------------
    int num_total = 0;
    int num_real_true = 0;
    for(int i=0;i<NUM;i++){
        num_total += histo1[i];
        num_real_true += histo2[i];
    }
    printf("num_total: %d, num_real_true: %d\n", num_total, num_real_true);
    int num_real_false = num_total - num_real_true;

    // ---- derive tpr/fpr --------------------------------------------------
    // Guard the divisions: if every pair is genuine (or none is), the ROC
    // curve is undefined and the original code would divide by zero.
    if(num_real_true == 0 || num_real_false == 0){
        fprintf(stderr, "degenerate label distribution, cannot compute ROC\n");
        cudaFree(dev_score);
        cudaFree(dev_ql);
        cudaFree(dev_gl);
        cudaFree(dev_histo1);
        cudaFree(dev_histo2);
        delete []score;
        delete []ql;
        delete []gl;
        return 1;
    }

    // Sweep the decision threshold from the highest bin down: at index i,
    // every pair scoring in bins >= i is predicted positive.
    float fpr[NUM];
    float tpr[NUM];
    int pred_pos = 0, true_pos = 0;
    for(int i=NUM-1;i>=0;i--){
        pred_pos += histo1[i];   // all pairs at or above threshold i
        true_pos += histo2[i];   // genuine pairs among them
        tpr[i] = 1.0f * true_pos / num_real_true;
        fpr[i] = 1.0f * (pred_pos - true_pos) / num_real_false;
    }

    // ---- save results ----------------------------------------------------
    write_blob("tpr.bin", tpr, NUM);
    write_blob("fpr.bin", fpr, NUM);

    // ---- cleanup ---------------------------------------------------------
    cudaFree(dev_score);
    cudaFree(dev_ql);
    cudaFree(dev_gl);
    cudaFree(dev_histo1);
    cudaFree(dev_histo2);

    delete []score;
    delete []ql;
    delete []gl;
    return 0;
}

上面的编译完之后就可以在python中画图了,其实编译的命令是

nvcc roc_cuda.cu --compiler-options -fPIC -shared -o roc.so

然后就是python的代码了

# ===========================================
# --coding:UTF-8 --
# file: test.py
# author: ZhengPengkun
# date: 2018-12-21
# email: pkzhengmath@pku.edu.cn
# description: 
# ===========================================

import matplotlib.pyplot as plt 
import numpy as np
import os
import time
import datetime

def plot():
    """Load the fpr/tpr binaries produced by the CUDA step and save roc.png."""
    false_rate = np.fromfile("fpr.bin", np.float32)
    true_rate = np.fromfile("tpr.bin", np.float32)
    print(false_rate.shape, true_rate.shape)
    # Log-scaled x axis: the interesting region of an ROC curve is the
    # very-low-fpr end.
    plt.plot(false_rate, true_rate, "b")
    plt.xscale("log")
    plt.xlim([1e-8, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.savefig("roc.png")
    #plt.show()


def main():
    """Plot the ROC curve, computing tpr/fpr via the CUDA library if needed.

    If tpr.bin and fpr.bin already exist they are plotted directly;
    otherwise the compiled shared library ./roc.so is loaded with ctypes
    and its main() entry point is invoked to produce them.  The
    intermediate binaries are removed afterwards.
    """
    if os.path.exists("tpr.bin") and os.path.exists("fpr.bin"):
        plot()
    else:
        print("Calculate tpr and fpr")
        from ctypes import CDLL
        dev = CDLL("./roc.so")
        dev.main()
        if os.path.exists("tpr.bin") and os.path.exists("fpr.bin"):
            plot()
        else:
            print("error!")
    # Remove the intermediates portably instead of shelling out to `rm`:
    # os.system("rm ...") depends on a POSIX shell and prints an error
    # when the files were never created.
    for name in ("tpr.bin", "fpr.bin"):
        try:
            os.remove(name)
        except OSError:
            pass
if __name__ == "__main__":
    # Time the whole run (CUDA call + plotting) with a wall clock.
    t0 = time.time()
    main()
    print("spends %.4f secs" % (time.time() - t0))


测试的结果是

pengkun@ubuntu:/mnt/data1/cuda_roc$ python test.py 
Calculate tpr and fpr
num_total: 212065704, num_real_true: 10934
(1024,) (1024,)
QApplication: invalid style override passed, ignoring it.
spends 1.7987 secs
pengkun@ubuntu:/mnt/data1/cuda_roc$ 

图是这样的

avator

需要注意的地方


  • 并不是说线程越多越好,

比如现在我的1060的上面的的信息是这样的。

   --- General Information for device 0 ---
Name:  GeForce GTX 1060 6GB
Compute capability:  6.1
Clock rate:  1733500
Device copy overlap:  Enabled
Kernel execution timeout :  Enabled
   --- Memory Information for device 0 ---
Total global mem:  6371475456
Total constant Mem:  65536
Max mem pitch:  2147483647
Texture Alignment:  512
   --- MP Information for device 0 ---
Multiprocessor count:  10
Shared mem per mp:  49152
Registers per mp:  65536
Threads in warp:  32
Max threads per block:  1024
Max thread dimensions:  (1024, 1024, 64)
Max grid dimensions:  (2147483647, 65535, 65535)


从上面可以看出来,1024我选的已经是极限了,但是histo并不是说就得一定申请长为1024,也可以申请的很大,只是这时候比较方便。 grid的选取我选的也是1024,其实我发现设置到1好像比1024还要快,这是因为开启那么多的线程,是需要调度的时间的,比如当设置的非常大的时候,时间就会更慢。出现这种情况,就相当于是请100个人来搬10块砖头,找这100个人都得需要好多时间,还不如找一个人,哪怕运两次也该搞定了。所以用cuda的时候,要衡量好这个关系。开多少个线程合适,当然也不是说任务是多少就开多少个线程,就比如说上面的代码中如果grid指定为(NT-1)/NUM+1的话,我测的时间是6秒多,反而慢了许多。

Calculate tpr and fpr
num_total: 212065704, num_real_true: 10934
((1024,), (1024,))
spends 6.1431 secs

第二个是读数据的时候,之前我用for去读的时候特别地慢。后来注释掉了,改成一次性读完,就快了很多。 第三个是三维的grid和block现在已经可以用了,用三维的应该会更快一些,这是下一步的目标。之后再把这个算法推广到用多个GPU,如果多个GPU仍然不够的话,就分batch. 第四是在不同的机器上测的时候结果可能不太一样,我在自己的电脑上测试的时候效果又快了一些

pengkun@pc:/mnt/data1/cuda_roc$ bash do.sh 
Calculate tpr and fpr
num_total: 212065704, num_real_true: 10934
((1024,), (1024,))
spends 0.8412 secs

关于读和写的更改的相关代码在下面

 35 template<typename T>
 36 void read_data(const char* fname, int num, T* content){
 37     std::ifstream file;
 38     file.open(fname, std::ios::binary|std::ios::in);
 39     file.read((char*)content, sizeof(T)*num);
 40     file.close();
 41 }
 42 

对应的读的地方改成了

    const char* file1 = "score.bin";
    const char* file2 = "labelQ.bin";
    const char* file3 = "labelG.bin";
    float *score = new float[NT];
    read_data(file1, NT, score);

    int *ql = new int[QNUM];
    read_data(file2, QNUM, ql);
        
    int *gl = new int[GNUM];
    read_data(file3, GNUM, gl);



也可以按下面的方式更改

// Read `num` elements of type T from binary file `fname` into a freshly
// allocated buffer.  The caller owns the result and must delete[] it.
// On open failure or short read an error is logged to stderr; the buffer
// (value-initialized, so deterministic zeros where nothing was read) is
// still returned so existing callers keep working.
template<typename T>
T* read_data(const char* fname, int num){
    T* content = new T[num]();   // () => zero-initialize
    std::ifstream file(fname, std::ios::binary|std::ios::in);
    if(!file){
        std::cerr << "cannot open " << fname << std::endl;
        return content;
    }
    file.read((char*)content, sizeof(T)*num);
    if(!file){
        std::cerr << "short read on " << fname << std::endl;
    }
    return content;
}

读的地方

    //read score and label
    const char* file1 = "score.bin";
    const char* file2 = "labelQ.bin";
    const char* file3 = "labelG.bin";
    float *score = read_data<float>(file1, NT);
    int *ql = read_data<int>(file2, QNUM);
    int *gl = read_data<int>(file3, GNUM);

感觉这种更方便一些

然后是写的代码可以改成

// Write the NUM-element float array `content` to binary file `fname`.
// Logs to stderr when the file cannot be opened instead of silently
// writing nothing.  Ownership of `content` stays with the caller.
void write_data(const char* fname, float *content){
    std::ofstream out;
    out.open(fname, std::ios::binary|std::ios::out);
    if(!out){
        std::cerr << "cannot open " << fname << " for writing" << std::endl;
        return;
    }
    out.write((char*)content, sizeof(float)*NUM);
    out.close();
    //delete []content;
}


对应的写的部分改成了

    const char* FPR = "fpr.bin";
    const char* TPR = "tpr.bin";
    write_data(FPR, fpr);
    write_data(TPR, tpr);


打赏,谢谢~~

取消

感谢您的支持,我会继续努力的!

扫码支持
扫码打赏,多谢支持~

打开微信扫一扫,即可进行扫码打赏哦