pytorch-cuda and c++ extensions

This post is based on the latest PyTorch release, version 1.0.0.

Let's start by working through the official tutorial as an example.

c++ extension

  • Step 1: write the .cpp file
#include <torch/torch.h>
#include <iostream>
#include <vector>

at::Tensor d_sigmoid(at::Tensor z) {
  auto s = at::sigmoid(z);
  return (1 - s) * s;
}


std::vector<at::Tensor> lltm_forward(
    at::Tensor input,
    at::Tensor weights,
    at::Tensor bias,
    at::Tensor old_h,
    at::Tensor old_cell) {
  auto X = at::cat({old_h, input}, /*dim=*/1);

  auto gate_weights = at::addmm(bias, X, weights.transpose(0, 1));
  auto gates = gate_weights.chunk(3, /*dim=*/1);

  auto input_gate = at::sigmoid(gates[0]);
  auto output_gate = at::sigmoid(gates[1]);
  auto candidate_cell = at::elu(gates[2], /*alpha=*/1.0);

  auto new_cell = old_cell + candidate_cell * input_gate;
  auto new_h = at::tanh(new_cell) * output_gate;

  return {new_h,
          new_cell,
          input_gate,
          output_gate,
          candidate_cell,
          X,
          gate_weights};
}

at::Tensor d_tanh(at::Tensor z) {
  return 1 - z.tanh().pow(2);
}

// elu'(z) = relu'(z) + { alpha * exp(z) if (alpha * (exp(z) - 1)) < 0, else 0}
at::Tensor d_elu(at::Tensor z, at::Scalar alpha = 1.0) {
  auto e = z.exp();
  auto mask = (alpha * (e - 1)) < 0;
  return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e);
}

std::vector<at::Tensor> lltm_backward(
    at::Tensor grad_h,
    at::Tensor grad_cell,
    at::Tensor new_cell,
    at::Tensor input_gate,
    at::Tensor output_gate,
    at::Tensor candidate_cell,
    at::Tensor X,
    at::Tensor gate_weights,
    at::Tensor weights) {
  auto d_output_gate = at::tanh(new_cell) * grad_h;
  auto d_tanh_new_cell = output_gate * grad_h;
  auto d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell;

  auto d_old_cell = d_new_cell;
  auto d_candidate_cell = input_gate * d_new_cell;
  auto d_input_gate = candidate_cell * d_new_cell;

  auto gates = gate_weights.chunk(3, /*dim=*/1);
  d_input_gate *= d_sigmoid(gates[0]);
  d_output_gate *= d_sigmoid(gates[1]);
  d_candidate_cell *= d_elu(gates[2]);

  auto d_gates =
     at::cat({d_input_gate, d_output_gate, d_candidate_cell}, /*dim=*/1);

  auto d_weights = d_gates.t().mm(X);
  auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true);

  auto d_X = d_gates.mm(weights);
  const auto state_size = grad_h.size(1);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &lltm_forward, "LLTM forward");
  m.def("backward", &lltm_backward, "LLTM backward");
}

Don't forget the PYBIND11_MODULE block at the end. It tells Python how this interface is exposed to the outside: calling lltm.forward goes through this binding to the lltm_forward function, and the quoted string after it is the function's description.
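
As a quick check once the module is built and installed (next step), the name and description registered with m.def are visible from Python. A tiny sketch; the exact signature text printed depends on the pybind11/PyTorch version:

import torch   # always import torch before the extension
import lltm

print(lltm.forward.__doc__)   # contains the "LLTM forward" description string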

  • Step 2: write setup.py
from setuptools import setup
from torch.utils.cpp_extension import CppExtension, BuildExtension
import setuptools   # only needed for the commented-out variant below
import torch        # likewise, for torch.utils.cpp_extension.include_paths()
setup(name='lltm',
      ext_modules=[CppExtension('lltm', ['lltm.cpp'])],
      cmdclass={'build_ext': BuildExtension})


"""
setuptools.Extension(
   name='lltm',
   sources=['lltm.cpp'],
   include_dirs=torch.utils.cpp_extension.include_paths(),
   language='c++')
"""


There are two ways to write setup.py. The one above generates build artifacts; the plain setuptools.Extension variant shown in the comment did not produce anything in my run. Let's take the first one as the example. After writing it,

run python setup.py install. The output looks like this (note the harmless warning that including torch/torch.h is deprecated in favor of torch/extension.h):

running install
running bdist_egg
running egg_info
writing lltm.egg-info/PKG-INFO
writing dependency_links to lltm.egg-info/dependency_links.txt
writing top-level names to lltm.egg-info/top_level.txt
reading manifest file 'lltm.egg-info/SOURCES.txt'
writing manifest file 'lltm.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_ext
building 'lltm' extension
gcc -pthread -B /home/pengkun/anaconda3/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/pengkun/anaconda3/lib/python3.6/site-packages/torch/lib/include -I/home/pengkun/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include -I/home/pengkun/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -I/home/pengkun/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -I/home/pengkun/anaconda3/include/python3.6m -c lltm.cpp -o build/temp.linux-x86_64-3.6/lltm.o -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=lltm -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11
cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid for C/ObjC but not for C++
In file included from lltm.cpp:1:0:
/home/pengkun/anaconda3/lib/python3.6/site-packages/torch/lib/include/torch/csrc/api/include/torch/torch.h:7:2: warning: #warning "Including torch/torch.h for C++ extensions is deprecated. Please include torch/extension.h" [-Wcpp]
 #warning \
  ^
g++ -pthread -shared -B /home/pengkun/anaconda3/compiler_compat -L/home/pengkun/anaconda3/lib -Wl,-rpath=/home/pengkun/anaconda3/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.6/lltm.o -o build/lib.linux-x86_64-3.6/lltm.cpython-36m-x86_64-linux-gnu.so
creating build/bdist.linux-x86_64/egg
copying build/lib.linux-x86_64-3.6/lltm.cpython-36m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg
creating stub loader for lltm.cpython-36m-x86_64-linux-gnu.so
byte-compiling build/bdist.linux-x86_64/egg/lltm.py to lltm.cpython-36.pyc
creating build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt
zip_safe flag not set; analyzing archive contents...
__pycache__.lltm.cpython-36: module references __file__
creating 'dist/lltm-0.0.0-py3.6-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it
removing 'build/bdist.linux-x86_64/egg' (and everything under it)
Processing lltm-0.0.0-py3.6-linux-x86_64.egg
removing '/home/pengkun/anaconda3/lib/python3.6/site-packages/lltm-0.0.0-py3.6-linux-x86_64.egg' (and everything under it)
creating /home/pengkun/anaconda3/lib/python3.6/site-packages/lltm-0.0.0-py3.6-linux-x86_64.egg
Extracting lltm-0.0.0-py3.6-linux-x86_64.egg to /home/pengkun/anaconda3/lib/python3.6/site-packages
lltm 0.0.0 is already the active version in easy-install.pth

Installed /home/pengkun/anaconda3/lib/python3.6/site-packages/lltm-0.0.0-py3.6-linux-x86_64.egg
Processing dependencies for lltm==0.0.0
Finished processing dependencies for lltm==0.0.0

Before building, the directory structure is:

.
├── lltm.cpp
└── setup.py

After the build completes, it is:

.
├── build
│   ├── bdist.linux-x86_64
│   ├── lib.linux-x86_64-3.6
│   │   └── lltm.cpython-36m-x86_64-linux-gnu.so
│   └── temp.linux-x86_64-3.6
│       └── lltm.o
├── dist
│   └── lltm-0.0.0-py3.6-linux-x86_64.egg
├── lltm.cpp
├── lltm.egg-info
│   ├── dependency_links.txt
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   └── top_level.txt
└── setup.py


At this point we can test it:

>>> import torch
>>> import lltm
>>> lltm.forward
<built-in method forward of PyCapsule object at 0x7f8c2bc5ef00>
>>> 

Note that import torch must come first!!! (The extension's shared library depends on symbols from PyTorch's own libraries, which are only loaded once torch itself has been imported.)

When using it, you can do the following:

import torch
import lltm
outputs = lltm.forward(input, weights, bias, old_h, old_cell)
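
The only requirement is that the shapes are consistent with the C++ code above: X is the concatenation of old_h and input along dim 1, and the gate pre-activations are chunked into three blocks along dim 1. A minimal sketch with hypothetical sizes:

import torch
import lltm

batch_size, input_features, state_size = 4, 8, 16   # hypothetical sizes

input = torch.randn(batch_size, input_features)
old_h = torch.randn(batch_size, state_size)
old_cell = torch.randn(batch_size, state_size)
# weights: (3 * state_size, input_features + state_size), bias: (3 * state_size,)
weights = torch.randn(3 * state_size, input_features + state_size)
bias = torch.randn(3 * state_size)

new_h, new_cell = lltm.forward(input, weights, bias, old_h, old_cell)[:2]
print(new_h.shape)   # torch.Size([4, 16])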

  • Building with the second method

The second method did not generate anything:

pengkun@ubuntu:~/torch_learn/lltm_extension$ vim setup.py 
pengkun@ubuntu:~/torch_learn/lltm_extension$ python setup.py install
pengkun@ubuntu:~/torch_learn/lltm_extension$ ls
lltm.cpp  setup.py
pengkun@ubuntu:~/torch_learn/lltm_extension$ 

I recommend the first method, since the tutorial says little about the second. (A sketch of what a complete setup.py for the second method might look like follows below.)
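
For completeness, here is my own untested sketch of the plain-setuptools variant. The Extension object still has to be passed to setup() for anything to be built, which may be why nothing appeared above; the defines mirror the -D flags visible in the gcc command of the build log:

# setup.py, plain setuptools variant (untested sketch)
import setuptools
import torch.utils.cpp_extension

ext = setuptools.Extension(
    name='lltm',
    sources=['lltm.cpp'],
    include_dirs=torch.utils.cpp_extension.include_paths(),
    # mirror the defines/flags from the gcc command in the build log above
    define_macros=[('TORCH_API_INCLUDE_EXTENSION_H', None),
                   ('TORCH_EXTENSION_NAME', 'lltm'),
                   ('_GLIBCXX_USE_CXX11_ABI', '0')],
    extra_compile_args=['-std=c++11'],
    language='c++')

setuptools.setup(name='lltm', ext_modules=[ext])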

An even more convenient option is the JIT (just-in-time) approach.

With JIT there is no need to run python setup.py install beforehand; just use it as shown below. The first call is slow, though, because the extension is compiled on the fly:

>>> from torch.utils.cpp_extension import load
>>> 
>>> lltm = load(name="lltm", sources=["lltm.cpp"])
>>> lltm.forward
<built-in method forward of PyCapsule object at 0x7f524d32e750>
>>> 

Call lltm.forward with the input tensors and it is ready to use. A sketch of wrapping it in an autograd Function follows.
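
To actually train with it, the natural next step (roughly following the official tutorial; treat this as a sketch rather than the exact reference code) is to wrap the two calls in a torch.autograd.Function so that autograd drives the hand-written backward:

import torch
import lltm

class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        # save new_cell, the gates, X and gate_weights (everything backward needs), plus weights
        ctx.save_for_backward(*(outputs[1:] + [weights]))
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        d_old_h, d_input, d_weights, d_bias, d_old_cell = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors)
        # gradients must be returned in the same order as forward's inputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell

The hand-written backward can then be sanity-checked with torch.autograd.gradcheck on small double-precision inputs.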

cuda extensions

  • Write the kernel file. Only the skeleton is shown here; the kernel function bodies are omitted, but they are available in the official tutorial.
// lltm_cuda_kernel.cu 
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

/*

kernel funciton

*/

std::vector<at::Tensor> lltm_cuda_forward(
    at::Tensor input,
    at::Tensor weights,
    at::Tensor bias,
    at::Tensor old_h,
    at::Tensor old_cell)
{ 
  auto X = at::cat({old_h, input}, /*dim=*/1);
  auto gates = at::addmm(bias, X, weights.transpose(0, 1));
  
  const auto batch_size = old_cell.size(0);
  const auto state_size = old_cell.size(1);
  
  auto new_h = at::zeros_like(old_cell);
  auto new_cell = at::zeros_like(old_cell);
  auto input_gate = at::zeros_like(old_cell);
  auto output_gate = at::zeros_like(old_cell);
  auto candidate_cell = at::zeros_like(old_cell);
  
  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);
  
  AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] {
    lltm_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
        gates.data<scalar_t>(),
        old_cell.data<scalar_t>(),
        new_h.data<scalar_t>(),
        new_cell.data<scalar_t>(),
        input_gate.data<scalar_t>(),
        output_gate.data<scalar_t>(),
        candidate_cell.data<scalar_t>(),
        state_size);
  }));

  // hand everything the backward pass will need back to Python, like the C++ version
  return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates};
}

/*

kernel function

*/

std::vector<at::Tensor> lltm_cuda_backward(
    at::Tensor grad_h,
    at::Tensor grad_cell,
    at::Tensor new_cell,
    at::Tensor input_gate,
    at::Tensor output_gate,
    at::Tensor candidate_cell,
    at::Tensor X,
    at::Tensor gate_weights,
    at::Tensor weights)
{

  auto d_old_cell = at::zeros_like(new_cell);
  auto d_gates = at::zeros_like(gate_weights);

  const auto batch_size = new_cell.size(0);
  const auto state_size = new_cell.size(1);

  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);

  AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_backward_cuda", ([&] {
    lltm_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
        d_old_cell.data<scalar_t>(),
        d_gates.data<scalar_t>(),
        grad_h.contiguous().data<scalar_t>(),
        grad_cell.contiguous().data<scalar_t>(),
        new_cell.contiguous().data<scalar_t>(),
        input_gate.contiguous().data<scalar_t>(),
        output_gate.contiguous().data<scalar_t>(),
        candidate_cell.contiguous().data<scalar_t>(),
        gate_weights.contiguous().data<scalar_t>(),
        state_size);
  }));

  auto d_weights = d_gates.t().mm(X);
  auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true);

  auto d_X = d_gates.mm(weights);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates};
}

Note that the CUDA kernels here are not compiled the way described in the previous blog post.

  • Write a ".cpp" binding file
#include <torch/torch.h>
#include <vector>
// CUDA forward declarations
std::vector<at::Tensor> lltm_cuda_forward(
    at::Tensor input,
    at::Tensor weights,
    at::Tensor bias,
    at::Tensor old_h,
    at::Tensor old_cell);
std::vector<at::Tensor> lltm_cuda_backward(
    at::Tensor grad_h,
    at::Tensor grad_cell,
    at::Tensor new_cell,
    at::Tensor input_gate,
    at::Tensor output_gate,
    at::Tensor candidate_cell,
    at::Tensor X,
    at::Tensor gate_weights,
    at::Tensor weights);
// C++ interface
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
std::vector<at::Tensor> lltm_forward(
    at::Tensor input,
    at::Tensor weights,
    at::Tensor bias,
    at::Tensor old_h,
    at::Tensor old_cell) {
  CHECK_INPUT(input);
  CHECK_INPUT(weights);
  CHECK_INPUT(bias);
  CHECK_INPUT(old_h);
  CHECK_INPUT(old_cell);
  return lltm_cuda_forward(input, weights, bias, old_h, old_cell);
}
std::vector<at::Tensor> lltm_backward(
    at::Tensor grad_h,
    at::Tensor grad_cell,
    at::Tensor new_cell,
    at::Tensor input_gate,
    at::Tensor output_gate,
    at::Tensor candidate_cell,
    at::Tensor X,
    at::Tensor gate_weights,
    at::Tensor weights) {
  CHECK_INPUT(grad_h);
  CHECK_INPUT(grad_cell);
  CHECK_INPUT(input_gate);
  CHECK_INPUT(output_gate);
  CHECK_INPUT(candidate_cell);
  CHECK_INPUT(X);
  CHECK_INPUT(gate_weights);
  CHECK_INPUT(weights);

  return lltm_cuda_backward(
      grad_h,
      grad_cell,
      new_cell,
      input_gate,
      output_gate,
      candidate_cell,
      X,
      gate_weights,
      weights);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &lltm_forward, "LLTM forward (CUDA)");
  m.def("backward", &lltm_backward, "LLTM backward (CUDA)");
}


Again, don't forget the bindings at the end that expose the interface to the outside. This file is really just about how the kernels get called: what the return statements invoke are the two functions defined in the CUDA file. Note that it does not #include the .cu file we just wrote; the forward declarations at the top are enough, and the two source files are compiled and linked together by the build step. The CHECK_INPUT macros at the top also make sure every argument is a contiguous CUDA tensor before the kernels are launched, as the sketch below shows.
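
Because of those CHECK macros, calling the extension with a CPU or non-contiguous tensor fails immediately. A small sketch of what that looks like from Python once the extension below is built (the exact error text comes from AT_ASSERTM and may vary between versions):

import torch
import lltm_cuda

cpu = torch.randn(4, 8)   # deliberately left on the CPU
try:
    lltm_cuda.forward(cpu, cpu, cpu, cpu, cpu)
except RuntimeError as e:
    print(e)   # expected to complain that "input must be a CUDA tensor"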

  • Write the build file (setup.py)
# setup.py
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='lltm_cuda',
    ext_modules=[
        CUDAExtension('lltm_cuda', [
            'lltm_cuda.cpp',
            'lltm_cuda_kernel.cu',
        ])  
    ],  
    cmdclass={
        'build_ext': BuildExtension
    })  


Running python setup.py install gives roughly the same result as before, except that, because the name has changed, the tail end now looks like this:

...
...
...
Extracting lltm_cuda-0.0.0-py3.6-linux-x86_64.egg to /home/pengkun/anaconda3/lib/python3.6/site-packages
lltm-cuda 0.0.0 is already the active version in easy-install.pth

Installed /home/pengkun/anaconda3/lib/python3.6/site-packages/lltm_cuda-0.0.0-py3.6-linux-x86_64.egg
Processing dependencies for lltm-cuda==0.0.0
Finished processing dependencies for lltm-cuda==0.0.0

Then it can be used like this:

>>> import torch
>>> import lltm_cuda
>>> lltm_cuda.forward
<built-in method forward of PyCapsule object at 0x7f669bae2b40>
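
Because of the CHECK_INPUT macros in the binding file, every argument must be a contiguous CUDA tensor. A minimal sketch with hypothetical sizes, using the same shape conventions as the C++ version (and assuming the full kernel implementations from the official tutorial were filled in):

import torch
import lltm_cuda

device = torch.device('cuda')
batch_size, input_features, state_size = 16, 32, 128   # hypothetical sizes

input = torch.randn(batch_size, input_features, device=device)
old_h = torch.randn(batch_size, state_size, device=device)
old_cell = torch.randn(batch_size, state_size, device=device)
weights = torch.randn(3 * state_size, input_features + state_size, device=device)
bias = torch.randn(3 * state_size, device=device)

new_h, new_cell = lltm_cuda.forward(input, weights, bias, old_h, old_cell)[:2]
print(new_h.device, new_h.shape)   # cuda:0 torch.Size([16, 128])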

jit

Of course, the JIT approach also works here:

>>> from torch.utils.cpp_extension import load
>>> lltmCuda = load(name='lltm_cuda', sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'])
>>> lltmCuda.forward
<built-in method forward of PyCapsule object at 0x7f63a7049f30>
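
If you want to watch what the first (slow) call is actually doing, torch.utils.cpp_extension.load accepts a verbose flag, and extra nvcc flags can be passed as well. A small sketch:

from torch.utils.cpp_extension import load

# verbose=True prints the compilation steps during the first (slow) call;
# extra_cuda_cflags forwards extra flags to nvcc
lltmCuda = load(name='lltm_cuda',
                sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'],
                extra_cuda_cflags=['-O2'],
                verbose=True)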

afterword

Next, I plan to use this approach to rewrite the earlier ROC computation.
