Diff of /gcn/csrc/cuda/GOF_cuda.cu [000000] .. [f77492]

Switch to unified view

a b/gcn/csrc/cuda/GOF_cuda.cu
1
#include <ATen/ATen.h>
2
#include <ATen/cuda/CUDAContext.h>
3
4
#include <THC/THC.h>
5
#include <THC/THCAtomics.cuh>
6
#include <THC/THCDeviceUtils.cuh>
7
8
// TODO make it in a common file
// Grid-stride loop: each thread starts at its global flat index and strides
// by the total number of launched threads, so correctness does not depend on
// the grid fully covering n.
// (n) is parenthesized so expression arguments expand with correct precedence.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
12
13
14
// Forward kernel: modulate each convolution weight by the matching Gabor
// filter bank entry. One thread per output element; flat index layout is
// [ou][ori][in][c][kH][kW] with kW fastest-varying.
template <typename T>
__global__ void GOFForward_cuda_kernel(const int nthreads,
                                       const T* weight_data,
                                       const T* gaborFilterBank_data,
                                       const int nOutputPlane,
                                       const int nInputPlane,
                                       const int nChannel,
                                       const int kH,
                                       const int kW,
                                       T* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Peel off one dimension at a time, fastest-varying first.
    int rest = index;
    const int w = rest % kW;
    rest /= kW;
    const int h = rest % kH;
    rest /= kH;
    const int c = rest % nChannel;
    rest /= nChannel;
    const int in = rest % nInputPlane;
    rest /= nInputPlane;
    const int ori = rest % nChannel;
    const int ou = rest / nChannel;

    // Weight tensor is [nOutputPlane][nInputPlane][nChannel][kH][kW];
    // the filter bank is [nChannel][kH][kW], indexed by orientation.
    const int weight_offset =
        (((ou * nInputPlane + in) * nChannel + c) * kH + h) * kW + w;
    const int gabor_offset = (ori * kH + h) * kW + w;

    output_data[index] =
        weight_data[weight_offset] * gaborFilterBank_data[gabor_offset];
  }
}
39
40
// Backward kernel: for each grad_weight element, sum the Gabor-modulated
// gradients over all nChannel orientations. One thread per grad_weight
// element; flat index layout is [i][j][l] where l spans nChannel*kH*kW.
template <typename T>
__global__ void GOFBackward_cuda_kernel(const int nthreads,
                                       const T* grad_output_data,
                                       const T* gaborFilterBank_data,
                                       const int nOutputPlane,
                                       const int nInputPlane,
                                       const int nChannel,
                                       const int kH,
                                       const int kW,
                                       T* grad_weight_data) {
  auto nEntry = nChannel * kH * kW;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    auto l = index % nEntry;                       // position within one filter
    auto j = (index / nEntry) % nInputPlane;       // input plane
    auto i = index / nEntry / nInputPlane;         // output plane
    // The spatial position inside the kernel window is the same for every
    // orientation k, so hoist it out of the loop.
    auto hw = l % (kW * kH);
    // Accumulate in a register instead of a per-iteration read-modify-write
    // on global memory (original wrote through *val every iteration).
    T acc = 0;
    for (int k = 0; k < nChannel; k++) {
      T gabortmp = gaborFilterBank_data[k * (kW * kH) + hw];
      T target = grad_output_data[i * (nChannel * nInputPlane * nEntry)
                                  + k * (nInputPlane * nEntry)
                                  + j * nEntry
                                  + l];
      acc += target * gabortmp;
    }
    grad_weight_data[index] = acc;
  }
}
68
69
// Host launcher for the GOF forward pass.
// weight:          5-D CUDA tensor [nOutputPlane, nInputPlane, nChannel, kH, kW]
// gaborFilterBank: CUDA tensor read as [nChannel, kH, kW] by the kernel
// Returns a tensor of shape
// [nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW] where every weight
// element is multiplied by its orientation's filter bank entry.
at::Tensor GOF_forward_cuda(const at::Tensor& weight,
                            const at::Tensor& gaborFilterBank) {
  AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
  AT_ASSERTM(gaborFilterBank.type().is_cuda(), "gaborFilterBank must be a CUDA tensor");

  const auto nOutputPlane = weight.size(0);
  const auto nInputPlane  = weight.size(1);
  const auto nChannel     = weight.size(2);
  const auto kH           = weight.size(3);
  const auto kW           = weight.size(4);

  auto output = at::empty({nOutputPlane * nChannel, nInputPlane * nChannel, kH, kW}, weight.options());

  // Nothing to compute for an empty tensor; skip the launch entirely.
  if (output.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return output;
  }

  const auto output_size = nOutputPlane * nChannel * nInputPlane * nChannel * kH * kW;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // One thread per output element, capped at 4096 blocks; the kernel's
  // grid-stride loop covers any remainder.
  dim3 block(512);
  dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));

  AT_DISPATCH_FLOATING_TYPES(weight.type(), "GOF_forward", [&] {
    GOFForward_cuda_kernel<scalar_t><<<grid, block, 0, stream>>>(
      output_size,
      weight.data<scalar_t>(),
      gaborFilterBank.data<scalar_t>(),
      nOutputPlane,
      nInputPlane,
      nChannel,
      kH,
      kW,
      output.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());  // surface launch-configuration errors
  return output;
}
108
109
// Host launcher for the GOF backward pass.
// grad_output:     4-D CUDA tensor
//                  [nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW]
// gaborFilterBank: CUDA tensor whose size(0) gives nChannel
// Returns grad_weight of shape [nOutputPlane, nInputPlane, nChannel, kH, kW],
// each element being the filter-bank-weighted sum of gradients over all
// orientations.
at::Tensor GOF_backward_cuda(const at::Tensor& grad_output,
                             const at::Tensor& gaborFilterBank) {
  AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
  AT_ASSERTM(gaborFilterBank.type().is_cuda(), "gaborFilterBank must be a CUDA tensor");

  // Recover the original weight dimensions from the expanded grad_output.
  const auto nChannel     = gaborFilterBank.size(0);
  const auto nOutputPlane = grad_output.size(0) / nChannel;
  const auto nInputPlane  = grad_output.size(1) / nChannel;
  const auto kH           = grad_output.size(2);
  const auto kW           = grad_output.size(3);

  auto grad_weight = at::empty({nOutputPlane, nInputPlane, nChannel, kH, kW}, grad_output.options());

  // Nothing to compute for an empty tensor; skip the launch entirely.
  if (grad_weight.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return grad_weight;
  }

  const auto nEntry = nChannel * kH * kW;
  const auto grad_weight_size = nOutputPlane * nInputPlane * nEntry;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // One thread per grad_weight element, capped at 4096 blocks; the kernel's
  // grid-stride loop covers any remainder.
  dim3 block(512);
  dim3 grid(std::min(THCCeilDiv(grad_weight_size, 512L), 4096L));

  AT_DISPATCH_FLOATING_TYPES(grad_output.type(), "GOF_backward", [&] {
    GOFBackward_cuda_kernel<scalar_t><<<grid, block, 0, stream>>>(
      grad_weight_size,
      grad_output.data<scalar_t>(),
      gaborFilterBank.data<scalar_t>(),
      nOutputPlane,
      nInputPlane,
      nChannel,
      kH,
      kW,
      grad_weight.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());  // surface launch-configuration errors
  return grad_weight;
}