|
a |
|
b/gcn/csrc/cuda/GOF_cuda.cu |
|
|
1 |
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

#include <limits>
|
|
7 |
|
|
|
8 |
// TODO make it in a common file
// Grid-stride loop: each thread starts at its global index and strides by the
// total number of launched threads, so any grid size covers all n elements.
// NOTE(review): loop variable is int — callers must keep n <= INT_MAX.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)
|
|
12 |
|
|
|
13 |
|
|
|
14 |
// Expands a learned filter bank by modulating each weight element with its
// Gabor filter. One thread per output element (grid-stride).
//
// Layout contract (established by the decomposition below and by the host
// wrapper's allocation in GOF_forward_cuda):
//   weight_data:          contiguous (nOutputPlane, nInputPlane, nChannel, kH, kW)
//   gaborFilterBank_data: contiguous (nChannel, kH, kW) — one filter per orientation
//   output_data:          contiguous (nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW)
//
// Effect: output[ou*nChannel + ori][in*nChannel + c][h][w]
//           = weight[ou][in][c][h][w] * gaborFilterBank[ori][h][w]
template <typename T>
__global__ void GOFForward_cuda_kernel(const int nthreads,
                                       const T* weight_data,
                                       const T* gaborFilterBank_data,
                                       const int nOutputPlane,
                                       const int nInputPlane,
                                       const int nChannel,
                                       const int kH,
                                       const int kW,
                                       T* output_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Decompose the flat output index into its six coordinates.
    // Innermost-to-outermost: w, h, c (channel within an input group),
    // in (input plane), ori (orientation), ou (output plane).
    auto w = index % kW;
    auto h = (index / kW) % kH;
    auto c = (index / kW / kH) % nChannel;
    auto in = (index / kW / kH / nChannel) % nInputPlane;
    auto ori = (index / kW / kH / nChannel / nInputPlane) % nChannel;
    auto ou = index / kW / kH / nChannel / nInputPlane / nChannel;
    // Source weight: note ori does NOT appear here — every orientation of the
    // output reuses the same underlying weight element.
    T val = *(weight_data + (((ou * nInputPlane + in) * nChannel + c) * kH + h) * kW + w);
    T *target = output_data + index;
    // Gabor filter value for this orientation at spatial position (h, w).
    T gabortmp = *(gaborFilterBank_data + ori * (kH * kW)
                                        + h * kW
                                        + w);
    *target = val * gabortmp;
  }
}
|
|
39 |
|
|
|
40 |
// Gradient of GOFForward w.r.t. the weights. One thread per grad_weight
// element (grid-stride); each thread sums over the nChannel orientations
// that shared its weight element in the forward pass.
//
// Layout contract (matches the forward kernel / host wrappers):
//   grad_output_data:     contiguous (nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW)
//   gaborFilterBank_data: contiguous (nChannel, kH, kW)
//   grad_weight_data:     contiguous (nOutputPlane, nInputPlane, nChannel, kH, kW)
//
// Effect, with nEntry = nChannel*kH*kW and l = c*kH*kW + h*kW + w:
//   grad_weight[i][j][l] = sum_k grad_output[i*nChannel + k][j*nChannel ...][h][w]
//                                * gaborFilterBank[k][h][w]
template <typename T>
__global__ void GOFBackward_cuda_kernel(const int nthreads,
                                        const T* grad_output_data,
                                        const T* gaborFilterBank_data,
                                        const int nOutputPlane,
                                        const int nInputPlane,
                                        const int nChannel,
                                        const int kH,
                                        const int kW,
                                        T* grad_weight_data) {
  // Elements per (output plane, input plane) pair in grad_weight.
  auto nEntry = nChannel * kH * kW;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // index -> (i: output plane, j: input plane, l: channel*kH*kW offset).
    auto l = index % nEntry;
    auto j = (index / nEntry) % nInputPlane;
    auto i = index / nEntry / nInputPlane;
    T *val = grad_weight_data + index;
    // grad_weight is allocated with at::empty, so it must be zeroed before
    // accumulating.
    *val = 0;
    // Sum the contributions of all nChannel orientations that were expanded
    // from this weight element in the forward pass.
    for (int k = 0; k < nChannel; k++) {
      // l % (kW*kH) is the spatial (h*kW + w) offset inside one filter.
      T gabortmp = *(gaborFilterBank_data + k * (kW * kH)
                                          + l % (kW * kH));
      // Flat offset into grad_output for output plane (i*nChannel + k),
      // input-plane group j, inner offset l.
      T target = *(grad_output_data + i * (nChannel * nInputPlane * nEntry)
                                    + k * (nInputPlane * nEntry)
                                    + j * (nEntry)
                                    + l);
      *val = *val + target * gabortmp;
    }
  }
}
|
|
68 |
|
|
|
69 |
// Builds the Gabor-modulated (expanded) filter bank on the GPU.
//
// weight:          (nOutputPlane, nInputPlane, nChannel, kH, kW) CUDA tensor.
// gaborFilterBank: CUDA tensor read by the kernel as (nChannel, kH, kW).
//
// Returns a (nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW) tensor in
// which every weight element is multiplied by the Gabor filter of the
// corresponding orientation. See GOFForward_cuda_kernel for the exact layout.
at::Tensor GOF_forward_cuda(const at::Tensor& weight,
                            const at::Tensor& gaborFilterBank) {
  AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
  AT_ASSERTM(gaborFilterBank.type().is_cuda(), "gaborFilterBank must be a CUDA tensor");
  // The kernel walks raw data pointers, so both inputs must be contiguous;
  // a strided tensor would silently produce garbage.
  AT_ASSERTM(weight.is_contiguous(), "weight must be contiguous");
  AT_ASSERTM(gaborFilterBank.is_contiguous(), "gaborFilterBank must be contiguous");
  AT_ASSERTM(weight.dim() == 5,
             "weight must be 5D (nOutputPlane, nInputPlane, nChannel, kH, kW)");

  auto nOutputPlane = weight.size(0);
  auto nInputPlane = weight.size(1);
  auto nChannel = weight.size(2);
  auto kH = weight.size(3);
  auto kW = weight.size(4);
  // The kernel indexes the filter bank as (nChannel, kH, kW); make sure the
  // tensor actually holds that many elements.
  AT_ASSERTM(gaborFilterBank.numel() == nChannel * kH * kW,
             "gaborFilterBank must hold nChannel x kH x kW elements");

  auto output = at::empty({nOutputPlane * nChannel, nInputPlane * nChannel, kH, kW}, weight.options());
  auto output_size = nOutputPlane * nChannel * nInputPlane * nChannel * kH * kW;
  // The kernel takes the element count as int; fail loudly instead of
  // silently truncating for very large filter banks.
  AT_ASSERTM(output_size <= std::numeric_limits<int>::max(),
             "output too large for 32-bit kernel indexing");
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // 512 threads per block; cap the grid at 4096 blocks — the grid-stride
  // loop inside the kernel covers the remainder.
  dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
  dim3 block(512);

  if (output.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return output;
  }

  AT_DISPATCH_FLOATING_TYPES(weight.type(), "GOF_forward", [&] {
    GOFForward_cuda_kernel<scalar_t><<<grid, block, 0, stream>>>(
        output_size,
        weight.data<scalar_t>(),
        gaborFilterBank.data<scalar_t>(),
        nOutputPlane,
        nInputPlane,
        nChannel,
        kH,
        kW,
        output.data<scalar_t>());
  });
  // Launches are asynchronous; surface launch-configuration errors here.
  THCudaCheck(cudaGetLastError());
  return output;
}
|
|
108 |
|
|
|
109 |
// Gradient of GOF_forward_cuda w.r.t. the original (unexpanded) weights.
//
// grad_output:     (nOutputPlane*nChannel, nInputPlane*nChannel, kH, kW)
//                  CUDA tensor — gradient flowing into the expanded bank.
// gaborFilterBank: CUDA tensor read as (nChannel, kH, kW); size(0) defines
//                  the orientation count used to factor grad_output's planes.
//
// Returns grad_weight of shape (nOutputPlane, nInputPlane, nChannel, kH, kW).
at::Tensor GOF_backward_cuda(const at::Tensor& grad_output,
                             const at::Tensor& gaborFilterBank) {
  AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
  AT_ASSERTM(gaborFilterBank.type().is_cuda(), "gaborFilterBank must be a CUDA tensor");
  // The kernel walks raw data pointers, so both inputs must be contiguous.
  AT_ASSERTM(grad_output.is_contiguous(), "grad_output must be contiguous");
  AT_ASSERTM(gaborFilterBank.is_contiguous(), "gaborFilterBank must be contiguous");
  AT_ASSERTM(grad_output.dim() == 4, "grad_output must be 4D");

  auto nChannel = gaborFilterBank.size(0);
  // Guard the divisions below: an empty filter bank would divide by zero and
  // a non-divisible plane count would silently truncate.
  AT_ASSERTM(nChannel > 0, "gaborFilterBank must be non-empty");
  AT_ASSERTM(grad_output.size(0) % nChannel == 0,
             "grad_output.size(0) must be divisible by the orientation count");
  AT_ASSERTM(grad_output.size(1) % nChannel == 0,
             "grad_output.size(1) must be divisible by the orientation count");
  auto nOutputPlane = grad_output.size(0) / nChannel;
  auto nInputPlane = grad_output.size(1) / nChannel;
  auto kH = grad_output.size(2);
  auto kW = grad_output.size(3);

  auto grad_weight = at::empty({nOutputPlane, nInputPlane, nChannel, kH, kW}, grad_output.options());
  auto nEntry = nChannel * kH * kW;
  auto grad_weight_size = nOutputPlane * nInputPlane * nEntry;
  // The kernel takes the element count as int; fail loudly instead of
  // silently truncating for very large gradients.
  AT_ASSERTM(grad_weight_size <= std::numeric_limits<int>::max(),
             "grad_weight too large for 32-bit kernel indexing");
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // 512 threads per block; cap the grid at 4096 blocks — the grid-stride
  // loop inside the kernel covers the remainder.
  dim3 grid(std::min(THCCeilDiv(grad_weight_size, 512L), 4096L));
  dim3 block(512);

  if (grad_weight.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return grad_weight;
  }

  AT_DISPATCH_FLOATING_TYPES(grad_output.type(), "GOF_backward", [&] {
    GOFBackward_cuda_kernel<scalar_t><<<grid, block, 0, stream>>>(
        grad_weight_size,
        grad_output.data<scalar_t>(),
        gaborFilterBank.data<scalar_t>(),
        nOutputPlane,
        nInputPlane,
        nChannel,
        kH,
        kW,
        grad_weight.data<scalar_t>());
  });
  // Launches are asynchronous; surface launch-configuration errors here.
  THCudaCheck(cudaGetLastError());
  return grad_weight;
}