#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>
#include <iostream>

#include "pad.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
// PadData copies a batched NHWC float tensor into the interior of a larger,
// zero-filled NHWC output, shifting every pixel by `padding` in both
// dimensions. The launch grid maps blockIdx.x over input pixels (blockDim.x
// threads per chunk), blockIdx.y over channels, and blockIdx.z over batch.
__global__ void PadData(
  const float *in,
  int          in_widthheight,
  int          in_width,
  int          in_height,
  int          out_width,
  int          out_height,
  int          channels,
  int          padding,
  float       *out) {
  int xy = blockIdx.x * blockDim.x + threadIdx.x;

  // Excess threads from the rounded-up grid have nothing to copy. Return
  // instead of writing a zero at the unpadded (x, y) offset: that write is
  // redundant (the output is already zero-filled by the caller) and can race
  // with a valid thread writing its value to the same interior location.
  if (xy >= in_widthheight) {
    return;
  }

  int x  = xy % in_width;
  int y  = xy / in_width;
  int ch = blockIdx.y;
  int n  = blockIdx.z;

  float value = in[((n * in_height + y) * in_width + x) * channels + ch];

  // No shared memory is used, so no __syncthreads() barrier is needed (and a
  // barrier after a divergent early return would be undefined behavior).
  int xpad = x + padding;
  int ypad = y + padding;

  out[((n * out_height + ypad) * out_width + xpad) * channels + ch] = value;
}
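
// Worked example (illustrative numbers, not from the original file): with
// in_width = 4, in_height = 3, padding = 1 (so out_width = 6, out_height = 5),
// thread xy = 5 maps to (x, y) = (1, 1) and writes its value at
// (xpad, ypad) = (2, 2), i.e. flat offset ((n * 5 + 2) * 6 + 2) * channels + ch.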
void Pad(const GPUDevice& device,
         const float     *input,
         int              batch_size,
         int              input_height,
         int              input_width,
         int              input_channels,
         int              output_height,
         int              output_width,
         float           *output) {
  int in_widthheight    = input_width * input_height;
  int threads_per_block = 16;

  // Round up so every input pixel is covered; one grid slice per channel and
  // per batch element.
  dim3 totalBlocks((in_widthheight - 1) / threads_per_block + 1,
                   input_channels,
                   batch_size);

  // Zero-fill the whole output so the padded border needs no explicit writes.
  // Enqueue the memset on the same stream as the kernel so the two operations
  // cannot race (plain cudaMemset runs on the default stream).
  cudaMemsetAsync(output, 0,
                  batch_size * output_height * output_width * input_channels *
                  sizeof(float),
                  device.stream());

  // Symmetric padding: the same margin on all four sides.
  int padding = (output_height - input_height) / 2;

  // LAUNCH KERNEL
  PadData<<<totalBlocks, threads_per_block, 0, device.stream()>>>(
    input,
    in_widthheight,
    input_width,
    input_height,
    output_width,
    output_height,
    input_channels,
    padding,
    output);
}
}  // namespace tensorflow

#endif  // GOOGLE_CUDA
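
// ----------------------------------------------------------------------------
// Usage sketch (not part of the original file): a minimal illustration of how
// the Pad() functor above might be driven from the GPU branch of a TensorFlow
// OpKernel. The op class name and the "pad" attribute are assumptions made for
// this sketch, not the actual op this file ships with.
//
//   class PadOp : public OpKernel {
//    public:
//     explicit PadOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
//       OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad_));  // hypothetical attr
//     }
//
//     void Compute(OpKernelContext* ctx) override {
//       const Tensor& input = ctx->input(0);  // NHWC float tensor
//       const int batch    = input.dim_size(0);
//       const int height   = input.dim_size(1);
//       const int width    = input.dim_size(2);
//       const int channels = input.dim_size(3);
//
//       Tensor* output = nullptr;
//       OP_REQUIRES_OK(ctx, ctx->allocate_output(
//           0,
//           TensorShape({batch, height + 2 * pad_, width + 2 * pad_, channels}),
//           &output));
//
//       Pad(ctx->eigen_device<GPUDevice>(), input.flat<float>().data(),
//           batch, height, width, channels,
//           height + 2 * pad_, width + 2 * pad_,
//           output->flat<float>().data());
//     }
//
//    private:
//     int pad_;
//   };
// ----------------------------------------------------------------------------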