diff options
| author | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
|---|---|---|
| committer | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| commit | fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch) | |
| tree | af7f6e759b5dec4fc2964daed09e903958b919ed /Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc | |
first commit
Diffstat (limited to 'Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc')
| -rw-r--r-- | Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc new file mode 100644 index 0000000..2007151 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc @@ -0,0 +1,130 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_warp.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define RA_TILE 32 +#define RA_ROWS 8 + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpKernel( + const float *image, + const float *flow, + float *warped, + const int batch_size, + const int channels, + const int cblocks, + const int width, + const int wblocks, + const int height, + const int width_height) { + int y = blockIdx.y; + int n = blockIdx.z; + + __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X]; + __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1]; + + int x; + int c; + + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((threadIdx.y == 0) && (x < width)) { + const int idx = ((n * height + y) * width + x) * 2; + x2_buf[threadIdx.x] = float(x) + flow[idx]; + y2_buf[threadIdx.x] = float(y) + flow[idx + 1]; + } + + __syncthreads(); + + float x2 = x2_buf[threadIdx.y]; + float y2 = y2_buf[threadIdx.y]; + + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels; + int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels; + int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels; + int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels; + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + float coeffTL = (1 - alpha) * (1 - beta); + float coeffTR = alpha * (1 - beta); + float coeffBL = (1 - alpha) * beta; + float coeffBR = alpha * beta; + + for (int cb = 0; cb < cblocks; cb++) { + __syncthreads(); + + buffer[threadIdx.y][threadIdx.x] = 0.0; + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.x; + + if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) { + buffer[threadIdx.y][threadIdx.x] = // buffer [x][c] + coeffTL * image[off_TL + c] + + coeffTR * image[off_TR + c] + + coeffBL * image[off_BL + c] + + coeffBR * image[off_BR + c]; + } + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.y; + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((c < channels) && (x < width)) { + warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y]; + } + } +} + +void FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + const int width_height = width * height; + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, FW_TILE_C); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + input.data(), + flow.data(), + output.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA |
