first commit

author: StevenLiuWen <liuwen@shanghaitech.edu.cn> 2018-03-13 03:28:06 -0400
committer: StevenLiuWen <liuwen@shanghaitech.edu.cn> 2018-03-13 03:28:06 -0400
commit: fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch)
tree: af7f6e759b5dec4fc2964daed09e903958b919ed /Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc
1 files changed, 130 insertions, 0 deletions
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc
new file mode 100644
index 0000000..2007151
--- /dev/null
+++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc
@@ -0,0 +1,130 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <stdio.h>
+#include <iostream>
+
+#include "flow_warp.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+#define RA_TILE 32
+#define RA_ROWS 8
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+__global__ void FlowWarpKernel(
+  const float *image,
+  const float *flow,
+  float       *warped,
+  const int    batch_size,
+  const int    channels,
+  const int    cblocks,
+  const int    width,
+  const int    wblocks,
+  const int    height,
+  const int    width_height) {
+  int y = blockIdx.y;
+  int n = blockIdx.z;
+
+  __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X];
+  __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1];
+
+  int x;
+  int c;
+
+  x = blockIdx.x * FW_TILE_X + threadIdx.x;
+
+  if ((threadIdx.y == 0) && (x < width)) {
+    const int idx = ((n * height + y) * width + x) * 2;
+    x2_buf[threadIdx.x] = float(x) + flow[idx];
+    y2_buf[threadIdx.x] = float(y) + flow[idx + 1];
+  }
+
+  __syncthreads();
+
+  float x2 = x2_buf[threadIdx.y];
+  float y2 = y2_buf[threadIdx.y];
+
+  int ix2_L = int(x2);
+  int iy2_T = int(y2);
+  int ix2_R = min(ix2_L + 1, width - 1);
+  int iy2_B = min(iy2_T + 1, height - 1);
+
+  int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels;
+  int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels;
+  int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels;
+  int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels;
+
+  float alpha   = x2 - ix2_L;
+  float beta    = y2 - iy2_T;
+  float coeffTL = (1 - alpha) * (1 - beta);
+  float coeffTR = alpha * (1 - beta);
+  float coeffBL = (1 - alpha) * beta;
+  float coeffBR = alpha * beta;
+
+  for (int cb = 0; cb < cblocks; cb++) {
+    __syncthreads();
+
+    buffer[threadIdx.y][threadIdx.x] = 0.0;
+
+    __syncthreads();
+
+    c = cb * FW_TILE_C + threadIdx.x;
+
+    if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) {
+      buffer[threadIdx.y][threadIdx.x] = // buffer [x][c]
+                                         coeffTL * image[off_TL + c] +
+                                         coeffTR * image[off_TR + c] +
+                                         coeffBL * image[off_BL + c] +
+                                         coeffBR * image[off_BR + c];
+    }
+
+    __syncthreads();
+
+    c = cb * FW_TILE_C + threadIdx.y;
+    x = blockIdx.x * FW_TILE_X + threadIdx.x;
+
+    if ((c < channels) && (x < width)) {
+      warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y];
+    }
+  }
+}
+
+void FlowWarp(const GPUDevice& device,
+              typename TTypes<float, 4>::ConstTensor input,
+              typename TTypes<float, 4>::ConstTensor flow,
+              typename TTypes<float, 4>::Tensor output) {
+  const int batch_size = input.dimension(0);
+  const int height     = input.dimension(1);
+  const int width      = input.dimension(2);
+  const int channels   = input.dimension(3);
+
+  const int width_height = width * height;
+  int  wblocks           = ((width - 1) / FW_TILE_X + 1);
+  int  cblocks           = ((channels - 1) / FW_TILE_C + 1);
+  dim3 warpThreads(FW_TILE_X, FW_TILE_C);
+  dim3 warpBlocks(wblocks, height, batch_size);
+
+  cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float));
+
+  FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > (
+    input.data(),
+    flow.data(),
+    output.data(),
+    batch_size,
+    channels,
+    cblocks,
+    width,
+    wblocks,
+    height,
+    width_height);
+}
+} // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
author	StevenLiuWen <liuwen@shanghaitech.edu.cn>	2018-03-13 03:28:06 -0400
committer	StevenLiuWen <liuwen@shanghaitech.edu.cn>	2018-03-13 03:28:06 -0400
commit	fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch)
tree	af7f6e759b5dec4fc2964daed09e903958b919ed /Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc