Diffstat (limited to 'Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc')
| -rw-r--r-- | Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc | 348 |
1 file changed, 348 insertions, 0 deletions
diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc
new file mode 100644
index 0000000..7a2101d
--- /dev/null
+++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc
@@ -0,0 +1,348 @@
#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "augmentation_base.h"
#include "data_augmentation.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
inline __device__ __host__ float clamp(float f, float a, float b) {
  return fmaxf(a, fminf(f, b));
}

__global__ void SpatialAugmentation(const int32 nthreads,
                                    const int src_width,
                                    const int src_height,
                                    const int channels,
                                    const int src_count,
                                    const int out_width,
                                    const int out_height,
                                    const float *src_data,
                                    float *out_data,
                                    const float *transMats) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w)
    // TF,   NHWK:  ((n * H + h) * W + w) * K + k at point (n, h, w, k)
    int c = index % channels;
    int x = (index / channels) % out_width;
    int y = (index / channels / out_width) % out_height;
    int n = index / channels / out_width / out_height;

    // Each batch element has its own row-major 2x3 affine transform mapping
    // output coordinates (x, y) to source coordinates (xpos, ypos).
    const float *transMat = transMats + n * 6;
    float xpos = x * transMat[0] + y * transMat[1] + transMat[2];
    float ypos = x * transMat[3] + y * transMat[4] + transMat[5];

    // Clamp with a small margin so the bottom-right neighbour used by the
    // bilinear interpolation below stays inside the source image.
    xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f);
    ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f);

    float tlx = floor(xpos);
    float tly = floor(ypos);

    // Bilinear interpolation
    int srcTLIdx = ((n * src_height + tly) * src_width + tlx) * channels + c;
    int srcTRIdx = min((int)(((n * src_height + tly) * src_width + (tlx + 1)) * channels + c),
                       src_count);
    int srcBLIdx = min((int)(((n * src_height + (tly + 1)) * src_width + tlx) * channels + c),
                       src_count);
    int srcBRIdx = min((int)(((n * src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + c),
                       src_count);

    float xdist = xpos - tlx;
    float ydist = ypos - tly;

    float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx]
                 + (xdist) * (ydist) * src_data[srcBRIdx]
                 + (1 - xdist) * (ydist) * src_data[srcBLIdx]
                 + (xdist) * (1 - ydist) * src_data[srcTRIdx];

    out_data[index] = dest;
  }
}

typedef Eigen::GpuDevice GPUDevice;

template<>
void Augment(OpKernelContext *context,
             const GPUDevice& d,
             const int batch_size,
             const int channels,
             const int src_width,
             const int src_height,
             const int src_count,
             const int out_width,
             const int out_height,
             const float *src_data,
             float *out_data,
             const float *transMats,
             float *chromatic_coeffs) {
  const int out_count = batch_size * out_height * out_width * channels;
  CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d);

  printf("Chromatic transform not yet implemented on GPU, ignoring.");

  SpatialAugmentation<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
    config.virtual_thread_count, src_width, src_height, channels, src_count,
    out_width, out_height,
    src_data, out_data, transMats);
}

//
// template<typename Device>
// class DataAugmentation : public OpKernel {
// public:
//   explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) {
//     // Get the crop [height, width] tensor and verify its dimensions
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_));
//     OP_REQUIRES(ctx, crop_.size() == 2,
//                 errors::InvalidArgument("crop must be 2 dimensions"));
//
//     // TODO: Verify params are all the same length
//
//     // Get the tensors for params_a and verify their dimensions
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", &params_a_name_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_rand_type", &params_a_rand_type_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", &params_a_exp_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", &params_a_mean_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", &params_a_spread_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", &params_a_prob_));
//
//     // Get the tensors for params_b and verify their dimensions
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", &params_b_name_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_rand_type", &params_b_rand_type_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", &params_b_exp_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", &params_b_mean_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", &params_b_spread_));
//     OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", &params_b_prob_));
//   }
//
//   void Compute(OpKernelContext *ctx) override {
//     const GPUDevice& device = ctx->eigen_gpu_device();
//
//     // Get the input images
//     const Tensor& input_a_t = ctx->input(0);
//     const Tensor& input_b_t = ctx->input(1);
//
//     // Dimension constants
//     const int batch_size = input_a_t.dim_size(0);
//     const int src_height = input_a_t.dim_size(1);
//     const int src_width  = input_a_t.dim_size(2);
//     const int channels   = input_a_t.dim_size(3);
//     const int src_count  = batch_size * src_height * src_width * channels;
//     const int out_height = crop_[0];
//     const int out_width  = crop_[1];
//     const int out_count  = batch_size * out_height * out_width * channels;
//
//     // Allocate the memory for the output images
//     Tensor *output_a_t;
//     Tensor *output_b_t;
//
//     OP_REQUIRES_OK(ctx, ctx->allocate_output(
//                      0, TensorShape({ batch_size, crop_[0], crop_[1], channels }), &output_a_t));
//     OP_REQUIRES_OK(ctx, ctx->allocate_output(
//                      1, TensorShape({ batch_size, crop_[0], crop_[1], channels }), &output_b_t));
//
//     // Allocate the memory for the output spatial transforms
//     Tensor *spat_transform_a_t;
//     Tensor *spat_transform_b_t;
//
//     OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({ batch_size, 6 }),
//                                              &spat_transform_a_t));
//     OP_REQUIRES_OK(ctx, ctx->allocate_output(3, TensorShape({ batch_size, 6 }),
//                                              &spat_transform_b_t));
//
//     // Allocate temporary pinned memory for the spatial transforms to be used on the GPU
//     tensorflow::AllocatorAttributes pinned_allocator;
//     pinned_allocator.set_on_host(true);
//     pinned_allocator.set_gpu_compatible(true);
//
//     Tensor spat_transform_a_pinned_t;
//     Tensor spat_transform_b_pinned_t;
//     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<float>::value,
//                                            TensorShape({ batch_size, 6 }),
//                                            &spat_transform_a_pinned_t, pinned_allocator));
//     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<float>::value,
//                                            TensorShape({ batch_size, 6 }),
//                                            &spat_transform_b_pinned_t, pinned_allocator));
//     auto spat_transform_a_pinned = spat_transform_a_pinned_t.tensor<float, 2>();
//     auto spat_transform_b_pinned = spat_transform_b_pinned_t.tensor<float, 2>();
//
//     /*** BEGIN AUGMENTATION TO IMAGE A ***/
//     auto input_a  = input_a_t.tensor<float, 4>();
//     auto output_a = output_a_t->tensor<float, 4>();
//
//     // Load augmentation parameters for image A
//     AugmentationParams aug_a = AugmentationParams(out_height, out_width,
//                                                   params_a_name_, params_a_rand_type_,
//                                                   params_a_exp_, params_a_mean_,
//                                                   params_a_spread_, params_a_prob_);
//
//     std::vector<AugmentationCoeff> coeffs_a;
//
//     bool gen_spatial_transform = aug_a.should_do_spatial_transform();
//
//     for (int n = 0; n < batch_size; n++) {
//       AugmentationCoeff coeff;
//
//       if (gen_spatial_transform) {
//         AugmentationLayerBase::generate_valid_spatial_coeffs(aug_a, coeff,
//                                                              src_width, src_height,
//                                                              out_width, out_height);
//       }
//
//       coeffs_a.push_back(coeff);
//     }
//
//     // Copy spatial coefficients A to the output Tensor on the CPU (output for
//     // FlowAugmentation)
//     auto spat_transform_a = spat_transform_a_t->tensor<float, 2>();
//     AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, out_width, out_height,
//                                                          src_width, src_height,
//                                                          spat_transform_a);
//
//     // ...as well as to a Tensor going to the GPU
//     AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, out_width, out_height,
//                                                          src_width, src_height,
//                                                          spat_transform_a_pinned);
//
//     CudaLaunchConfig config = GetCudaLaunchConfig(out_count, device);
//     SpatialAugmentation<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
//       config.virtual_thread_count, src_width, src_height, channels, src_count,
//       out_width, out_height,
//       input_a.data(), output_a.data(), spat_transform_a_pinned.data());
//
//     /*** END AUGMENTATION TO IMAGE A ***/
//
//     /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/
//     AugmentationParams aug_b = AugmentationParams(out_height, out_width,
//                                                   params_b_name_, params_b_rand_type_,
//                                                   params_b_exp_, params_b_mean_,
//                                                   params_b_spread_, params_b_prob_);
//
//     std::vector<AugmentationCoeff> coeffs_b;
//
//     gen_spatial_transform = aug_b.should_do_spatial_transform();
//
//     for (int n = 0; n < batch_size; n++) {
//       AugmentationCoeff coeff;
//
//       if (gen_spatial_transform) {
//         AugmentationLayerBase::generate_valid_spatial_coeffs(aug_b, coeff,
//                                                              src_width, src_height,
//                                                              out_width, out_height);
//       }
//
//       coeffs_b.push_back(coeff);
//     }
//
//     /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/
//
//     /*** BEGIN AUGMENTATION TO IMAGE B ***/
//     auto input_b  = input_b_t.tensor<float, 4>();
//     auto output_b = output_b_t->tensor<float, 4>();
//
//     // Copy spatial coefficients B to the output Tensor on the CPU
//     auto spat_transform_b = spat_transform_b_t->tensor<float, 2>();
//     AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, out_width, out_height,
//                                                          src_width, src_height,
//                                                          spat_transform_b, true);
//     AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, out_width, out_height,
//                                                          src_width, src_height,
//                                                          spat_transform_b_pinned);
//
//     SpatialAugmentation<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
//       config.virtual_thread_count, src_width, src_height, channels, src_count,
//       out_width, out_height,
//       input_b.data(), output_b.data(), spat_transform_b_pinned.data());
//
//     /*** END AUGMENTATION TO IMAGE B ***/
//   }
//
// private:
//   std::vector<int32> crop_;
//
//   // Params A
//   std::vector<string> params_a_name_;
//   std::vector<string> params_a_rand_type_;
//   std::vector<bool>   params_a_exp_;
//   std::vector<float>  params_a_mean_;
//   std::vector<float>  params_a_spread_;
//   std::vector<float>  params_a_prob_;
//
//   // Params B
//   std::vector<string> params_b_name_;
//   std::vector<string> params_b_rand_type_;
//   std::vector<bool>   params_b_exp_;
//   std::vector<float>  params_b_mean_;
//   std::vector<float>  params_b_spread_;
//   std::vector<float>  params_b_prob_;
// };
} // namespace tensorflow
#endif // GOOGLE_CUDA
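
For reference, the per-pixel work that SpatialAugmentation performs can be written as a small host-side routine. The sketch below is illustrative only (SpatialAugmentCPU and its signature are not part of the file above): each batch element n carries a row-major 2x3 affine transform that maps output coordinates to source coordinates, and the four nearest source pixels are blended bilinearly in NHWC layout.

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative CPU reference for the GPU kernel above (not part of the diff).
void SpatialAugmentCPU(const std::vector<float>& src,    // N*H*W*C source, NHWC
                       std::vector<float>& out,          // N*out_h*out_w*C output
                       const std::vector<float>& trans,  // N*6 affine coefficients
                       int N, int H, int W, int C, int out_h, int out_w) {
  for (int n = 0; n < N; ++n) {
    const float *t = &trans[n * 6];
    for (int y = 0; y < out_h; ++y) {
      for (int x = 0; x < out_w; ++x) {
        // Map the output pixel into the source image; the 1.05f margin mirrors
        // the kernel's clamp so (tlx + 1, tly + 1) stays in bounds.
        float xpos = std::min(std::max(x * t[0] + y * t[1] + t[2], 0.0f), W - 1.05f);
        float ypos = std::min(std::max(x * t[3] + y * t[4] + t[5], 0.0f), H - 1.05f);
        int tlx = static_cast<int>(std::floor(xpos));
        int tly = static_cast<int>(std::floor(ypos));
        float xd = xpos - tlx;
        float yd = ypos - tly;
        for (int c = 0; c < C; ++c) {
          auto at = [&](int yy, int xx) {
            return src[((n * H + yy) * W + xx) * C + c];
          };
          // Bilinear blend of the four neighbouring source pixels.
          out[((n * out_h + y) * out_w + x) * C + c] =
            (1 - xd) * (1 - yd) * at(tly, tlx) +
            xd * yd * at(tly + 1, tlx + 1) +
            (1 - xd) * yd * at(tly + 1, tlx) +
            xd * (1 - yd) * at(tly, tlx + 1);
        }
      }
    }
  }
}

In the op itself, the per-image transforms are generated on the host (see the commented-out generate_valid_spatial_coeffs and copy_spatial_coeffs_to_tensor calls above) and passed to the kernel as the flat transMats array.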
