#if GOOGLE_CUDA #define EIGEN_USE_GPU #include #include #include "flow_augmentation.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; inline __device__ __host__ int clamp(int f, int a, int b) { return max(a, min(f, b)); } __global__ void FillFlowAugmentationKernel( const int32 nthreads, const float *flow_ptr, const float *transforms_from_a, const float *inv_transforms_from_b, const int src_total_count, const int src_height, const int src_width, const int batch_size, const int out_height, const int out_width, float *output_ptr) { CUDA_1D_KERNEL_LOOP(index, nthreads) { const float x = (float)(index % out_width); const float y = (float)((index / out_width) % out_height); const int n = (index / out_width / out_height); const int transformIdx = n * 6; // Apply transformation matrix applied to second image const float xpos1 = x * transforms_from_a[transformIdx + 0] + y * transforms_from_a[transformIdx + 1] + transforms_from_a[transformIdx + 2]; const float ypos1 = x * transforms_from_a[transformIdx + 3] + y * transforms_from_a[transformIdx + 4] + transforms_from_a[transformIdx + 5]; // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) const int srcXIdx = ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * 2 + 0; const int srcYIdx = srcXIdx + 1; const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; const float ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; // Apply inverse of the transformation matrix applied to first image const float xpos3 = xpos2 * inv_transforms_from_b[transformIdx + 0] + ypos2 * inv_transforms_from_b[transformIdx + 1] + inv_transforms_from_b[transformIdx + 2]; const float ypos3 = xpos2 * inv_transforms_from_b[transformIdx + 3] + ypos2 * inv_transforms_from_b[transformIdx + 4] + inv_transforms_from_b[transformIdx + 5]; output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 0] = xpos3 - x; output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 1] = ypos3 - y; } } template<> void FillFlowAugmentation(const GPUDevice& device, typename TTypes::Tensor output, typename TTypes::ConstTensor flows, typename TTypes::ConstTensor transforms_from_a, typename TTypes::ConstTensor transforms_from_b) { const int batch_size = output.dimension(0); const int out_height = output.dimension(1); const int out_width = output.dimension(2); const int depth = 2; const int total_count = batch_size * out_height * out_width * depth; const int src_total_count = flows.dimension(0) * flows.dimension(1) * flows.dimension(2) * flows.dimension(3); CudaLaunchConfig config = GetCudaLaunchConfig(total_count / 2, device); FillFlowAugmentationKernel << < config.block_count, config.thread_per_block, 0, device.stream() >> > ( total_count / 2, flows.data(), transforms_from_a.data(), transforms_from_b.data(), src_total_count, flows.dimension(1), flows.dimension(2), batch_size, out_height, out_width, output.data()); } } // end namespace tensorflow #endif // GOOGLE_CUDA