| field | value | date |
|---|---|---|
| author | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| committer | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| commit | fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch) | |
| tree | af7f6e759b5dec4fc2964daed09e903958b919ed /Codes/flownet2/src | |
first commit
Diffstat (limited to 'Codes/flownet2/src')
62 files changed, 5848 insertions(+), 0 deletions(-)
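One usage note before the diff: each sub-network directory added in this commit (flownet_s, flownet_c, flownet_cs, flownet_css, flownet_sd, flownet2) ships its own `test.py` and `train.py` entry points. The sketch below mirrors what `src/flownet_c/test.py` does; the image paths are hypothetical placeholders, and it assumes the custom ops under `ops/build/` have already been compiled, the checkpoint exists at the path hard-coded in `test.py`, and `Codes/flownet2` is the working directory so the `src` package resolves.

```python
# Minimal sketch of the FlowNetC test entry point, based on src/flownet_c/test.py.
from src.net import Mode                      # assumes Codes/flownet2 is the working directory
from src.flownet_c.flownet_c import FlowNetC

net = FlowNetC(mode=Mode.TEST)
net.test(
    checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0',  # checkpoint path used by test.py
    input_a_path='frame_0.png',   # hypothetical first image
    input_b_path='frame_1.png',   # hypothetical second image
    out_path='./',                # existing directory for the predicted flow
)
```

Equivalently, under the same assumptions, the script can be run as a module, e.g. `python -m src.flownet_c.test --input_a <img0> --input_b <img1> --out ./`, since the test scripts use package-relative imports.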
diff --git a/Codes/flownet2/src/__init__.py b/Codes/flownet2/src/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/__init__.py diff --git a/Codes/flownet2/src/correlation.py b/Codes/flownet2/src/correlation.py new file mode 100644 index 0000000..60a5c37 --- /dev/null +++ b/Codes/flownet2/src/correlation.py @@ -0,0 +1,35 @@ +import tensorflow as tf + +_correlation_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/correlation.so")) + + +def correlation(input_a, input_b, kernel_size, max_displacement, stride_1, stride_2, padding): + return _correlation_ops.correlation(input_a, + input_b, + kernel_size, + max_displacement, + stride_1, + stride_2, + padding) + + +@tf.RegisterGradient("Correlation") +def _correlation_grad(corr_op, gradients): + kernel_size = corr_op.get_attr("kernel_size") + max_displacement = corr_op.get_attr("max_displacement") + stride_1 = corr_op.get_attr("stride_1") + stride_2 = corr_op.get_attr("stride_2") + pad = corr_op.get_attr("pad") + + corr_grads = _correlation_ops.correlation_grad(gradients, + corr_op.inputs[0], + corr_op.inputs[1], + kernel_size, + max_displacement, + stride_1, + stride_2, + pad) + + # Return the gradients with respect to input_a and input_b + return corr_grads.backprops_a, corr_grads.backprops_b diff --git a/Codes/flownet2/src/dataloader.py b/Codes/flownet2/src/dataloader.py new file mode 100644 index 0000000..22a6ddb --- /dev/null +++ b/Codes/flownet2/src/dataloader.py @@ -0,0 +1,329 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +import copy +slim = tf.contrib.slim + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/preprocessing.so")) + + +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +class Image(slim.tfexample_decoder.ItemHandler): + """An ItemHandler that decodes a parsed Tensor as an image.""" + + def __init__(self, + image_key=None, + format_key=None, + shape=None, + channels=3, + dtype=tf.uint8, + repeated=False): + """Initializes the image. + Args: + image_key: the name of the TF-Example feature in which the encoded image + is stored. + shape: the output shape of the image as 1-D `Tensor` + [height, width, channels]. If provided, the image is reshaped + accordingly. If left as None, no reshaping is done. A shape should + be supplied only if all the stored images have the same shape. + channels: the number of channels in the image. + dtype: images will be decoded at this bit depth. Different formats + support different bit depths. + See tf.image.decode_image, + tf.decode_raw, + repeated: if False, decodes a single image. If True, decodes a + variable number of image strings from a 1D tensor of strings. + """ + if not image_key: + image_key = 'image/encoded' + + super(Image, self).__init__([image_key]) + self._image_key = image_key + self._shape = shape + self._channels = channels + self._dtype = dtype + self._repeated = repeated + + def tensors_to_item(self, keys_to_tensors): + """See base class.""" + image_buffer = keys_to_tensors[self._image_key] + + if self._repeated: + return functional_ops.map_fn(lambda x: self._decode(x), + image_buffer, dtype=self._dtype) + else: + return self._decode(image_buffer) + + def _decode(self, image_buffer): + """Decodes the image buffer. + Args: + image_buffer: The tensor representing the encoded image tensor. 
+ Returns: + A tensor that represents decoded image of self._shape, or + (?, ?, self._channels) if self._shape is not specified. + """ + def decode_raw(): + """Decodes a raw image.""" + return tf.decode_raw(image_buffer, out_type=self._dtype) + + image = decode_raw() + # image.set_shape([None, None, self._channels]) + if self._shape is not None: + image = tf.reshape(image, self._shape) + + return image + + +def __get_dataset(dataset_config, split_name): + """ + dataset_config: A dataset_config defined in datasets.py + split_name: 'train'/'validate' + """ + with tf.name_scope('__get_dataset'): + if split_name not in dataset_config['SIZES']: + raise ValueError('split name %s not recognized' % split_name) + + IMAGE_HEIGHT, IMAGE_WIDTH = dataset_config['IMAGE_HEIGHT'], dataset_config['IMAGE_WIDTH'] + reader = tf.TFRecordReader + keys_to_features = { + 'image_a': tf.FixedLenFeature((), tf.string), + 'image_b': tf.FixedLenFeature((), tf.string), + 'flow': tf.FixedLenFeature((), tf.string), + } + items_to_handlers = { + 'image_a': Image( + image_key='image_a', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'image_b': Image( + image_key='image_b', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'flow': Image( + image_key='flow', + dtype=tf.float32, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 2], + channels=2), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + return slim.dataset.Dataset( + data_sources=dataset_config['PATHS'][split_name], + reader=reader, + decoder=decoder, + num_samples=dataset_config['SIZES'][split_name], + items_to_descriptions=dataset_config['ITEMS_TO_DESCRIPTIONS']) + + +def config_to_arrays(dataset_config): + output = { + 'name': [], + 'rand_type': [], + 'exp': [], + 'mean': [], + 'spread': [], + 'prob': [], + 'coeff_schedule': [], + } + config = copy.deepcopy(dataset_config) + + if 'coeff_schedule_param' in config: + del config['coeff_schedule_param'] + + # Get all attributes + for (name, value) in config.iteritems(): + if name == 'coeff_schedule_param': + output['coeff_schedule'] = [value['half_life'], + value['initial_coeff'], + value['final_coeff']] + else: + output['name'].append(name) + output['rand_type'].append(value['rand_type']) + output['exp'].append(value['exp']) + output['mean'].append(value['mean']) + output['spread'].append(value['spread']) + output['prob'].append(value['prob']) + + return output + + +# https://github.com/tgebru/transform/blob/master/src/caffe/layers/data_augmentation_layer.cpp#L34 +def _generate_coeff(param, discount_coeff=tf.constant(1.0), default_value=tf.constant(0.0)): + if not all(name in param for name in ['rand_type', 'exp', 'mean', 'spread', 'prob']): + raise RuntimeError('Expected rand_type, exp, mean, spread, prob in `param`') + + rand_type = param['rand_type'] + exp = float(param['exp']) + mean = tf.convert_to_tensor(param['mean'], dtype=tf.float32) + spread = float(param['spread']) # AKA standard deviation + prob = float(param['prob']) + + # Multiply spread by our discount_coeff so it changes over time + spread = spread * discount_coeff + + if rand_type == 'uniform': + value = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'gaussian': + value = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'bernoulli': + if prob > 0.0: + value = 
tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + value = 0.0 + elif rand_type == 'uniform_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + elif rand_type == 'gaussian_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + else: + raise ValueError('Unknown distribution type %s.' % rand_type) + return value + + +def load_batch(dataset_config, split_name, global_step): + num_threads = 32 + reader_kwargs = {'options': tf.python_io.TFRecordOptions( + tf.python_io.TFRecordCompressionType.ZLIB)} + + with tf.name_scope('load_batch'): + dataset = __get_dataset(dataset_config, split_name) + data_provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=num_threads, + common_queue_capacity=2048, + common_queue_min=1024, + reader_kwargs=reader_kwargs) + image_a, image_b, flow = data_provider.get(['image_a', 'image_b', 'flow']) + image_a, image_b, flow = map(tf.to_float, [image_a, image_b, flow]) + + if dataset_config['PREPROCESS']['scale']: + image_a = image_a / 255.0 + image_b = image_b / 255.0 + + crop = [dataset_config['PREPROCESS']['crop_height'], + dataset_config['PREPROCESS']['crop_width']] + config_a = config_to_arrays(dataset_config['PREPROCESS']['image_a']) + config_b = config_to_arrays(dataset_config['PREPROCESS']['image_b']) + + image_as, image_bs, flows = map(lambda x: tf.expand_dims(x, 0), [image_a, image_b, flow]) + + # Perform data augmentation on GPU + with tf.device('/cpu:0'): + image_as, image_bs, transforms_from_a, transforms_from_b = \ + _preprocessing_ops.data_augmentation(image_as, + image_bs, + global_step, + crop, + config_a['name'], + config_a['rand_type'], + config_a['exp'], + config_a['mean'], + config_a['spread'], + config_a['prob'], + config_a['coeff_schedule'], + config_b['name'], + config_b['rand_type'], + config_b['exp'], + config_b['mean'], + config_b['spread'], + config_b['prob'], + config_b['coeff_schedule']) + + noise_coeff_a = None + noise_coeff_b = None + + # Generate and apply noise coeff for A if defined in A params + if 'noise' in dataset_config['PREPROCESS']['image_a']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_a']: + initial_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + + noise_coeff_a = _generate_coeff( + dataset_config['PREPROCESS']['image_a']['noise'], discount_coeff) + noise_a = tf.random_normal(shape=tf.shape(image_as), + mean=0.0, stddev=noise_coeff_a, + dtype=tf.float32) + image_as = tf.clip_by_value(image_as + noise_a, 0.0, 1.0) + + # Generate noise coeff for B 
if defined in B params + if 'noise' in dataset_config['PREPROCESS']['image_b']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_b']: + initial_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + noise_coeff_b = _generate_coeff( + dataset_config['PREPROCESS']['image_b']['noise'], discount_coeff) + + # Combine coeff from a with coeff from b + if noise_coeff_a is not None: + if noise_coeff_b is not None: + noise_coeff_b = noise_coeff_a * noise_coeff_b + else: + noise_coeff_b = noise_coeff_a + + # Add noise to B if needed + if noise_coeff_b is not None: + noise_b = tf.random_normal(shape=tf.shape(image_bs), + mean=0.0, stddev=noise_coeff_b, + dtype=tf.float32) + image_bs = tf.clip_by_value(image_bs + noise_b, 0.0, 1.0) + + # Perform flow augmentation using spatial parameters from data augmentation + flows = _preprocessing_ops.flow_augmentation( + flows, transforms_from_a, transforms_from_b, crop) + + return tf.train.batch([image_as, image_bs, flows], + enqueue_many=True, + batch_size=dataset_config['BATCH_SIZE'], + capacity=dataset_config['BATCH_SIZE'] * 4, + num_threads=num_threads, + allow_smaller_final_batch=False) diff --git a/Codes/flownet2/src/dataset_configs.py b/Codes/flownet2/src/dataset_configs.py new file mode 100644 index 0000000..fbda5d0 --- /dev/null +++ b/Codes/flownet2/src/dataset_configs.py @@ -0,0 +1,153 @@ +""" +Add dataset configurations here. Each dataset must have the following structure: + +NAME = { + IMAGE_HEIGHT: int, + IMAGE_WIDTH: int, + ITEMS_TO_DESCRIPTIONS: { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + SIZES: { + 'train': int, + 'validate': int, (optional) + ... + }, + BATCH_SIZE: int, + PATHS: { + 'train': '', + 'validate': '', (optional) + ... 
+ } +} +""" + +""" +note that one step = one batch of data processed, ~not~ an entire epoch +'coeff_schedule_param': { + 'half_life': 50000, after this many steps, the value will be i + (f - i)/2 + 'initial_coeff': 0.5, initial value + 'final_coeff': 1, final value +}, +""" + +FLYING_CHAIRS_DATASET_CONFIG = { + 'IMAGE_HEIGHT': 384, + 'IMAGE_WIDTH': 512, + 'ITEMS_TO_DESCRIPTIONS': { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + 'SIZES': { + 'train': 22232, + 'validate': 640, + 'sample': 8, + }, + 'BATCH_SIZE': 8, + 'PATHS': { + 'train': './data/tfrecords/fc_train.tfrecords', + 'validate': './data/tfrecords/fc_val.tfrecords', + 'sample': './data/tfrecords/fc_sample.tfrecords', + }, + 'PREPROCESS': { + 'scale': False, + 'crop_height': 320, + 'crop_width': 448, + 'image_a': { + 'translate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0.2, + 'spread': 0.4, + 'prob': 1.0, + }, + 'squeeze': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.3, + 'prob': 1.0, + }, + 'noise': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0.03, + 'spread': 0.03, + 'prob': 1.0, + }, + }, + # All preprocessing to image A will be applied to image B in addition to the following. + 'image_b': { + 'translate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'gamma': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'brightness': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'contrast': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'color': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'coeff_schedule_param': { + 'half_life': 50000, + 'initial_coeff': 0.5, + 'final_coeff': 1, + }, + } + }, +} diff --git a/Codes/flownet2/src/downsample.py b/Codes/flownet2/src/downsample.py new file mode 100644 index 0000000..5e6fc95 --- /dev/null +++ b/Codes/flownet2/src/downsample.py @@ -0,0 +1,8 @@ +import tensorflow as tf + +_downsample = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/downsample.so")) + + +def downsample(tensor, size): + return _downsample.downsample(tensor, size) diff --git a/Codes/flownet2/src/flow_warp.py b/Codes/flownet2/src/flow_warp.py new file mode 100644 index 0000000..fe5fd4d --- /dev/null +++ b/Codes/flownet2/src/flow_warp.py @@ -0,0 +1,15 @@ +import tensorflow as tf + +_flow_warp_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/flow_warp.so")) + + +def flow_warp(image, flow): + return _flow_warp_ops.flow_warp(image, flow) + + +@tf.RegisterGradient("FlowWarp") +def _flow_warp_grad(flow_warp_op, gradients): + return _flow_warp_ops.flow_warp_grad(flow_warp_op.inputs[0], + flow_warp_op.inputs[1], + gradients) diff --git a/Codes/flownet2/src/flowlib.py 
b/Codes/flownet2/src/flowlib.py new file mode 100644 index 0000000..36c56d4 --- /dev/null +++ b/Codes/flownet2/src/flowlib.py @@ -0,0 +1,554 @@ +#!/usr/bin/python +""" +# ============================== +# flowlib.py +# library for optical flow processing +# Author: Ruoteng Li +# Date: 6th Aug 2016 +# ============================== +""" +import png +import numpy as np +import matplotlib.colors as cl +import matplotlib.pyplot as plt +from PIL import Image +import tensorflow as tf + + +UNKNOWN_FLOW_THRESH = 1e7 +SMALLFLOW = 0.0 +LARGEFLOW = 1e8 + +""" +============= +Flow Section +============= +""" + + +def show_flow(filename): + """ + visualize optical flow map using matplotlib + :param filename: optical flow file + :return: None + """ + flow = read_flow(filename) + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + + +def visualize_flow(flow, mode='Y'): + """ + this function visualize the input flow + :param flow: input flow in array + :param mode: choose which color mode to visualize the flow (Y: Ccbcr, RGB: RGB color) + :return: None + """ + if mode == 'Y': + # Ccbcr color wheel + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + elif mode == 'RGB': + (h, w) = flow.shape[0:2] + du = flow[:, :, 0] + dv = flow[:, :, 1] + valid = flow[:, :, 2] + max_flow = max(np.max(du), np.max(dv)) + img = np.zeros((h, w, 3), dtype=np.float64) + # angle layer + img[:, :, 0] = np.arctan2(dv, du) / (2 * np.pi) + # magnitude layer, normalized to 1 + img[:, :, 1] = np.sqrt(du * du + dv * dv) * 8 / max_flow + # phase layer + img[:, :, 2] = 8 - img[:, :, 1] + # clip to [0,1] + small_idx = img[:, :, 0:3] < 0 + large_idx = img[:, :, 0:3] > 1 + img[small_idx] = 0 + img[large_idx] = 1 + # convert to rgb + img = cl.hsv_to_rgb(img) + # remove invalid point + img[:, :, 0] = img[:, :, 0] * valid + img[:, :, 1] = img[:, :, 1] * valid + img[:, :, 2] = img[:, :, 2] * valid + # show + plt.imshow(img) + plt.show() + + return None + + +def read_flow(filename): + """ + read optical flow from Middlebury .flo file + :param filename: name of the flow file + :return: optical flow data in matrix + """ + f = open(filename, 'rb') + magic = np.fromfile(f, np.float32, count=1) + data2d = None + + if 202021.25 != magic: + print('Magic number incorrect. 
Invalid .flo file') + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + print("Reading %d x %d flo file" % (h, w)) + data2d = np.fromfile(f, np.float32, count=2 * w * h) + # reshape data into 3D array (columns, rows, channels) + data2d = np.resize(data2d, (h[0], w[0], 2)) + f.close() + return data2d + + +def read_flow_png(flow_file): + """ + Read optical flow from KITTI .png file + :param flow_file: name of the flow file + :return: optical flow data in matrix + """ + flow_object = png.Reader(filename=flow_file) + flow_direct = flow_object.asDirect() + flow_data = list(flow_direct[2]) + (w, h) = flow_direct[3]['size'] + flow = np.zeros((h, w, 3), dtype=np.float64) + for i in range(len(flow_data)): + flow[i, :, 0] = flow_data[i][0::3] + flow[i, :, 1] = flow_data[i][1::3] + flow[i, :, 2] = flow_data[i][2::3] + + invalid_idx = (flow[:, :, 2] == 0) + flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0 + flow[invalid_idx, 0] = 0 + flow[invalid_idx, 1] = 0 + return flow + + +def write_flow(flow, filename): + """ + write optical flow in Middlebury .flo format + :param flow: optical flow map + :param filename: optical flow file path to be saved + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = flow.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + flow.tofile(f) + f.close() + + +def segment_flow(flow): + h = flow.shape[0] + w = flow.shape[1] + u = flow[:, :, 0] + v = flow[:, :, 1] + + idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW)) + idx2 = (abs(u) == SMALLFLOW) + class0 = (v == 0) & (u == 0) + u[idx2] = 0.00001 + tan_value = v / u + + class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0) + class2 = (tan_value >= 1) & (u >= 0) & (v >= 0) + class3 = (tan_value < -1) & (u <= 0) & (v >= 0) + class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0) + class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0) + class7 = (tan_value < -1) & (u >= 0) & (v <= 0) + class6 = (tan_value >= 1) & (u <= 0) & (v <= 0) + class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0) + + seg = np.zeros((h, w)) + + seg[class1] = 1 + seg[class2] = 2 + seg[class3] = 3 + seg[class4] = 4 + seg[class5] = 5 + seg[class6] = 6 + seg[class7] = 7 + seg[class8] = 8 + seg[class0] = 0 + seg[idx] = 0 + + return seg + + +def flow_error(tu, tv, u, v): + """ + Calculate average end point error + :param tu: ground-truth horizontal flow map + :param tv: ground-truth vertical flow map + :param u: estimated horizontal flow map + :param v: estimated vertical flow map + :return: End point error of the estimated flow + """ + smallflow = 0.0 + ''' + stu = tu[bord+1:end-bord,bord+1:end-bord] + stv = tv[bord+1:end-bord,bord+1:end-bord] + su = u[bord+1:end-bord,bord+1:end-bord] + sv = v[bord+1:end-bord,bord+1:end-bord] + ''' + stu = tu[:] + stv = tv[:] + su = u[:] + sv = v[:] + + idxUnknow = (abs(stu) > UNKNOWN_FLOW_THRESH) | (abs(stv) > UNKNOWN_FLOW_THRESH) + stu[idxUnknow] = 0 + stv[idxUnknow] = 0 + su[idxUnknow] = 0 + sv[idxUnknow] = 0 + + ind2 = [(np.absolute(stu) > smallflow) | (np.absolute(stv) > smallflow)] + index_su = su[ind2] + index_sv = sv[ind2] + an = 1.0 / np.sqrt(index_su ** 2 + index_sv ** 2 + 1) + un = index_su * an + vn = index_sv * an + + index_stu = stu[ind2] + index_stv = stv[ind2] + tn = 1.0 / np.sqrt(index_stu ** 2 + index_stv ** 2 + 1) + tun = index_stu * tn + tvn = index_stv * tn + + ''' + 
angle = un * tun + vn * tvn + (an * tn) + index = [angle == 1.0] + angle[index] = 0.999 + ang = np.arccos(angle) + mang = np.mean(ang) + mang = mang * 180 / np.pi + ''' + + epe = np.sqrt((stu - su) ** 2 + (stv - sv) ** 2) + epe = epe[ind2] + mepe = np.mean(epe) + return mepe + + +def flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, 0] + v = flow[:, :, 1] + + maxu = -999. + maxv = -999. + minu = 999. + minv = 999. + + idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) + u[idxUnknow] = 0 + v[idxUnknow] = 0 + + maxu = max(maxu, np.max(u)) + minu = min(minu, np.min(u)) + + maxv = max(maxv, np.max(v)) + minv = min(minv, np.min(v)) + + rad = np.sqrt(u ** 2 + v ** 2) + maxrad = max(-1, np.max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. %.3f" % (maxrad, minu,maxu, minv, maxv)) + + u = u/(maxrad + np.finfo(float).eps) + v = v/(maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + img[idx] = 0 + + return np.uint8(img) + + +def tf_flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, :, 0] + v = flow[:, :, :, 1] + + maxu = tf.constant(-999.) + maxv = tf.constant(-999.) + minu = tf.constant(999.) + minv = tf.constant(999.) + + zeros = tf.zeros_like(u, dtype=tf.float32) + u = tf.where(tf.greater(u, UNKNOWN_FLOW_THRESH), zeros, u) + v = tf.where(tf.greater(v, UNKNOWN_FLOW_THRESH), zeros, v) + + rad = tf.sqrt(u ** 2 + v ** 2) + maxrad = tf.reduce_max(-1, tf.reduce_max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. 
%.3f" % (maxrad, minu, maxu, minv, maxv)) + + u = u / (maxrad + np.finfo(float).eps) + v = v / (maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + # idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + # img[idx] = 0 + + return np.uint8(img) + + +def evaluate_flow_file(gt, pred): + """ + evaluate the estimated optical flow end point error according to ground truth provided + :param gt: ground truth file path + :param pred: estimated optical flow file path + :return: end point error, float32 + """ + # Read flow files and calculate the errors + gt_flow = read_flow(gt) # ground truth flow + eva_flow = read_flow(pred) # predicted flow + # Calculate errors + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], eva_flow[:, :, 0], eva_flow[:, :, 1]) + return average_pe + + +def evaluate_flow(gt_flow, pred_flow): + """ + gt: ground-truth flow + pred: estimated flow + """ + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], pred_flow[:, :, 0], pred_flow[:, :, 1]) + return average_pe + + +""" +============== +Disparity Section +============== +""" + + +def read_disp_png(file_name): + """ + Read optical flow from KITTI .png file + :param file_name: name of the flow file + :return: optical flow data in matrix + """ + image_object = png.Reader(filename=file_name) + image_direct = image_object.asDirect() + image_data = list(image_direct[2]) + (w, h) = image_direct[3]['size'] + channel = len(image_data[0]) / w + flow = np.zeros((h, w, channel), dtype=np.uint16) + for i in range(len(image_data)): + for j in range(channel): + flow[i, :, j] = image_data[i][j::channel] + return flow[:, :, 0] / 256 + + +def disp_to_flowfile(disp, filename): + """ + Read KITTI disparity file in png format + :param disp: disparity matrix + :param filename: the flow file name to save + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = disp.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + empty_map = np.zeros((height, width), dtype=np.float32) + data = np.dstack((disp, empty_map)) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + data.tofile(f) + f.close() + + +""" +============== +Image Section +============== +""" + + +def read_image(filename): + """ + Read normal image of any format + :param filename: name of the image file + :return: image data in matrix uint8 type + """ + img = Image.open(filename) + im = np.array(img) + return im + + +def warp_image(im, flow): + """ + Use optical flow to warp image to the next + :param im: image to warp + :param flow: optical flow + :return: warped image + """ + from scipy import interpolate + image_height = im.shape[0] + image_width = im.shape[1] + flow_height = flow.shape[0] + flow_width = flow.shape[1] + n = image_height * image_width + (iy, ix) = np.mgrid[0:image_height, 0:image_width] + (fy, fx) = np.mgrid[0:flow_height, 0:flow_width] + fx += flow[:,:,0] + fy += flow[:,:,1] + mask = np.logical_or(fx <0 , fx > flow_width) + mask = np.logical_or(mask, fy < 0) + mask = np.logical_or(mask, fy > flow_height) + fx = np.minimum(np.maximum(fx, 0), flow_width) + fy = np.minimum(np.maximum(fy, 0), flow_height) + points = np.concatenate((ix.reshape(n,1), iy.reshape(n,1)), axis=1) + xi = np.concatenate((fx.reshape(n, 1), fy.reshape(n,1)), axis=1) + warp = np.zeros((image_height, image_width, im.shape[2])) + for i in range(im.shape[2]): + channel = im[:, :, i] + plt.imshow(channel, cmap='gray') + values = channel.reshape(n, 1) + new_channel = 
interpolate.griddata(points, values, xi, method='cubic') + new_channel = np.reshape(new_channel, [flow_height, flow_width]) + new_channel[mask] = 1 + warp[:, :, i] = new_channel.astype(np.uint8) + + return warp.astype(np.uint8) + + +""" +============== +Others +============== +""" + + +def scale_image(image, new_range): + """ + Linearly scale the image into desired range + :param image: input image + :param new_range: the new range to be aligned + :return: image normalized in new range + """ + min_val = np.min(image).astype(np.float32) + max_val = np.max(image).astype(np.float32) + min_val_new = np.array(min(new_range), dtype=np.float32) + max_val_new = np.array(max(new_range), dtype=np.float32) + scaled_image = (image - min_val) / (max_val - min_val) * (max_val_new - min_val_new) + min_val_new + return scaled_image.astype(np.uint8) + + +def compute_color(u, v): + """ + compute optical flow color map + :param u: optical flow horizontal map + :param v: optical flow vertical map + :return: optical flow in color code + """ + [h, w] = u.shape + img = np.zeros([h, w, 3]) + nanIdx = np.isnan(u) | np.isnan(v) + u[nanIdx] = 0 + v[nanIdx] = 0 + + colorwheel = make_color_wheel() + # ncols = np.size(colorwheel, 0) + ncols = colorwheel.shape[0] + + rad = np.sqrt(u**2+v**2) + + a = np.arctan2(-v, -u) / np.pi + + fk = (a+1) / 2 * (ncols - 1) + 1 + + k0 = np.floor(fk).astype(int) + + k1 = k0 + 1 + k1[k1 == ncols+1] = 1 + f = fk - k0 + + for i in range(0, np.size(colorwheel, 1)): + tmp = colorwheel[:, i] + col0 = tmp[k0-1] / 255 + col1 = tmp[k1-1] / 255 + col = (1-f) * col0 + f * col1 + + idx = rad <= 1 + col[idx] = 1-rad[idx]*(1-col[idx]) + notidx = np.logical_not(idx) + + col[notidx] *= 0.75 + img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) + + return img + + +def make_color_wheel(): + """ + Generate color wheel according Middlebury color code + :return: Color wheel + """ + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + + colorwheel = np.zeros([ncols, 3]) + + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) + col += RY + + # YG + colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) + colorwheel[col:col+YG, 1] = 255 + col += YG + + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) + col += GC + + # CB + colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) + colorwheel[col:col+CB, 2] = 255 + col += CB + + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) + col += + BM + + # MR + colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) + colorwheel[col:col+MR, 0] = 255 + + return colorwheel diff --git a/Codes/flownet2/src/flownet2/__init__.py b/Codes/flownet2/src/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet2/__init__.py diff --git a/Codes/flownet2/src/flownet2/flownet2.py b/Codes/flownet2/src/flownet2/flownet2.py new file mode 100644 index 0000000..d44ed10 --- /dev/null +++ b/Codes/flownet2/src/flownet2/flownet2.py @@ -0,0 +1,118 @@ +from ..net import Net, Mode +from ..flownet_css.flownet_css import FlowNetCSS +from ..flownet_sd.flownet_sd import FlowNetSD +from ..flow_warp import flow_warp +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import 
downsample +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNet2(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_css = FlowNetCSS(mode, debug) + self.net_sd = FlowNetSD(mode, debug) + super(FlowNet2, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNet2'): + # Forward pass through FlowNetCSS and FlowNetSD with weights frozen + net_css_predictions = self.net_css.model(inputs, training_schedule, trainable=True) + net_sd_predictions = self.net_sd.model(inputs, training_schedule, trainable=True) + + def ChannelNorm(tensor): + sq = tf.square(tensor) + r_sum = tf.reduce_sum(sq, keep_dims=True, axis=3) + return tf.sqrt(r_sum) + + sd_flow_norm = ChannelNorm(net_sd_predictions['flow']) + css_flow_norm = ChannelNorm(net_css_predictions['flow']) + + flow_warp_sd = flow_warp(inputs['input_b'], net_sd_predictions['flow']) + img_diff_sd = inputs['input_a'] - flow_warp_sd + img_diff_sd_norm = ChannelNorm(img_diff_sd) + + flow_warp_css = flow_warp(inputs['input_b'], net_css_predictions['flow']) + img_diff_css = inputs['input_a'] - flow_warp_css + img_diff_css_norm = ChannelNorm(img_diff_css) + + input_to_fusion = tf.concat([inputs['input_a'], + net_sd_predictions['flow'], + net_css_predictions['flow'], + sd_flow_norm, + css_flow_norm, + img_diff_sd_norm, + img_diff_css_norm], axis=3) + + # Fusion Network + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + fuse_conv0 = slim.conv2d(pad(input_to_fusion), 64, 3, scope='fuse_conv0') + fuse_conv1 = slim.conv2d(pad(fuse_conv0), 64, 3, stride=2, scope='fuse_conv1') + fuse_conv1_1 = slim.conv2d(pad(fuse_conv1), 128, 3, scope='fuse_conv1_1') + fuse_conv2 = slim.conv2d(pad(fuse_conv1_1), 128, 3, + stride=2, scope='fuse_conv2') + fuse_conv2_1 = slim.conv2d(pad(fuse_conv2), 128, 3, scope='fuse_conv2_1') + + predict_flow2 = slim.conv2d(pad(fuse_conv2_1), 2, 3, + scope='predict_flow2', + activation_fn=None) + fuse_deconv1 = antipad(slim.conv2d_transpose(fuse_conv2_1, 32, 4, + stride=2, + scope='fuse_deconv1')) + fuse_upsample_flow2to1 = antipad(slim.conv2d_transpose(predict_flow2, 2, 4, + stride=2, + scope='fuse_upsample_flow2to1', + activation_fn=None)) + concat1 = tf.concat([fuse_conv1_1, fuse_deconv1, + fuse_upsample_flow2to1], axis=3) + fuse_interconv1 = slim.conv2d(pad(concat1), 32, 3, + activation_fn=None, scope='fuse_interconv1') + + predict_flow1 = slim.conv2d(pad(fuse_interconv1), 2, 3, + scope='predict_flow1', + activation_fn=None) + fuse_deconv0 = antipad(slim.conv2d_transpose(concat1, 16, 4, + stride=2, + scope='fuse_deconv0')) + fuse_upsample_flow1to0 = antipad(slim.conv2d_transpose(predict_flow1, 2, 4, + stride=2, + scope='fuse_upsample_flow1to0', + activation_fn=None)) + concat0 = tf.concat([fuse_conv0, fuse_deconv0, fuse_upsample_flow1to0], axis=3) + fuse_interconv0 = slim.conv2d(pad(concat0), 16, 3, + activation_fn=None, scope='fuse_interconv0') + + predict_flow0 = slim.conv2d(pad(fuse_interconv0), 2, + 3, activation_fn=None, 
scope='predict_flow0') + + flow = tf.image.resize_bilinear( + predict_flow0, tf.stack([height, width]), align_corners=True) + print(predict_flow0) + print(flow) + return { + 'predict_flow0': predict_flow0, + 'flow': flow, + } + + def loss(self, flow, predictions): + # L2 loss between predict_flow0, true flow (weighted w/ 0.005) + predict_flow0 = predictions['predict_flow0'] + size = [predict_flow0.shape[1], predict_flow0.shape[2]] + downsampled_flow0 = downsample(flow, size) + loss = average_endpoint_error(downsampled_flow0, predict_flow0) + tf.losses.add_loss(loss) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet2/test.py b/Codes/flownet2/src/flownet2/test.py new file mode 100644 index 0000000..3177614 --- /dev/null +++ b/Codes/flownet2/src/flownet2/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet2 import FlowNet2 + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNet2(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNet2/flownet-2.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet2/train.py b/Codes/flownet2/src/flownet2/train.py new file mode 100644 index 0000000..40c028d --- /dev/null +++ b/Codes/flownet2/src/flownet2/train.py @@ -0,0 +1,24 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet2 import FlowNet2 + +# Create a new network +net = FlowNet2() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_2', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CSS and SD parts of network + checkpoints={ + './checkpoints/FlowNetCSS-ft-sd/flownet-CSS-ft-sd.ckpt-0': ('FlowNet2/FlowNetCSS', 'FlowNet2'), + './checkpoints/FlowNetSD/flownet-SD.ckpt-0': ('FlowNet2/FlowNetSD', 'FlowNet2') + } +) diff --git a/Codes/flownet2/src/flownet_c/__init__.py b/Codes/flownet2/src/flownet_c/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/__init__.py diff --git a/Codes/flownet2/src/flownet_c/flownet_c.py b/Codes/flownet2/src/flownet_c/flownet_c.py new file mode 100644 index 0000000..d333ee2 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/flownet_c.py @@ -0,0 +1,167 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..correlation import correlation +from ..downsample import downsample +import math +import tensorflow as tf +slim = 
tf.contrib.slim + + +class FlowNetC(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetC, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetC'): + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_a_1 = slim.conv2d(pad(inputs['input_a'], 3), 64, 7, scope='conv1') + conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5, scope='conv2') + conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5, scope='conv3') + + conv_b_1 = slim.conv2d(pad(inputs['input_b'], 3), + 64, 7, scope='conv1', reuse=True) + conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5, scope='conv2', reuse=True) + conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5, scope='conv3', reuse=True) + + # Compute cross correlation with leaky relu activation + cc = correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20) + cc_relu = LeakyReLU(cc) + + # Combine cross correlation results with convolution of feature map A + netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir') + # Concatenate along the channels axis + net = tf.concat([netA_conv, cc_relu], axis=3) + + conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + 
activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_a_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_c/test.py b/Codes/flownet2/src/flownet_c/test.py new file mode 100644 index 0000000..692f22d --- /dev/null +++ b/Codes/flownet2/src/flownet_c/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_c import FlowNetC + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetC(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + 
required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_c/train.py b/Codes/flownet2/src/flownet_c/train.py new file mode 100644 index 0000000..9296ac7 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_c import FlowNetC + +# Create a new network +net = FlowNetC() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_c', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/flownet_cs/__init__.py b/Codes/flownet2/src/flownet_cs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/__init__.py diff --git a/Codes/flownet2/src/flownet_cs/flownet_cs.py b/Codes/flownet2/src/flownet_cs/flownet_cs.py new file mode 100644 index 0000000..aeaea47 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/flownet_cs.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_c.flownet_c import FlowNetC +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_c = FlowNetC(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCS'): + # Forward pass through FlowNetC with weights frozen + net_c_predictions = self.net_c.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_c_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_c_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git a/Codes/flownet2/src/flownet_cs/test.py b/Codes/flownet2/src/flownet_cs/test.py new file mode 100644 index 0000000..ae00ff4 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_cs import FlowNetCS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCS/flownet-CS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + 
out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_cs/train.py b/Codes/flownet2/src/flownet_cs/train.py new file mode 100644 index 0000000..9376132 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/train.py @@ -0,0 +1,21 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_cs import FlowNetCS + +# Create a new network +net = FlowNetCS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_cs', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for C part of network + checkpoints={'./checkpoints/FlowNetC/flownet-C.ckpt-0': ('FlowNetCS/FlowNetC', 'FlowNetCS')} +) diff --git a/Codes/flownet2/src/flownet_css/__init__.py b/Codes/flownet2/src/flownet_css/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/__init__.py diff --git a/Codes/flownet2/src/flownet_css/flownet_css.py b/Codes/flownet2/src/flownet_css/flownet_css.py new file mode 100644 index 0000000..93d9db2 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/flownet_css.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_cs.flownet_cs import FlowNetCS +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCSS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_cs = FlowNetCS(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCSS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCSS'): + # Forward pass through FlowNetCS with weights frozen + net_cs_predictions = self.net_cs.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_cs_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_cs_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git 
a/Codes/flownet2/src/flownet_css/test.py b/Codes/flownet2/src/flownet_css/test.py new file mode 100644 index 0000000..9d1249e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_css import FlowNetCSS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCSS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCSS/flownet-CSS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_css/train.py b/Codes/flownet2/src/flownet_css/train.py new file mode 100644 index 0000000..2964f3e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/train.py @@ -0,0 +1,22 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_css import FlowNetCSS + +# Create a new network +net = FlowNetCSS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_css', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CS part of network + checkpoints={ + './checkpoints/FlowNetCS/flownet-CS.ckpt-0': ('FlowNetCSS/FlowNetCS', 'FlowNetCSS')} +) diff --git a/Codes/flownet2/src/flownet_s/__init__.py b/Codes/flownet2/src/flownet_s/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/__init__.py diff --git a/Codes/flownet2/src/flownet_s/flownet_s.py b/Codes/flownet2/src/flownet_s/flownet_s.py new file mode 100644 index 0000000..f6704b1 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/flownet_s.py @@ -0,0 +1,161 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + stacked = False + with tf.variable_scope('FlowNetS'): + if 'warped' in inputs and 'flow' in inputs and 'brightness_error' in inputs: + stacked = True + concat_inputs = tf.concat([inputs['input_a'], + inputs['input_b'], + inputs['warped'], + inputs['flow'], + inputs['brightness_error']], axis=3) + else: + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight 
initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_1 = slim.conv2d(pad(concat_inputs, 3), 64, 7, scope='conv1') + conv_2 = slim.conv2d(pad(conv_1, 2), 128, 5, scope='conv2') + conv_3 = slim.conv2d(pad(conv_2, 2), 256, 5, scope='conv3') + + conv3_1 = slim.conv2d(pad(conv_3), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + 
predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_s/test.py b/Codes/flownet2/src/flownet_s/test.py new file mode 100644 index 0000000..ae1b2f3 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_s import FlowNetS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetS/flownet-S.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_s/train.py b/Codes/flownet2/src/flownet_s/train.py new file mode 100644 index 0000000..13a792a --- /dev/null +++ b/Codes/flownet2/src/flownet_s/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_s import FlowNetS + +# Create a new network +net = FlowNetS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_s_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff 
--git a/Codes/flownet2/src/flownet_sd/__init__.py b/Codes/flownet2/src/flownet_sd/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/__init__.py diff --git a/Codes/flownet2/src/flownet_sd/flownet_sd.py b/Codes/flownet2/src/flownet_sd/flownet_sd.py new file mode 100644 index 0000000..2f5c9e4 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/flownet_sd.py @@ -0,0 +1,160 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +# from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetSD(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetSD, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True, reuse=None): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetSD', reuse=reuse): + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + conv0 = slim.conv2d(pad(concat_inputs), 64, 3, scope='conv0') + conv1 = slim.conv2d(pad(conv0), 64, 3, stride=2, scope='conv1') + conv1_1 = slim.conv2d(pad(conv1), 128, 3, scope='conv1_1') + conv2 = slim.conv2d(pad(conv1_1), 128, 3, stride=2, scope='conv2') + conv2_1 = slim.conv2d(pad(conv2), 128, 3, scope='conv2_1') + conv3 = slim.conv2d(pad(conv2_1), 256, 3, stride=2, scope='conv3') + conv3_1 = slim.conv2d(pad(conv3), 256, 3, scope='conv3_1') + conv4 = slim.conv2d(pad(conv3_1), 512, 3, stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), 512, 3, scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), 512, 3, stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), 512, 3, scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + interconv5 = slim.conv2d(pad(concat5), 512, 3, + activation_fn=None, scope='interconv5') + + predict_flow5 = slim.conv2d(pad(interconv5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + interconv4 = slim.conv2d(pad(concat4), 256, 3, + activation_fn=None, scope='interconv4') + + predict_flow4 = slim.conv2d(pad(interconv4), 2, 3, + scope='predict_flow4', + 
activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + interconv3 = slim.conv2d(pad(concat3), 128, 3, + activation_fn=None, scope='interconv3') + + predict_flow3 = slim.conv2d(pad(interconv3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv2, deconv2, upsample_flow3to2], axis=3) + interconv2 = slim.conv2d(pad(concat2), 64, 3, + activation_fn=None, scope='interconv2') + + predict_flow2 = slim.conv2d(pad(interconv2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 0.05 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + # def loss(self, flow, predictions): + # flow = flow * 20.0 + # + # losses = [] + # INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + # + # # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + # predict_flow6 = predictions['predict_flow6'] + # size = [predict_flow6.shape[1], predict_flow6.shape[2]] + # downsampled_flow6 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + # + # # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + # predict_flow5 = predictions['predict_flow5'] + # size = [predict_flow5.shape[1], predict_flow5.shape[2]] + # downsampled_flow5 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + # + # # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + # predict_flow4 = predictions['predict_flow4'] + # size = [predict_flow4.shape[1], predict_flow4.shape[2]] + # downsampled_flow4 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + # + # # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + # predict_flow3 = predictions['predict_flow3'] + # size = [predict_flow3.shape[1], predict_flow3.shape[2]] + # downsampled_flow3 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + # + # # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + # predict_flow2 = predictions['predict_flow2'] + # size = [predict_flow2.shape[1], predict_flow2.shape[2]] + # downsampled_flow2 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + # + # loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + # + # # Return the 'total' loss: loss fns + regularization terms defined in the model + # return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_sd/test.py b/Codes/flownet2/src/flownet_sd/test.py new file mode 100644 index 0000000..b2ac285 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/test.py @@ -0,0 +1,51 @@ +import 
argparse +import os +from ..net import Mode +from .flownet_sd import FlowNetSD + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetSD(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetSD/flownet-SD.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_sd/train.py b/Codes/flownet2/src/flownet_sd/train.py new file mode 100644 index 0000000..86c64e5 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_sd import FlowNetSD + +# Create a new network +net = FlowNetSD() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_sd_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/net.py b/Codes/flownet2/src/net.py new file mode 100644 index 0000000..43b2193 --- /dev/null +++ b/Codes/flownet2/src/net.py @@ -0,0 +1,177 @@ +import abc +from enum import Enum +import os +import tensorflow as tf +from .flowlib import flow_to_image, write_flow +import numpy as np +# from scipy.misc import imread, imsave, imresize +import cv2 +import uuid +from .training_schedules import LONG_SCHEDULE +slim = tf.contrib.slim + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +class Mode(Enum): + TRAIN = 1 + TEST = 2 + + +class Net(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.global_step = slim.get_or_create_global_step() + self.mode = mode + self.debug = debug + + @abc.abstractmethod + def model(self, inputs, training_schedule, trainable=True): + """ + Defines the model and returns a tuple of Tensors needed for calculating the loss. + """ + return + + @abc.abstractmethod + def loss(self, **kwargs): + """ + Accepts prediction Tensors from the output of `model`. + Returns a single Tensor representing the total loss of the model. 
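+ In the concrete networks this is the weighted sum of the multi-scale
+ endpoint-error terms plus any regularization losses collected through tf.losses.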
+ """ + return + """ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_6.png \ + --input_b /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_7.png \ + --out ./ + python -m src.flownet_sd.test --input_a 006.png --input_b 007.png --out ./ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/ped1/frames/testing/01/006.jpg \ + --input_b /home/liuwen/ssd/videogan/ped1/frames/testing/01/007.jpg \ + --out ./ + """ + def test(self, checkpoint, input_a_path, input_b_path, out_path, save_image=True, save_flo=False): + input_a = cv2.imread(input_a_path) + input_b = cv2.imread(input_b_path) + + input_a = cv2.resize(input_a, (512, 384)) + input_b = cv2.resize(input_b, (512, 384)) + print(input_a.shape, input_b.shape) + + # Convert from RGB -> BGR + # input_a = input_a[..., [2, 1, 0]] + # input_b = input_b[..., [2, 1, 0]] + + # Scale from [0, 255] -> [0.0, 1.0] if needed + if input_a.max() > 1.0: + input_a = input_a / 255.0 + if input_b.max() > 1.0: + input_b = input_b / 255.0 + + # TODO: This is a hack, we should get rid of this + training_schedule = LONG_SCHEDULE + + inputs = { + 'input_a': tf.expand_dims(tf.constant(input_a, dtype=tf.float32), 0), + 'input_b': tf.expand_dims(tf.constant(input_b, dtype=tf.float32), 0), + } + predictions = self.model(inputs, training_schedule) + pred_flow = predictions['flow'] + + saver = tf.train.Saver() + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + saver.restore(sess, checkpoint) + pred_flow = sess.run(pred_flow)[0, :, :, :] + + np.save('temporal_ped1', pred_flow) + + unique_name = 'flow-' + str(uuid.uuid4()) + if save_image: + flow_img = flow_to_image(pred_flow) + full_out_path = os.path.join(out_path, unique_name + '.png') + cv2.imwrite(full_out_path, flow_img) + + if save_flo: + full_out_path = os.path.join(out_path, unique_name + '.flo') + write_flow(pred_flow, full_out_path) + + def train(self, log_dir, training_schedule, input_a, input_b, flow, checkpoints=None): + tf.summary.image("image_a", input_a, max_outputs=2) + tf.summary.image("image_b", input_b, max_outputs=2) + + self.learning_rate = tf.train.piecewise_constant( + self.global_step, + [tf.cast(v, tf.int64) for v in training_schedule['step_values']], + training_schedule['learning_rates']) + + optimizer = tf.train.AdamOptimizer( + self.learning_rate, + training_schedule['momentum'], + training_schedule['momentum2']) + + inputs = { + 'input_a': input_a, + 'input_b': input_b, + } + predictions = self.model(inputs, training_schedule) + total_loss = self.loss(flow, predictions) + tf.summary.scalar('loss', total_loss) + + if checkpoints: + for (checkpoint_path, (scope, new_scope)) in checkpoints.iteritems(): + variables_to_restore = slim.get_variables(scope=scope) + renamed_variables = { + var.op.name.split(new_scope + '/')[1]: var + for var in variables_to_restore + } + restorer = tf.train.Saver(renamed_variables) + with tf.Session() as sess: + restorer.restore(sess, checkpoint_path) + + # Show the generated flow in TensorBoard + if 'flow' in predictions: + pred_flow_0 = predictions['flow'][0, :, :, :] + pred_flow_0 = tf.py_func(flow_to_image, [pred_flow_0], tf.uint8) + pred_flow_1 = predictions['flow'][1, :, :, :] + pred_flow_1 = tf.py_func(flow_to_image, [pred_flow_1], tf.uint8) + pred_flow_img = tf.stack([pred_flow_0, pred_flow_1], 0) + tf.summary.image('pred_flow', pred_flow_img, max_outputs=2) + + 
true_flow_0 = flow[0, :, :, :] + true_flow_0 = tf.py_func(flow_to_image, [true_flow_0], tf.uint8) + true_flow_1 = flow[1, :, :, :] + true_flow_1 = tf.py_func(flow_to_image, [true_flow_1], tf.uint8) + true_flow_img = tf.stack([true_flow_0, true_flow_1], 0) + tf.summary.image('true_flow', true_flow_img, max_outputs=2) + + train_op = slim.learning.create_train_op( + total_loss, + optimizer, + summarize_gradients=True) + + if self.debug: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf.train.start_queue_runners(sess) + slim.learning.train_step( + sess, + train_op, + self.global_step, + { + 'should_trace': tf.constant(1), + 'should_log': tf.constant(1), + 'logdir': log_dir + '/debug', + } + ) + else: + slim.learning.train( + train_op, + log_dir, + # session_config=tf.ConfigProto(allow_soft_placement=True), + global_step=self.global_step, + save_summaries_secs=60, + number_of_steps=training_schedule['max_iter'] + ) diff --git a/Codes/flownet2/src/ops/build/.gitkeep b/Codes/flownet2/src/ops/build/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/ops/build/.gitkeep diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc new file mode 100644 index 0000000..4e92f45 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc @@ -0,0 +1,160 @@ +#define EIGEN_USE_THREADS + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class CorrelationGradKernel : public OpKernel { + public: + explicit CorrelationGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and verify their dimensions + const Tensor& gradients_t = ctx->input(0); + const Tensor& input_a_t = ctx->input(1); + const Tensor& input_b_t = ctx->input(2); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input + const int batch_size = input_a_t.dim_size(0); + const int in_height = input_a_t.dim_size(1); + const int in_width = input_a_t.dim_size(2); + const int in_channels = input_a_t.dim_size(3); + const int in_count_per_sample = in_height * in_width * in_channels; + const int padded_height = in_height + 2 * pad; + const int padded_width = in_width + 2 * pad; + + // The size of unreachable border region on each side + const int kernel_radius = (kernel_size - 1) / 2; + const int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + const int out_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + const int out_width = ceil((float)(padded_width - border_size * 
2) / (float)stride_1); + + const int neighborhood_grid_radius = max_displacement / stride_2; + const int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + const int out_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the outputs + Tensor *output_a_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_a_t.shape(), &output_a_gradient_t)); + Tensor *output_b_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, input_b_t.shape(), &output_b_gradient_t)); + + // Get the tensors + auto gradients = gradients_t.tensor<float, 4>(); + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output_a_gradient = output_a_gradient_t->tensor<float, 4>(); + auto output_b_gradient = output_b_gradient_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_b.data()); + + CorrelationGradA(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_b.data(), + gradients.data(), + output_a_gradient.data()); + + CorrelationGradB(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_a.data(), + gradients.data(), + output_b_gradient.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("CorrelationGrad") + .Device(DEVICE_GPU), + CorrelationGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc new file mode 100644 index 0000000..19e3a40 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc @@ -0,0 +1,262 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define ROUND_OFF 50000 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void 
CorrelateDataBackward0(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_a_gradient, + const float *input_b, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + + // Get X,Y ranges and clamp + // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + // We add round_off before_s1 the int division and subtract round_off after + // it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + int ymin = (y - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + + // Same here: + int xmax = (x - max_displacement + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement + round_off_s1) / stride_1 - round_off; + + float sum = 0; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get input_b data: + int s2o = stride_2 * o; + int s2p = stride_2 * p; + int idx_input_b = ((item * padded_in_height + (y + s2p)) * padded_in_width + (x + s2o)) * + in_channels + k; + float input_b_tmp = input_b[idx_input_b]; // input_b[x+s2o,y+s2p,k] + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + // gradient[x,y,o,p] + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_b_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_a_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_a_gradient[input_a_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +__global__ void CorrelateDataBackward1(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_b_gradient, + const float *input_a, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + 
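+ // As in CorrelateDataBackward0, the xmin/ymin/xmax/ymax computations below add a
+ // large positive offset (ROUND_OFF * stride_1) so that truncating integer division
+ // emulates ceil() even when the numerator is negative; e.g. with stride_1 = 1 the
+ // expression reduces to xmin = x - 2*kernel_radius - max_displacement - s2o.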
+ // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + float sum = 0; + + // Height (y) + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + // Width (x) + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + int s2o = stride_2 * o; + int s2p = stride_2 * p; + + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off + // after it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride_1 + + 1 - round_off; + int ymin = (y - 2 * kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride_1 + + 1 - round_off; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + // Same here: + int xmax = (x - max_displacement - s2o + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement - s2p + round_off_s1) / stride_1 - round_off; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + // Get input_a data: + int idx_input_a = ((item * padded_in_height + (y - s2p)) * padded_in_width + (x - s2o)) * + in_channels + k; + float input_a_tmp = input_a[idx_input_a]; + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_a_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_b_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_b_gradient[input_b_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, // h * w * ch + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward0 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_a_gradient, input_b, gradient); + } +} + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const 
int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward1 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_b_gradient, input_a, gradient); + } +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc new file mode 100644 index 0000000..f8a5193 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc @@ -0,0 +1,137 @@ +#define EIGEN_USE_THREADS + +#include <utility> + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<typename Device> +class CorrelationKernel : public OpKernel { + public: + explicit CorrelationKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input (already padded) + int batch_size = input_a_t.dim_size(0); + int input_height = input_a_t.dim_size(1); + int input_width = input_a_t.dim_size(2); + int input_channels = input_a_t.dim_size(3); + int padded_height = input_height + 2 * pad; + int padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int output_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int output_width = ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + OP_REQUIRES(ctx, output_height >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input height.")); + OP_REQUIRES(ctx, output_width >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input width.")); + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = 
neighborhood_grid_radius * 2 + 1; + int output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ batch_size, output_height, output_width, output_channels }), + &output_t)); + + // Get the tensors + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_b.data()); + + // Perform cross correlation + Correlation(ctx->eigen_device<Device>(), + padded_input_a.data(), + padded_input_b.data(), + batch_size, + output_height, + output_width, + output_channels, + output_height * output_width * output_channels, + padded_height, + padded_width, + input_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + kernel_size, + stride_1, + stride_2, + output.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("Correlation") + .Device(DEVICE_GPU), + CorrelationKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc new file mode 100644 index 0000000..c63e489 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc @@ -0,0 +1,153 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void CorrelateData(int batch_size, + int out_width, + int out_height, + int out_channels, + int out_count, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + int in_width_padded, + int in_height_padded, + int in_channels, + const float *input_a, + const float *input_b, + float *output) { + extern __shared__ char patch_data_char[]; + + float *patch_data = (float *)patch_data_char; + + // First (upper left) position of kernel upper-left corner in current center + // position of neighborhood in image 1 + int x1 = blockIdx.x * stride_1 + max_displacement; + int y1 = 
blockIdx.y * stride_1 + max_displacement; + int item = blockIdx.z; + int ch_off = threadIdx.x; + + // Load 3D patch into shared shared memory + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idx1 = ((item * in_height_padded + y1 + j) * in_width_padded + x1 + i) * + in_channels + ch; + int idxPatchData = ji_off + ch; + patch_data[idxPatchData] = input_a[idx1]; + } + } + } + + __syncthreads(); + + __shared__ float sum[WARPS_PER_BLOCK * THREADS_PER_WARP]; + + // Compute correlation + for (int out_channel = 0; out_channel < out_channels; out_channel++) { + sum[ch_off] = 0; + + int s2o = (out_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int s2p = (out_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int x2 = x1 + s2o; + int y2 = y1 + s2p; + + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idxPatchData = ji_off + ch; + int idx2 = ((item * in_height_padded + y2 + j) * in_width_padded + x2 + i) * + in_channels + ch; + + sum[ch_off] += patch_data[idxPatchData] * input_b[idx2]; + } + } + } + + __syncthreads(); + + if (ch_off == 0) { + float total_sum = 0; + + for (int idx = 0; idx < WARPS_PER_BLOCK * THREADS_PER_WARP; idx++) { + total_sum += sum[idx]; + } + const int sumelems = kernel_size * kernel_size * in_channels; + const int index = (blockIdx.y * out_width + blockIdx.x) * out_channels + out_channel; + + /* from Caffe: const int index = ((out_channel * out_height + + blockIdx.y) * out_width) + blockIdx.x; */ + output[index + item * out_count] = total_sum / (float)sumelems; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + // n = 0 + // caffe: ((k * H + h) * W + w) + n * K * H * W + // tf: (h * W + w) * K + k + n * H * W * K + } + } +} + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output) { + dim3 totalBlocksCorr(out_width, out_height, batch_size); + dim3 threadsPerBlock(THREADS_PER_WARP *WARPS_PER_BLOCK); + const int shared_memory_per_block = (kernel_size * kernel_size) * in_channels; + + CorrelateData << < totalBlocksCorr, threadsPerBlock, shared_memory_per_block * sizeof(float), + device.stream() >> > ( + batch_size, out_width, out_height, out_channels, out_count, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + kernel_size, stride_1, stride_2, in_width_padded, in_height_padded, in_channels, + input_a, input_b, output); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.h b/Codes/flownet2/src/ops/correlation/correlation_kernel.h new file mode 100644 index 0000000..a1dfb62 --- /dev/null +++ 
b/Codes/flownet2/src/ops/correlation/correlation_kernel.h @@ -0,0 +1,77 @@ +#ifndef FLOWNET_CORRELATION_H_ +#define FLOWNET_CORRELATION_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output); + + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient); + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient); +} // end namespace tensorflow + +#endif // FLOWNET_CORRELATION_H_ diff --git a/Codes/flownet2/src/ops/correlation/correlation_op.cc b/Codes/flownet2/src/ops/correlation/correlation_op.cc new file mode 100644 index 0000000..4f420f0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_op.cc @@ -0,0 +1,83 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +Status SetOutput(InferenceContext *c) { + ShapeHandle input_a, input_b, input; + + // Get shapes of both inputs and verify they are rank 4 + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_b)); + + // Verify inputs are same dimensions + TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input)); + + // Get the attributes + int kernel_size, max_displacement, stride_1, stride_2, pad; + TF_RETURN_IF_ERROR(c->GetAttr("kernel_size", &kernel_size)); + TF_RETURN_IF_ERROR(c->GetAttr("max_displacement", &max_displacement)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_1", &stride_1)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_2", &stride_2)); + TF_RETURN_IF_ERROR(c->GetAttr("pad", &pad)); + + // Get dimensions of input (already padded) + int64 batch = c->Value(c->Dim(input, 0)); + int64 input_height = c->Value(c->Dim(input, 1)); + int64 input_width = c->Value(c->Dim(input, 2)); + int64 padded_height = 
input_height + 2 * pad; + int64 padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int64 output_height = (int64)ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int64 output_width = (int64)ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + // TODO: Verify output size >= 1 + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + int64 output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Set output shape + c->set_output(0, c->MakeShape({ batch, output_height, output_width, output_channels })); + return Status::OK(); +} + +REGISTER_OP("Correlation") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("output: float32") +.SetShapeFn(SetOutput); + +REGISTER_OP("CorrelationGrad") +.Input("gradients: float32") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("backprops_a: float32") +.Output("backprops_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Output gradients should be the same dimensions as the inputs + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &out)); + c->set_output(0, out); + c->set_output(1, out); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/pad.cu.cc b/Codes/flownet2/src/ops/correlation/pad.cu.cc new file mode 100644 index 0000000..0b6c93d --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.cu.cc @@ -0,0 +1,76 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "pad.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void PadData( + const float *in, + int in_widthheight, + int in_width, + int in_height, + int out_width, + int out_height, + int channels, + int padding, + float *out) { + int xy = blockIdx.x * blockDim.x + threadIdx.x; + + int x = xy % in_width; + int y = xy / in_width; + int ch = blockIdx.y; + int n = blockIdx.z; + + if (xy >= in_widthheight) { + out[((n * out_height + y) * out_width + x) * channels + ch] = 0.0; + return; + } + + float value = in[((n * in_height + y) * in_width + x) * channels + ch]; + + __syncthreads(); + + int xpad = x + padding; + int ypad = y + padding; + + out[((n * out_height + ypad) * out_width + xpad) * channels + ch] = value; +} + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output) { + int in_widthheight = input_width * input_height; + int threads_per_block = 16; + dim3 totalBlocks((in_widthheight - 1) / threads_per_block + 1, input_channels, batch_size); + + cudaMemset(output, 0, batch_size * output_height * output_width * input_channels * sizeof(float)); + + int padding = (output_height - input_height) / 2; + + // LAUNCH KERNEL + PadData << < totalBlocks, threads_per_block, 0, device.stream() >> > ( + input, + in_widthheight, + input_width, + input_height, + output_width, + 
output_height, + input_channels, + padding, + output); +} +} +#endif // if GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/pad.h b/Codes/flownet2/src/ops/correlation/pad.h new file mode 100644 index 0000000..afb4df0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.h @@ -0,0 +1,20 @@ +#ifndef FLOWNET_PAD_H_ +#define FLOWNET_PAD_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output); +} // end namespace tensorflow + +#endif // ifndef FLOWNET_PAD_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc new file mode 100644 index 0000000..eefe247 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc @@ -0,0 +1,47 @@ +#define EIGEN_USE_THREADS + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +class DownsampleKernel : public OpKernel { + public: + explicit DownsampleKernel(OpKernelConstruction* ctx) : OpKernel(ctx) { + // Get the size [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("size", &size_)); + OP_REQUIRES(ctx, size_.size() == 2, errors::InvalidArgument("size must be 2 dimensions")); + } + + void Compute(OpKernelContext* ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_t = ctx->input(0); + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + + // Allocate the memory for the output + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({input_t.dim_size(0), size_[0], size_[1], input_t.dim_size(3)}), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + Downsample(ctx->eigen_gpu_device(), input, output); + } + + private: + std::vector<int32> size_; +}; + +REGISTER_KERNEL_BUILDER(Name("Downsample") + .Device(DEVICE_GPU), + DownsampleKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.h b/Codes/flownet2/src/ops/downsample/downsample_kernel.h new file mode 100644 index 0000000..bcc4e3f --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.h @@ -0,0 +1,18 @@ +#ifndef FLOWNET_DOWNSAMPLE_H_ +#define FLOWNET_DOWNSAMPLE_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output); + +} // end namespace tensorflow + +#endif // FLOWNET_DOWNSAMPLE_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc new file mode 100644 index 0000000..b7629a0 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc @@ -0,0 +1,108 
@@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define CUDART_NAN_F __int_as_float(0x7fffffff) + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ void DownsampleKernel( + const int32 nthreads, + const float* input_ptr, + float* output_ptr, + const int in_width, + const int in_height, + const int out_width, + const int out_height, + const int channels, + const float width_scale, + const float height_scale, + const int wradius, + const int hradius) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int c = index % channels; + const int destx = (index / channels) % out_width; + const int desty = (index / channels / out_width) % out_height; + const int n = (index / channels / out_width) / out_height; + + const float srcx = ((float)destx / (float)(out_width - 1)) * (float)(in_width - 1); + const float srcy = ((float)desty / (float)(out_height - 1)) * (float)(in_height - 1); + + const int isrcx = round(srcx); + const int isrcy = round(srcy); + + float accum_value = 0; + float accum_weight = 0; + float accum_nan = 0; + + for (int dy = -hradius; dy <= hradius; dy++) { + int yoff = isrcy + dy; + // + for (int dx = -wradius; dx <= wradius; dx++) { + int xoff = isrcx + dx; + + if (xoff >= 0 && yoff >= 0 && xoff < in_width && yoff < in_height) { + int idx = ((n * in_height + yoff) * in_width + xoff) * channels + c; + float sample = input_ptr[idx]; + float weight = fmaxf(0.0f, 1.0f - (fabsf((float)xoff - srcx) / width_scale)) + * fmaxf(0.0f, 1.0f - (fabsf((float)yoff - srcy) / height_scale)); + if (sample != sample) { // isnan + accum_nan += weight; + sample = 0; + weight = 0; + } + accum_value += sample * weight; + accum_weight += weight; + } + } + } + + if (accum_nan / accum_weight > 0.5) { + output_ptr[index] = CUDART_NAN_F; + } else { + output_ptr[index] = accum_value / accum_weight; + } + } +} + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int out_channels = output.dimension(3); + const int total_count = batch_size * out_height * out_width * out_channels; + + const int in_height = input.dimension(1); + const int in_width = input.dimension(2); + + const float width_scale = (float)(in_width - 1) / (float)(out_width - 1); + const float height_scale = (float)(in_height - 1) / (float)(out_height - 1); + + const int wradius = ceil(width_scale); + const int hradius = ceil(height_scale); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, device); + DownsampleKernel<<<config.block_count, config.thread_per_block, 0, + device.stream()>>>(total_count, input.data(), output.data(), + in_width, in_height, out_width, out_height, out_channels, + width_scale, height_scale, wradius, hradius); + return device.ok(); +} + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/downsample/downsample_op.cc b/Codes/flownet2/src/ops/downsample/downsample_op.cc new file mode 100644 index 0000000..6980dc7 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_op.cc @@ -0,0 +1,30 @@ 
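+// Shape function for the Downsample op: the output keeps the input's batch and
+// channel dimensions and takes its height/width from the "size" attribute,
+// mirroring the output allocation in downsample_kernel.cc.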
+#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> size_; + c->GetAttr("size", &size_); + DimensionHandle height = c->MakeDim(size_[0]); + DimensionHandle width = c->MakeDim(size_[1]); + c->set_output(0, c->MakeShape({batch, height, width, depth})); + return Status::OK(); +} + +REGISTER_OP("Downsample") + .Input("input: float32") + .Attr("size: list(int) >= 2") + .Output("output: float32") + .SetShapeFn(SetOutputToSizedImage); + +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc new file mode 100644 index 0000000..b5d9602 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc @@ -0,0 +1,48 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpKernel : public OpKernel { + public: + explicit FlowWarpKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& input_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + input_t.dim_size(0) == flow_t.dim_size(0) && input_t.dim_size( + 1) == flow_t.dim_size(1) && input_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_t.shape(), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + FlowWarp(ctx->eigen_gpu_device(), input, flow, output); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarp") + .Device(DEVICE_GPU), + FlowWarpKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc new file mode 100644 index 0000000..2007151 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc @@ -0,0 +1,130 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_warp.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define RA_TILE 32 +#define RA_ROWS 8 + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpKernel( + const float 
*image, + const float *flow, + float *warped, + const int batch_size, + const int channels, + const int cblocks, + const int width, + const int wblocks, + const int height, + const int width_height) { + int y = blockIdx.y; + int n = blockIdx.z; + + __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X]; + __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1]; + + int x; + int c; + + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((threadIdx.y == 0) && (x < width)) { + const int idx = ((n * height + y) * width + x) * 2; + x2_buf[threadIdx.x] = float(x) + flow[idx]; + y2_buf[threadIdx.x] = float(y) + flow[idx + 1]; + } + + __syncthreads(); + + float x2 = x2_buf[threadIdx.y]; + float y2 = y2_buf[threadIdx.y]; + + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels; + int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels; + int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels; + int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels; + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + float coeffTL = (1 - alpha) * (1 - beta); + float coeffTR = alpha * (1 - beta); + float coeffBL = (1 - alpha) * beta; + float coeffBR = alpha * beta; + + for (int cb = 0; cb < cblocks; cb++) { + __syncthreads(); + + buffer[threadIdx.y][threadIdx.x] = 0.0; + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.x; + + if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) { + buffer[threadIdx.y][threadIdx.x] = // buffer [x][c] + coeffTL * image[off_TL + c] + + coeffTR * image[off_TR + c] + + coeffBL * image[off_BL + c] + + coeffBR * image[off_BR + c]; + } + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.y; + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((c < channels) && (x < width)) { + warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y]; + } + } +} + +void FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + const int width_height = width * height; + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, FW_TILE_C); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + input.data(), + flow.data(), + output.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.h b/Codes/flownet2/src/ops/flow_warp/flow_warp.h new file mode 100644 index 0000000..2780316 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.h @@ -0,0 +1,28 @@ +#ifndef FLOWNET_FLOWWARP_H_ +#define FLOWNET_FLOWWARP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +#define FW_THREADS 32 +#define FW_TILE_X FW_THREADS +#define FW_TILE_C FW_THREADS + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void 
FlowWarp(const GPUDevice& device,
+              typename TTypes<float, 4>::ConstTensor input,
+              typename TTypes<float, 4>::ConstTensor flow,
+              typename TTypes<float, 4>::Tensor output);
+
+void FlowWarpGrad(const GPUDevice& device,
+                  typename TTypes<float, 4>::ConstTensor image,
+                  typename TTypes<float, 4>::ConstTensor flow,
+                  typename TTypes<float, 4>::ConstTensor gradient,
+                  typename TTypes<float, 4>::Tensor image_grad,
+                  typename TTypes<float, 4>::Tensor flow_grad);
+} // end namespace tensorflow
+
+#endif // FLOWNET_FLOWWARP_H_
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc
new file mode 100644
index 0000000..9f3e7ea
--- /dev/null
+++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc
@@ -0,0 +1,57 @@
+#define EIGEN_USE_THREADS
+
+#include "flow_warp.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+template<typename Device>
+class FlowWarpGradKernel : public OpKernel {
+  public:
+    explicit FlowWarpGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {}
+
+    void Compute(OpKernelContext *ctx) override {
+      // Get the input image and flow and verify dimensions
+      const Tensor& image_t = ctx->input(0);
+      const Tensor& flow_t = ctx->input(1);
+      const Tensor& grad_t = ctx->input(2);
+
+      OP_REQUIRES(ctx, image_t.dims() == 4,
+                  errors::InvalidArgument("Input image must have rank 4"));
+      OP_REQUIRES(ctx, flow_t.dims() == 4,
+                  errors::InvalidArgument("Input flow must have rank 4"));
+      OP_REQUIRES(ctx,
+                  image_t.dim_size(0) == flow_t.dim_size(0) && image_t.dim_size(
+                    1) == flow_t.dim_size(1) && image_t.dim_size(2) == flow_t.dim_size(2),
+                  errors::InvalidArgument(
+                    "Input image and flow must have same N x H x W dimensions"));
+
+      // Allocate the memory for the outputs: output 0 is the image gradient,
+      // output 1 is the flow gradient
+      Tensor *image_grad_t;
+      Tensor *flow_grad_t;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, image_t.shape(), &image_grad_t));
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, flow_t.shape(), &flow_grad_t));
+
+      auto image = image_t.tensor<float, 4>();
+      auto flow = flow_t.tensor<float, 4>();
+      auto gradient = grad_t.tensor<float, 4>();
+      auto image_grad = image_grad_t->tensor<float, 4>();
+      auto flow_grad = flow_grad_t->tensor<float, 4>();
+
+      FlowWarpGrad(ctx->eigen_gpu_device(),
+                   image,
+                   flow,
+                   gradient,
+                   image_grad,
+                   flow_grad);
+    }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FlowWarpGrad")
+                        .Device(DEVICE_GPU),
+                        FlowWarpGradKernel<GPUDevice>)
+} // end namespace tensorflow
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc
new file mode 100644
index 0000000..25248c8
--- /dev/null
+++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc
@@ -0,0 +1,126 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "flow_warp.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+__global__ void FlowWarpGradKernel(
+  const float *image,
+  float *image_grad,
+  const float *flow,
+  float *flow_grad,
+  const float *gradient,
+  int batch_size,
+  int channels,
+  int cblocks,
+  int width,
+  int wblocks,
+  int height,
+  int widthheight) {
+  int x = blockIdx.x * FW_TILE_X + threadIdx.x;
+
+  if (x >= width) return;
+
+  int y = blockIdx.y;
+  int n = blockIdx.z;
+
+  const int flow_idx = ((n * height + y) * width + x) * 2;
+  float x2 = float(x) + flow[flow_idx];
+  float y2 = float(y) +
flow[flow_idx + 1]; + + if ((x2 >= 0.f) && (y2 >= 0.f) && (x2 < width) && (y2 < height)) { + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + + for (int c = 0; c < channels; c++) { + float warped_diff_value = gradient[((n * height + y) * width + x) * channels + c]; + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_R) * channels + c], + warped_diff_value * alpha * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * beta); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_R) * channels + c], + warped_diff_value * alpha * beta); + } + + float gamma = iy2_B - y2; + float bot_diff = 0; + + for (int c = 0; c < channels; c++) { + int ch_off = (n * channels + c) * height; + float temp = 0; + temp += gamma * + (image[((n * height + iy2_T) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_B) * width + ix2_L) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2] = bot_diff; + + gamma = ix2_R - x2; + bot_diff = 0; + + for (int c = 0; c < channels; c++) { + float temp = 0; + temp += gamma * + (image[((n * height + iy2_B) * width + ix2_L) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_R) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2 + 1] = bot_diff; + } +} + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad) { + const int batch_size = image.dimension(0); + const int height = image.dimension(1); + const int width = image.dimension(2); + const int channels = image.dimension(3); + const int width_height = width * height; + + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, 1); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(image_grad.data(), 0, batch_size * height * width * channels * sizeof(float)); + cudaMemset(flow_grad.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpGradKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + image.data(), + image_grad.data(), + flow.data(), + flow_grad.data(), + gradient.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc new file mode 100644 index 0000000..aef9c74 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc @@ -0,0 +1,23 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include 
"tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +REGISTER_OP("FlowWarp") +.Input("image: float32") +.Input("flow: float32") +.Output("output: float32") +.SetShapeFn(::tensorflow::shape_inference::UnchangedShape); + +REGISTER_OP("FlowWarpGrad") +.Input("image: float32") +.Input("flow: float32") +.Input("gradient: float32") +.Output("image_grad: float32") +.Output("flow_grad: float32") +.SetShapeFn([](shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc new file mode 100644 index 0000000..b93dfa6 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc @@ -0,0 +1,420 @@ +#include "augmentation_base.h" + +#include <math.h> +#include <random> + +namespace tensorflow { +/** TransMat Functions **/ +void AugmentationLayerBase::TransMat::fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height) { + leftMultiply(1, 0, -0.5 * out_width, + 0, 1, -0.5 * out_height); + + if (coeff->angle) { + leftMultiply(cos(coeff->angle()), -sin(coeff->angle()), 0, + sin(coeff->angle()), cos(coeff->angle()), 0); + } + + if (coeff->dx || coeff->dy) { + leftMultiply(1, 0, coeff->dx() * out_width, + 0, 1, coeff->dy() * out_height); + } + + if (coeff->zoom_x || coeff->zoom_y) { + leftMultiply(1.0 / coeff->zoom_x(), 0, 0, + 0, 1.0 / coeff->zoom_y(), 0); + } + + leftMultiply(1, 0, 0.5 * src_width, + 0, 1, 0.5 * src_height); +} + +void AugmentationLayerBase::TransMat::fromTensor(const float *tensor_data) { + t0 = tensor_data[0]; + t1 = tensor_data[1]; + t2 = tensor_data[2]; + t3 = tensor_data[3]; + t4 = tensor_data[4]; + t5 = tensor_data[5]; +} + +AugmentationLayerBase::TransMat AugmentationLayerBase::TransMat::inverse() { + float a = this->t0, b = this->t1, c = this->t2; + float d = this->t3, e = this->t4, f = this->t5; + + float denom = a * e - b * d; + + TransMat result; + + result.t0 = e / denom; + result.t1 = b / -denom; + result.t2 = (c * e - b * f) / -denom; + result.t3 = d / -denom; + result.t4 = a / denom; + result.t5 = (c * d - a * f) / denom; + + return result; +} + +void AugmentationLayerBase::TransMat::leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5) { + float t0 = this->t0, t1 = this->t1, t2 = this->t2; + float t3 = this->t3, t4 = this->t4, t5 = this->t5; + + this->t0 = t0 * u0 + t3 * u1; + this->t1 = t1 * u0 + t4 * u1; + this->t2 = t2 * u0 + t5 * u1 + u2; + this->t3 = t0 * u3 + t3 * u4; + this->t4 = t1 * u3 + t4 * u4; + this->t5 = t2 * u3 + t5 * u4 + u5; +} + +void AugmentationLayerBase::TransMat::toIdentity() { + t0 = 1; t1 = 0; t2 = 0; + t3 = 0; t4 = 1; t5 = 0; +} + +/** AugmentationCoeff Functions **/ +void AugmentationCoeff::clear() { + // Spatial variables + dx.clear(); + dy.clear(); + angle.clear(); + zoom_x.clear(); + zoom_y.clear(); + + // Chromatic variables + gamma.clear(); + brightness.clear(); + contrast.clear(); + color1.clear(); + color2.clear(); + color3.clear(); +} + +void AugmentationCoeff::combine_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = dx() * coeff.dx(); + } + + if (coeff.dy) { + dy = dy() * coeff.dy(); + } + + if (coeff.angle) { + angle = angle() * coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = zoom_x() * 
coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = zoom_y() * coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = brightness() * coeff.brightness(); + } + + if (coeff.contrast) { + contrast = contrast() * coeff.contrast(); + } + + if (coeff.color1) { + color1 = color1() * coeff.color1(); + } + + if (coeff.color2) { + color2 = color2() * coeff.color2(); + } + + if (coeff.color3) { + color3 = color3() * coeff.color3(); + } +} + +void AugmentationCoeff::replace_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = coeff.dx(); + } + + if (coeff.dy) { + dy = coeff.dy(); + } + + if (coeff.angle) { + angle = coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = coeff.brightness(); + } + + if (coeff.contrast) { + contrast = coeff.contrast(); + } + + if (coeff.color1) { + color1 = coeff.color1(); + } + + if (coeff.color2) { + color2 = coeff.color2(); + } + + if (coeff.color3) { + color3 = coeff.color3(); + } +} + +/** AugmentationLayerBase Functions **/ +float AugmentationLayerBase::rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value) { + std::random_device rd; // Will be used to obtain a seed for the random number + // engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + + float spread = param.spread * discount_coeff; + + if (param.rand_type == "uniform_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (param.spread > 0.0) { + std::uniform_real_distribution<> uniform(param.mean - spread, + param.mean + spread); + tmp1 = uniform(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else if (param.rand_type == "gaussian_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (spread > 0.0) { + std::normal_distribution<> normal(param.mean, spread); + tmp1 = normal(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else { + throw "Unknown random type: " + param.rand_type; + } +} + +void AugmentationLayerBase::generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.gamma) { + coeff.gamma = rng_generate(aug.gamma(), discount_coeff, coeff.gamma.get_default()); + } + + if (aug.brightness) { + coeff.brightness = + rng_generate(aug.brightness(), discount_coeff, coeff.brightness.get_default()); + } + + if (aug.contrast) { + coeff.contrast = rng_generate(aug.contrast(), discount_coeff, coeff.contrast.get_default()); + } + + if (aug.color) { + coeff.color1 = rng_generate(aug.color(), discount_coeff, coeff.color1.get_default()); + coeff.color2 = rng_generate(aug.color(), discount_coeff, coeff.color2.get_default()); + coeff.color3 = rng_generate(aug.color(), discount_coeff, coeff.color3.get_default()); + } +} + +void AugmentationLayerBase::generate_spatial_coeffs(float discount_coeff, + const 
AugmentationParams& aug,
+                                                   AugmentationCoeff & coeff) {
+  if (aug.translate) {
+    coeff.dx = rng_generate(aug.translate(), discount_coeff, coeff.dx.get_default());
+    coeff.dy = rng_generate(aug.translate(), discount_coeff, coeff.dy.get_default());
+  }
+
+  if (aug.rotate) {
+    coeff.angle = rng_generate(aug.rotate(), discount_coeff, coeff.angle.get_default());
+  }
+
+  if (aug.zoom) {
+    coeff.zoom_x = rng_generate(aug.zoom(), discount_coeff, coeff.zoom_x.get_default());
+    coeff.zoom_y = coeff.zoom_x();
+  }
+
+  if (aug.squeeze) {
+    float squeeze_coeff = rng_generate(aug.squeeze(), discount_coeff, 1.0);
+    coeff.zoom_x = coeff.zoom_x() * squeeze_coeff;
+    coeff.zoom_y = coeff.zoom_y() * squeeze_coeff;
+  }
+}
+
+void AugmentationLayerBase::generate_valid_spatial_coeffs(
+  float discount_coeff,
+  const AugmentationParams& aug,
+  AugmentationCoeff & coeff,
+  int src_width,
+  int src_height,
+  int out_width,
+  int out_height) {
+  int x, y;
+  float x1, y1, x2, y2;
+  int counter = 0;
+  int good_params = 0;
+  AugmentationCoeff incoming_coeff(coeff);
+
+  while (good_params < 4 && counter < 50) {
+    coeff.clear();
+    AugmentationLayerBase::generate_spatial_coeffs(discount_coeff, aug, coeff);
+    coeff.combine_with(incoming_coeff);
+
+    // Check if all 4 corners of the transformed image fit into the original
+    // image
+    good_params = 0;
+
+    for (x = 0; x < out_width; x += out_width - 1) {
+      for (y = 0; y < out_height; y += out_height - 1) {
+        // move the origin
+        x1 = x - 0.5 * out_width;
+        y1 = y - 0.5 * out_height;
+
+        // rotate
+        x2 = cos(coeff.angle()) * x1 - sin(coeff.angle()) * y1;
+        y2 = sin(coeff.angle()) * x1 + cos(coeff.angle()) * y1;
+
+        // translate
+        x2 = x2 + coeff.dx() * out_width;
+        y2 = y2 + coeff.dy() * out_height;
+
+        // zoom
+        x2 = x2 / coeff.zoom_x();
+        y2 = y2 / coeff.zoom_y();
+
+        // move the origin back
+        x2 = x2 + 0.5 * src_width;
+        y2 = y2 + 0.5 * src_height;
+
+        if (!((floor(x2) < 0) || (floor(x2) > src_width - 2.0) ||
+              (floor(y2) < 0) || (floor(y2) > src_height - 2.0))) {
+          good_params++;
+        }
+      }
+    }
+    counter++;
+  }
+
+  if (counter >= 50) {
+    printf("Warning: No suitable spatial transformation after %d attempts.\n", counter);
+    coeff.clear();
+    coeff.replace_with(incoming_coeff);
+  }
+}
+
+void AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(
+  const std::vector<AugmentationCoeff>& coeff_arr,
+  typename TTypes<float, 2>::Tensor& out)
+{
+  float *out_ptr = out.data();
+  int counter = 0;
+
+  for (AugmentationCoeff coeff : coeff_arr) {
+    out_ptr[counter + 0] = coeff.gamma();
+    out_ptr[counter + 1] = coeff.brightness();
+    out_ptr[counter + 2] = coeff.contrast();
+    out_ptr[counter + 3] = coeff.color1();
+    out_ptr[counter + 4] = coeff.color2();
+    out_ptr[counter + 5] = coeff.color3();
+    counter += 6;
+  }
+}
+
+void AugmentationLayerBase::copy_spatial_coeffs_to_tensor(
+  const std::vector<AugmentationCoeff>& coeff_arr,
+  const int out_width,
+  const int out_height,
+  const int src_width,
+  const int src_height,
+  typename TTypes<float, 2>::Tensor& out,
+  const bool invert)
+{
+  float *out_ptr = out.data();
+  int counter = 0;
+  TransMat t;
+
+  for (AugmentationCoeff coeff : coeff_arr) {
+    t.toIdentity();
+    t.fromCoeff(&coeff, out_width, out_height, src_width, src_height);
+
+    if (invert) {
+      t = t.inverse();
+    }
+
+    out_ptr[counter + 0] = t.t0;
+    out_ptr[counter + 1] = t.t1;
+    out_ptr[counter + 2] = t.t2;
+    out_ptr[counter + 3] = t.t3;
+    out_ptr[counter + 4] = t.t4;
+    out_ptr[counter + 5] = t.t5;
+    counter += 6;
+  }
+}
+}
diff --git 
a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h new file mode 100644 index 0000000..d2aba2c --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h @@ -0,0 +1,228 @@ +#ifndef AUGMENTATION_LAYER_BASE_H_ +#define AUGMENTATION_LAYER_BASE_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +#include <iostream> +#include <string> +#include <vector> + +namespace tensorflow { +template<typename T> +class OptionalType { + public: + OptionalType(const T default_value) : default_value(default_value), has_value(false) {} + + operator bool() const { + return has_value; + } + + OptionalType& operator=(T val) { + has_value = true; + value = val; + return *this; + } + + const T operator()() const { + return has_value ? value : default_value; + } + + void clear() { + has_value = false; + } + + const T get_default() { + return default_value; + } + + private: + T value; + bool has_value; + const T default_value; +}; + +class AugmentationCoeff { + public: + // Spatial Types + OptionalType<float>dx; + OptionalType<float>dy; + OptionalType<float>angle; + OptionalType<float>zoom_x; + OptionalType<float>zoom_y; + + // Chromatic Types + OptionalType<float>gamma; + OptionalType<float>brightness; + OptionalType<float>contrast; + OptionalType<float>color1; + OptionalType<float>color2; + OptionalType<float>color3; + + AugmentationCoeff() : dx(0.0), dy(0.0), angle(0.0), zoom_x(1.0), zoom_y(1.0), gamma(1.0), + brightness(0.0), contrast(1.0), color1(1.0), color2(1.0), color3(1.0) {} + + AugmentationCoeff(const AugmentationCoeff& coeff) : AugmentationCoeff() { + replace_with(coeff); + } + + void clear(); + + void combine_with(const AugmentationCoeff& coeff); + + void replace_with(const AugmentationCoeff& coeff); +}; + +typedef struct AugmentationParam { + std::string rand_type; + bool should_exp; + float mean; + float spread; + float prob; +} AugmentationParam; + +class AugmentationParams { + public: + int crop_height; + int crop_width; + + // Spatial options + OptionalType<struct AugmentationParam>translate; + OptionalType<struct AugmentationParam>rotate; + OptionalType<struct AugmentationParam>zoom; + OptionalType<struct AugmentationParam>squeeze; + + // Chromatic options + OptionalType<struct AugmentationParam>gamma; + OptionalType<struct AugmentationParam>brightness; + OptionalType<struct AugmentationParam>contrast; + OptionalType<struct AugmentationParam>color; + + inline AugmentationParams(int crop_height, + int crop_width, + std::vector<std::string>params_name, + std::vector<std::string>params_rand_type, + std::vector<bool> params_exp, + std::vector<float> params_mean, + std::vector<float> params_spread, + std::vector<float> params_prob) : + crop_height(crop_height), + crop_width(crop_width), + translate(AugmentationParam()), + rotate(AugmentationParam()), + zoom(AugmentationParam()), + squeeze(AugmentationParam()), + gamma(AugmentationParam()), + brightness(AugmentationParam()), + contrast(AugmentationParam()), + color(AugmentationParam()) { + for (int i = 0; i < params_name.size(); i++) { + const std::string name = params_name[i]; + const std::string rand_type = params_rand_type[i]; + const bool should_exp = params_exp[i]; + const float mean = params_mean[i]; + const float spread = params_spread[i]; + const float prob = params_prob[i]; + + struct AugmentationParam param = { rand_type, should_exp, mean, spread, prob }; + + if (name == "translate") { + this->translate = param; + } else if 
(name == "rotate") { + this->rotate = param; + } else if (name == "zoom") { + this->zoom = param; + } else if (name == "squeeze") { + this->squeeze = param; + } else if (name == "noise") { + // NoOp: We handle noise on the Python side + } else if (name == "gamma") { + this->gamma = param; + } else if (name == "brightness") { + this->brightness = param; + } else if (name == "contrast") { + this->contrast = param; + } else if (name == "color") { + this->color = param; + } else { + std::cout << "Ignoring unknown augmentation parameter: " << name << std::endl; + } + } + } + + bool should_do_spatial_transform() { + return this->translate || this->rotate || this->zoom || this->squeeze; + } + + bool should_do_chromatic_transform() { + return this->gamma || this->brightness || this->contrast || this->color; + } +}; + +class AugmentationLayerBase { + public: + class TransMat { + /** + * Translation matrix class for spatial augmentation + * | 0 1 2 | + * | 3 4 5 | + */ + + public: + float t0, t1, t2; + float t3, t4, t5; + + + void fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height); + + void fromTensor(const float *tensor_data); + + TransMat inverse(); + + void leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5); + + void toIdentity(); + }; + + // TODO: Class ChromaticCoeffs + + static float rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value); + + static void clear_spatial_coeffs(AugmentationCoeff& coeff); + static void generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_valid_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height); + + static void copy_chromatic_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out); + static void copy_spatial_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert = false); +}; +} // namespace tensorflow + +#endif // AUGMENTATION_LAYER_BASE_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc new file mode 100644 index 0000000..77b8c83 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc @@ -0,0 +1,461 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <iostream> +#include <random> +#include <vector> + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline float clamp(float f, float a, float b) { + return 
fmaxf(a, fminf(f, b)); +} + +template<> +void Augment(OpKernelContext *context, + const CPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int64 channel_count = batch_size * out_height * out_width; + const int kCostPerChannel = 10; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + + Shard(worker_threads.num_threads, + worker_threads.workers, + channel_count, + kCostPerChannel, + [batch_size, channels, src_width, + src_height, src_count, out_width, out_height, src_data, + out_data, transMats, chromatic_coeffs]( + int64 start_channel, int64 end_channel) { + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + for (int index = start_channel; index < end_channel; index++) { + int x = index % out_width; + int y = (index / out_width) % out_height; + int n = index / out_width / out_height; + + const float *transMat = transMats + n * 6; + + float gamma, brightness, contrast; + + if (chromatic_coeffs) { + gamma = chromatic_coeffs[n * 6 + 0]; + brightness = chromatic_coeffs[n * 6 + 1]; + contrast = chromatic_coeffs[n * 6 + 2]; + } + + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + int srcTLIdxOffset = ((n * src_height + (int)tly) * src_width + (int)tlx) * channels; + + // ((n * src_height + tly) * src_width + (tlx + 1)) * channels + int srcTRIdxOffset = srcTLIdxOffset + channels; + + // ((n * src_height + (tly + 1)) * src_width + tlx) * channels + int srcBLIdxOffset = srcTLIdxOffset + channels * src_width; + + // ((n * src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + int srcBRIdxOffset = srcTLIdxOffset + channels + channels * src_width; + + // Variables for chromatic transform + int data_index[3]; + float rgb[3]; + float mean_in = 0; + float mean_out = 0; + + for (int c = 0; c < channels; c++) { + // Bilinear interpolation + int srcTLIdx = srcTLIdxOffset + c; + int srcTRIdx = std::min(srcTRIdxOffset + c, src_count); + int srcBLIdx = std::min(srcBLIdxOffset + c, src_count); + int srcBRIdx = std::min(srcBRIdxOffset + c, src_count); + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + if (chromatic_coeffs) { + // Gather data for chromatic transform + data_index[c] = index * channels + c; + rgb[c] = dest; + mean_in += rgb[c]; + + // Note: coeff[3] == color1, coeff[4] == color2, ... 
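+            // Layout of chromatic_coeffs (6 floats per image n):
+            //   [gamma, brightness, contrast, color1, color2, color3]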
+ rgb[c] *= chromatic_coeffs[n * 6 + (3 + c)]; + + mean_out += rgb[c]; + } else { + out_data[index * channels + c] = dest; + } + } + + float brightness_coeff = mean_in / (mean_out + 0.01f); + + if (chromatic_coeffs) { + // Chromatic transformation + for (int c = 0; c < channels; c++) { + // compensate brightness + rgb[c] = clamp(rgb[c] * brightness_coeff, 0.0f, 1.0f); + + // gamma change + rgb[c] = pow(rgb[c], gamma); + + // brightness change + rgb[c] = rgb[c] + brightness; + + // contrast change + rgb[c] = 0.5f + (rgb[c] - 0.5f) * contrast; + + out_data[data_index[c]] = clamp(rgb[c], 0.0f, 1.0f); + } + } + } + }); +} + +template<typename Device> +class DataAugmentation : public OpKernel { + public: + explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + + // TODO: Verify params are all the same length + + // Get the tensors for params_a and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_a_rand_type", ¶ms_a_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", ¶ms_a_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_coeff_schedule", ¶ms_a_coeff_schedule_)); + + // Get the tensors for params_b and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_b_rand_type", ¶ms_b_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", ¶ms_b_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_coeff_schedule", ¶ms_b_coeff_schedule_)); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + // Get the global step value + const Tensor& global_step_t = ctx->input(2); + auto global_step_eigen = global_step_t.tensor<int64, 0>(); + const int64 global_step = global_step_eigen.data()[0]; + + // Dimension constants + const int batch_size = input_a_t.dim_size(0); + const int src_height = input_a_t.dim_size(1); + const int src_width = input_a_t.dim_size(2); + const int channels = input_a_t.dim_size(3); + const int src_count = batch_size * src_height * src_width * channels; + const int out_height = crop_[0]; + const int out_width = crop_[1]; + const int out_count = batch_size * out_height * out_width * channels; + + // All tensors for this op + Tensor chromatic_coeffs_a_t; + Tensor chromatic_coeffs_b_t; + + // Allocate the memory for the output images + Tensor *output_a_t; + Tensor *output_b_t; + + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(1, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_b_t)); + + // Allocate the memory for the output spatial transforms + Tensor *spat_transform_a_t; + Tensor *spat_transform_b_t; + + 
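+    // Outputs 2 and 3 hold one flattened 2x3 affine transform (6 floats) per
+    // image; FlowAugmentation consumes these to transform the ground-truth flow
+    // consistently with the image crops.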
OP_REQUIRES_OK(ctx, + ctx->allocate_output(2, TensorShape({ batch_size, 6 }), + &spat_transform_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(3, TensorShape({ batch_size, 6 }), + &spat_transform_b_t)); + + // Compute discount for coefficients if using a schedule + float discount_coeff_a = 1.0; + float discount_coeff_b = 1.0; + + if (params_a_coeff_schedule_.size() == 3) { + float half_life = params_a_coeff_schedule_[0]; + float initial_coeff = params_a_coeff_schedule_[1]; + float final_coeff = params_a_coeff_schedule_[2]; + discount_coeff_a = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + + if (params_b_coeff_schedule_.size() == 3) { + if (params_a_coeff_schedule_.size() == 3) { + discount_coeff_b = discount_coeff_a; + } else { + float half_life = params_b_coeff_schedule_[0]; + float initial_coeff = params_b_coeff_schedule_[1]; + float final_coeff = params_b_coeff_schedule_[2]; + discount_coeff_b = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + } + + /*** BEGIN AUGMENTATION TO IMAGE A ***/ + auto input_a = input_a_t.tensor<float, 4>(); + auto output_a = output_a_t->tensor<float, 4>(); + + // Load augmentation parameters for image A + AugmentationParams aug_a = AugmentationParams(out_height, out_width, + params_a_name_, + params_a_rand_type_, + params_a_exp_, + params_a_mean_, + params_a_spread_, + params_a_prob_); + + std::vector<AugmentationCoeff> coeffs_a; + + + bool gen_spatial_transform = aug_a.should_do_spatial_transform(); + bool gen_chromatic_transform = aug_a.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff; + + if (gen_spatial_transform) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_a, aug_a, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_a, aug_a, coeff); + } + + coeffs_a.push_back(coeff); + } + + // Copy spatial coefficients A to the output Tensor on the CPU + // (output for FlowAugmentation) + auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, + out_width, out_height, + src_width, src_height, + spat_transform_a); + + float *chromatic_coeffs_a_data = NULL; + + if (gen_chromatic_transform) { + // Allocate a temporary tensor to hold the chromatic coefficients + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_a_t)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_a = chromatic_coeffs_a_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_a, chromatic_coeffs_a); + chromatic_coeffs_a_data = chromatic_coeffs_a.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_a.data(), + output_a.data(), + spat_transform_a.data(), + chromatic_coeffs_a_data); + + /*** END AUGMENTATION TO IMAGE A ***/ + + /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + AugmentationParams aug_b = AugmentationParams(out_height, out_width, + params_b_name_, + params_b_rand_type_, + params_b_exp_, + params_b_mean_, + params_b_spread_, + params_b_prob_); + + 
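+    // Coefficients for image B start from image A's coefficients (copied via the
+    // AugmentationCoeff copy constructor below) and are combined with freshly
+    // sampled ones, so B's augmentation is applied on top of A's.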
std::vector<AugmentationCoeff> coeffs_b; + + bool gen_spatial_transform_b = aug_b.should_do_spatial_transform(); + bool gen_chromatic_transform_b = aug_b.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff(coeffs_a[n]); + + // If we did a spatial transform on image A, we need to do the same one + // (+ possibly more) on image B + if (gen_spatial_transform_b) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_b, aug_b, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform_b) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_b, aug_b, coeff); + } + + coeffs_b.push_back(coeff); + } + + /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + + /*** BEGIN AUGMENTATION TO IMAGE B ***/ + auto input_b = input_b_t.tensor<float, 4>(); + auto output_b = output_b_t->tensor<float, 4>(); + + // Copy spatial coefficients B to the output Tensor on the CPU + auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b); + + float *chromatic_coeffs_b_data = NULL; + + if (gen_chromatic_transform || gen_chromatic_transform_b) { + // Allocate a temporary tensor to hold the chromatic coefficients + tensorflow::AllocatorAttributes pinned_allocator; + pinned_allocator.set_on_host(true); + pinned_allocator.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_b_t, pinned_allocator)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_b = chromatic_coeffs_b_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_b, chromatic_coeffs_b); + chromatic_coeffs_b_data = chromatic_coeffs_b.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_b.data(), + output_b.data(), + spat_transform_b.data(), + chromatic_coeffs_b_data); + + // FlowAugmentation needs the inverse + // TODO: To avoid rewriting, can we invert when we read on the + // FlowAugmentation side? 
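+    // The call below overwrites output 3 with the inverse of each image-B
+    // transform (invert = true), which is the form FlowAugmentation reads.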
+ AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b, + true); + + /*** END AUGMENTATION TO IMAGE B ***/ + } + + private: + std::vector<int32>crop_; + + // Params A + std::vector<string>params_a_name_; + std::vector<string>params_a_rand_type_; + std::vector<bool>params_a_exp_; + std::vector<float>params_a_mean_; + std::vector<float>params_a_spread_; + std::vector<float>params_a_prob_; + std::vector<float>params_a_coeff_schedule_; + + // Params B + std::vector<string>params_b_name_; + std::vector<string>params_b_rand_type_; + std::vector<bool>params_b_exp_; + std::vector<float>params_b_mean_; + std::vector<float>params_b_spread_; + std::vector<float>params_b_prob_; + std::vector<float>params_b_coeff_schedule_; +}; + + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_CPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<CPUDevice>) + +#if GOOGLE_CUDA + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_GPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc new file mode 100644 index 0000000..7a2101d --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc @@ -0,0 +1,348 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +inline __device__ __host__ float clamp(float f, float a, float b) { + return fmaxf(a, fminf(f, b)); +} + +__global__ void SpatialAugmentation( + const int32 nthreads, + const int src_width, + const int src_height, + const int channels, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + int c = index % channels; + int x = (index / channels) % out_width; + int y = (index / channels / out_width) % out_height; + int n = index / channels / out_width / out_height; + + const float *transMat = transMats + n * 6; + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + // Bilinear interpolation + int srcTLIdx = ((n * src_height + tly) * src_width + tlx) * channels + c; + int srcTRIdx = min((int)(((n * src_height + tly) * src_width + (tlx + 1)) * channels + c), + src_count); + int srcBLIdx = min((int)(((n * src_height + (tly + 1)) * src_width + tlx) * channels + c), + src_count); + int srcBRIdx = min((int)(((n 
* src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + c), + src_count); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + out_data[index] = dest; + } +} + +typedef Eigen::GpuDevice GPUDevice; + +template<> +void Augment(OpKernelContext *context, + const GPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int out_count = batch_size * out_height * out_width * channels; + CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d); + + printf("Chromatic transform not yet implemented on GPU, ignoring."); + + SpatialAugmentation << < config.block_count, config.thread_per_block, 0, d.stream() >> > ( + config.virtual_thread_count, src_width, src_height, channels, src_count, + out_width, out_height, + src_data, out_data, transMats); +} + +// +// template<typename Device> +// class DataAugmentation : public OpKernel { +// public: +// explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { +// // Get the crop [height, width] tensor and verify its dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); +// OP_REQUIRES(ctx, crop_.size() == 2, +// errors::InvalidArgument("crop must be 2 dimensions")); +// +// // TODO: Verify params are all the same length +// +// // Get the tensors for params_a and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_a_rand_type", +// ¶ms_a_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", +// ¶ms_a_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); +// +// // Get the tensors for params_b and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_b_rand_type", +// ¶ms_b_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", +// ¶ms_b_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); +// } +// +// void Compute(OpKernelContext *ctx) override { +// const GPUDevice& device = ctx->eigen_gpu_device(); +// +// // Get the input images +// const Tensor& input_a_t = ctx->input(0); +// const Tensor& input_b_t = ctx->input(1); +// +// // Dimension constants +// const int batch_size = input_a_t.dim_size(0); +// const int src_height = input_a_t.dim_size(1); +// const int src_width = input_a_t.dim_size(2); +// const int channels = input_a_t.dim_size(3); +// const int src_count = batch_size * src_height * src_width * channels; +// const int out_height = crop_[0]; +// const int out_width = crop_[1]; +// const int out_count = batch_size * out_height * out_width * channels; +// +// // Allocate the memory for the output images +// Tensor *output_a_t; +// Tensor *output_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(0, TensorShape({ batch_size, +// crop_[0], crop_[1], +// 
channels }), +// &output_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(1, TensorShape({ batch_size, +// crop_[0], crop_[1], +// channels }), +// &output_b_t)); +// +// // Allocate the memory for the output spatial transforms +// Tensor *spat_transform_a_t; +// Tensor *spat_transform_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(2, TensorShape({ batch_size, 6 }), +// &spat_transform_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(3, TensorShape({ batch_size, 6 }), +// &spat_transform_b_t)); +// +// // Allocate temporary pinned memory for the spatial transforms to be +// used +// // on the GPU +// tensorflow::AllocatorAttributes pinned_allocator; +// pinned_allocator.set_on_host(true); +// pinned_allocator.set_gpu_compatible(true); +// +// Tensor spat_transform_a_pinned_t; +// Tensor spat_transform_b_pinned_t; +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_a_pinned_t, +// pinned_allocator)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_b_pinned_t, +// pinned_allocator)); +// auto spat_transform_a_pinned = spat_transform_a_pinned_t.tensor<float, +// 2>(); +// auto spat_transform_b_pinned = spat_transform_b_pinned_t.tensor<float, +// 2>(); +// +// /*** BEGIN AUGMENTATION TO IMAGE A ***/ +// auto input_a = input_a_t.tensor<float, 4>(); +// auto output_a = output_a_t->tensor<float, 4>(); +// +// // Load augmentation parameters for image A +// AugmentationParams aug_a = AugmentationParams(out_height, out_width, +// params_a_name_, +// params_a_rand_type_, +// params_a_exp_, +// params_a_mean_, +// params_a_spread_, +// params_a_prob_); +// +// std::vector<AugmentationCoeff> coeffs_a; +// +// bool gen_spatial_transform = aug_a.should_do_spatial_transform(); +// +// for (int n = 0; n < batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_a, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_a.push_back(coeff); +// } +// +// // Copy spatial coefficients A to the output Tensor on the CPU (output +// for +// // FlowAugmentation) +// auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_a); +// +// // ...as well as a Tensor going to the GPU +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_a_pinned); +// +// CudaLaunchConfig config = GetCudaLaunchConfig(out_count, device); +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_a.data(), output_a.data(), spat_transform_a_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE A ***/ +// +// /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// AugmentationParams aug_b = AugmentationParams(out_height, out_width, +// params_b_name_, +// params_b_rand_type_, +// params_b_exp_, +// params_b_mean_, +// params_b_spread_, +// params_b_prob_); +// +// std::vector<AugmentationCoeff> coeffs_b; +// +// gen_spatial_transform = aug_b.should_do_spatial_transform(); +// +// for (int n = 0; n < 
batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_b, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_b.push_back(coeff); +// } +// +// /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// +// /*** BEGIN AUGMENTATION TO IMAGE B ***/ +// auto input_b = input_b_t.tensor<float, 4>(); +// auto output_b = output_b_t->tensor<float, 4>(); +// +// // Copy spatial coefficients B to the output Tensor on the CPU +// auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_b, +// true); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_b_pinned); +// +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_b.data(), output_b.data(), spat_transform_b_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE B ***/ +// } +// +// private: +// std::vector<int32>crop_; +// +// // Params A +// std::vector<string>params_a_name_; +// std::vector<string>params_a_rand_type_; +// std::vector<bool>params_a_exp_; +// std::vector<float>params_a_mean_; +// std::vector<float>params_a_spread_; +// std::vector<float>params_a_prob_; +// +// // Params B +// std::vector<string>params_b_name_; +// std::vector<string>params_b_rand_type_; +// std::vector<bool>params_b_exp_; +// std::vector<float>params_b_mean_; +// std::vector<float>params_b_spread_; +// std::vector<float>params_b_prob_; +// }; +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h new file mode 100644 index 0000000..545b8a0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h @@ -0,0 +1,22 @@ +#ifndef FLOWNET_DATA_AUGMENTATION_H_ +#define FLOWNET_DATA_AUGMENTATION_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<class Device> +void Augment(OpKernelContext *context, + const Device & d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs); +} // namespace tensorflow +#endif // FLOWNET_DATA_AUGMENTATION_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc new file mode 100644 index 0000000..b5cc11f --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc @@ -0,0 +1,129 @@ +#define EIGEN_USE_THREADS + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline int clamp(int f, int a, int b) { + return std::max(a, std::min(f, b)); +} + +template<> +void FillFlowAugmentation(const 
CPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int src_height = flows.dimension(1); + const int src_width = flows.dimension(2); + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + float *output_ptr = output.data(); + const float *flow_ptr = flows.data(); + + for (int n = 0; n < batch_size; n++) { + const float *transMatA = transforms_from_a.data() + n * 6; + const float *transMatB = transforms_from_b.data() + n * 6; + + for (int y = 0; y < out_height; y++) { + int outputIdxOffset = (n * out_height + y) * out_width; + + for (int x = 0; x < out_width; x++) { + // Apply transformation matrix applied to first image + const float xpos1 = x * transMatA[0] + y * transMatA[1] + transMatA[2]; + const float ypos1 = x * transMatA[3] + y * transMatA[4] + transMatA[5]; + + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to second image + const float xpos3 = xpos2 * transMatB[0] + ypos2 * transMatB[1] + transMatB[2]; + const float ypos3 = xpos2 * transMatB[3] + ypos2 * transMatB[4] + transMatB[5]; + + output_ptr[(outputIdxOffset + x) * 2 + 0] = xpos3 - (float)x; + output_ptr[(outputIdxOffset + x) * 2 + 1] = ypos3 - (float)y; + } + } + } +} + +template<typename Device> +class FlowAugmentation : public OpKernel { + public: + explicit FlowAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& flows_t = ctx->input(0); + const Tensor& transforms_from_a_t = ctx->input(1); + const Tensor& transforms_from_b_t = ctx->input(2); + + OP_REQUIRES(ctx, flows_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_a_t.shape()) && + transforms_from_a_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_a_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_a should be num_images x 6")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_b_t.shape()) && + transforms_from_b_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_b_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_b should be num_images x 6")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ flows_t.dim_size(0), crop_[0], crop_[1], + flows_t.dim_size(3) }), &output_t)); + + // Perform flow augmentation + auto flows = flows_t.tensor<float, 4>(); + auto transforms_from_a = transforms_from_a_t.tensor<float, 2>(); + auto transforms_from_b = transforms_from_b_t.tensor<float, 2>(); + auto 
output = output_t->tensor<float, 4>(); + + FillFlowAugmentation(ctx->eigen_device<Device>(), + output, + flows, + transforms_from_a, + transforms_from_b); + } + + private: + std::vector<int32>crop_; +}; + +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_CPU), + FlowAugmentation<CPUDevice>) + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_GPU), + FlowAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h new file mode 100644 index 0000000..7795991 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h @@ -0,0 +1,19 @@ +#ifndef FLOWNET_FLOW_AUG_H_ +#define FLOWNET_FLOW_AUG_H_ + +// See docs in ../ops/image_ops.cc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template<class Device> +void FillFlowAugmentation(const Device& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b); +} // end namespace tensorflow + +#endif // FLOWNET_FLOW_AUG_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc new file mode 100644 index 0000000..7e10864 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc @@ -0,0 +1,95 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +inline __device__ __host__ int clamp(int f, int a, int b) { + return max(a, min(f, b)); +} + +__global__ void FillFlowAugmentationKernel( + const int32 nthreads, + const float *flow_ptr, + const float *transforms_from_a, + const float *inv_transforms_from_b, + const int src_total_count, const int src_height, const int src_width, + const int batch_size, const int out_height, + const int out_width, float *output_ptr) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const float x = (float)(index % out_width); + const float y = (float)((index / out_width) % out_height); + const int n = (index / out_width / out_height); + + const int transformIdx = n * 6; + + // Apply transformation matrix applied to second image + const float xpos1 = x * transforms_from_a[transformIdx + 0] + + y * transforms_from_a[transformIdx + 1] + + transforms_from_a[transformIdx + 2]; + const float ypos1 = x * transforms_from_a[transformIdx + 3] + + y * transforms_from_a[transformIdx + 4] + + transforms_from_a[transformIdx + 5]; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * + 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float 
ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to first image + const float xpos3 = xpos2 * inv_transforms_from_b[transformIdx + 0] + + ypos2 * inv_transforms_from_b[transformIdx + 1] + + inv_transforms_from_b[transformIdx + 2]; + const float ypos3 = xpos2 * inv_transforms_from_b[transformIdx + 3] + + ypos2 * inv_transforms_from_b[transformIdx + 4] + + inv_transforms_from_b[transformIdx + 5]; + + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 0] = xpos3 - + x; + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 1] = ypos3 - + y; + } +} + +template<> +void FillFlowAugmentation(const GPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<const float, 2>::ConstTensor transforms_from_a, + typename TTypes<const float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int depth = 2; + const int total_count = batch_size * out_height * out_width * depth; + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count / 2, device); + + FillFlowAugmentationKernel << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + total_count / 2, flows.data(), transforms_from_a.data(), + transforms_from_b.data(), + src_total_count, flows.dimension(1), flows.dimension(2), batch_size, + out_height, out_width, output.data()); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/preprocessing.cc b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc new file mode 100644 index 0000000..086a0d0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext *c) { + ShapeHandle input; + + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> crop_; + c->GetAttr("crop", &crop_); + DimensionHandle height = c->MakeDim(crop_[0]); + DimensionHandle width = c->MakeDim(crop_[1]); + c->set_output(0, c->MakeShape({ batch, height, width, depth })); + return Status::OK(); +} + +REGISTER_OP("DataAugmentation") +.Input("image_a: float32") +.Input("image_b: float32") +.Input("global_step: int64") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float)") +.Attr("params_a_spread: list(float)") +.Attr("params_a_prob: list(float)") +.Attr("params_a_coeff_schedule: list(float)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float)") +.Attr("params_b_spread: list(float)") +.Attr("params_b_prob: list(float)") +.Attr("params_b_coeff_schedule: list(float)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") 
+.Output("transforms_from_a: float32") +.Output("transforms_from_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Verify input A and input B both have 4 dimensions + ShapeHandle input_shape_a, input_shape_b; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_shape_b)); + + // TODO: Verify params vectors all have the same length + + // TODO: Move this out of here and into Compute + // Verify input A and input B are the same shape + DimensionHandle batch_size, unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 0), + c->Value(c->Dim(input_shape_b, 0)), + &batch_size)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 1), + c->Value(c->Dim(input_shape_b, 1)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 2), + c->Value(c->Dim(input_shape_b, 2)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 3), + c->Value(c->Dim(input_shape_b, 3)), &unused)); + + // Get cropping dimensions + std::vector<int32>crop_; + TF_RETURN_IF_ERROR(c->GetAttr("crop", &crop_)); + + // Reshape input shape to cropped shape + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 1, c->MakeDim(crop_[0]), + &input_shape_a)); + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 2, c->MakeDim(crop_[1]), + &input_shape_a)); + + // Set output images shapes + c->set_output(0, input_shape_a); + c->set_output(1, input_shape_a); + + // Set output spatial transforms shapes + c->set_output(2, c->MakeShape({ batch_size, 6 })); + c->set_output(3, c->MakeShape({ batch_size, 6 })); + + return Status::OK(); + }); + +REGISTER_OP("FlowAugmentation") +.Input("flows: float32") +.Input("transforms_from_a: float32") +.Input("transforms_from_b: float32") +.Attr("crop: list(int) >= 2") +.Output("transformed_flows: float32") +.SetShapeFn(SetOutputToSizedImage); +} // namespace tensorflow diff --git a/Codes/flownet2/src/training_schedules.py b/Codes/flownet2/src/training_schedules.py new file mode 100644 index 0000000..4db5aab --- /dev/null +++ b/Codes/flownet2/src/training_schedules.py @@ -0,0 +1,12 @@ +LONG_SCHEDULE = { + 'step_values': [400000, 600000, 800000, 1000000], + 'learning_rates': [0.0001, 0.00005, 0.000025, 0.0000125, 0.00000625], + 'momentum': 0.9, + 'momentum2': 0.999, + 'weight_decay': 0.0004, + 'max_iter': 1200000, +} + +FINETUNE_SCHEDULE = { + # TODO: Finetune schedule +} diff --git a/Codes/flownet2/src/utils.py b/Codes/flownet2/src/utils.py new file mode 100644 index 0000000..f6abe18 --- /dev/null +++ b/Codes/flownet2/src/utils.py @@ -0,0 +1,46 @@ +import tensorflow as tf + + +# Thanks, https://github.com/tensorflow/tensorflow/issues/4079 +def LeakyReLU(x, leak=0.1, name="lrelu"): + with tf.variable_scope(name): + f1 = 0.5 * (1.0 + leak) + f2 = 0.5 * (1.0 - leak) + return f1 * x + f2 * abs(x) + + +def average_endpoint_error(labels, predictions): + """ + Given labels and predictions of size (N, H, W, 2), calculates average endpoint error: + sqrt[sum_across_channels{(X - Y)^2}] + """ + num_samples = predictions.shape.as_list()[0] + with tf.name_scope(None, "average_endpoint_error", (predictions, labels)) as scope: + predictions = tf.to_float(predictions) + labels = tf.to_float(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + squared_difference = tf.square(tf.subtract(predictions, labels)) + # sum across channels: sum[(X - Y)^2] -> N, H, W, 1 + loss = tf.reduce_sum(squared_difference, 3, keep_dims=True) + loss = tf.sqrt(loss) + return tf.reduce_sum(loss) / num_samples + + 
+def pad(tensor, num=1): + """ + Pads the given tensor along the height and width dimensions with `num` zeros on each side. + """ + return tf.pad(tensor, [[0, 0], [num, num], [num, num], [0, 0]], "CONSTANT") + + +def antipad(tensor, num=1): + """ + Performs a crop: the "padding" of a deconvolutional layer (conv2d transpose) is removed + from the output rather than added to the input. + """ + batch, h, w, c = tensor.get_shape().as_list() + # Equivalent to tf.slice(tensor, begin=[0, num, num, 0], size=[batch, h - 2 * num, w - 2 * num, c]) + return tensor[:, num: h - num, num: w - num, :] |
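LONG_SCHEDULE in training_schedules.py only declares hyper-parameters; the trainer that consumes it is not part of this diff. A minimal sketch of one plausible wiring, assuming the schedule feeds a piecewise-constant learning rate into an Adam-style optimizer (beta1/beta2 taken from 'momentum'/'momentum2'; weight_decay is left unused here, and build_optimizer is an illustrative helper, not a function from this repository):

import tensorflow as tf


def build_optimizer(schedule):
    # The global step drives the piecewise-constant learning-rate decay.
    global_step = tf.train.get_or_create_global_step()

    # 4 boundaries + 5 learning rates: the rate is halved at each boundary.
    learning_rate = tf.train.piecewise_constant(
        global_step,
        boundaries=[tf.cast(v, tf.int64) for v in schedule['step_values']],
        values=schedule['learning_rates'])

    # 'momentum' / 'momentum2' are mapped onto Adam's beta1 / beta2 in this sketch.
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=schedule['momentum'],
        beta2=schedule['momentum2'])
    return optimizer, learning_rate, global_step

# Usage (assuming LONG_SCHEDULE is imported from training_schedules):
# optimizer, lr, step = build_optimizer(LONG_SCHEDULE)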

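For reference, the coordinate chain that both FillFlowAugmentation kernels (CPU and GPU) implement, written out in NumPy for a single image. This is a sketch only: coordinates are clamped per axis here, whereas the kernels clamp the flat index into the flow buffer, so edge behaviour can differ slightly.

import numpy as np


def flow_augmentation_reference(flows, trans_a, inv_trans_b, crop):
    """NumPy sketch of the FlowAugmentation op for one image (illustrative only).

    flows:       (src_h, src_w, 2) flow from image A to image B at source resolution
    trans_a:     (6,) affine mapping output coords -> source coords of image A
    inv_trans_b: (6,) inverse affine mapping source coords of image B -> output coords
    crop:        (out_h, out_w) cropped output size
    """
    out_h, out_w = crop
    src_h, src_w, _ = flows.shape
    out = np.zeros((out_h, out_w, 2), dtype=np.float32)

    for y in range(out_h):
        for x in range(out_w):
            # Position in image A's source frame.
            x1 = x * trans_a[0] + y * trans_a[1] + trans_a[2]
            y1 = x * trans_a[3] + y * trans_a[4] + trans_a[5]

            # Nearest-neighbour lookup of the original flow, clamped to the image.
            sx = min(max(int(x1 + 0.5), 0), src_w - 1)
            sy = min(max(int(y1 + 0.5), 0), src_h - 1)

            # Corresponding position in image B's source frame.
            x2 = x1 + flows[sy, sx, 0]
            y2 = y1 + flows[sy, sx, 1]

            # Map into image B's augmented (output) frame.
            x3 = x2 * inv_trans_b[0] + y2 * inv_trans_b[1] + inv_trans_b[2]
            y3 = x2 * inv_trans_b[3] + y2 * inv_trans_b[4] + inv_trans_b[5]

            # The augmented flow is the displacement from the output pixel.
            out[y, x, 0] = x3 - x
            out[y, x, 1] = y3 - y
    return out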