Diffstat (limited to 'Codes')
82 files changed, 8146 insertions, 0 deletions
diff --git a/Codes/constant.py b/Codes/constant.py new file mode 100644 index 0000000..eafeab9 --- /dev/null +++ b/Codes/constant.py @@ -0,0 +1,153 @@ +import os +import argparse +import configparser + + +def get_dir(directory): + """ + get the directory, if no such directory, then make it. + + @param directory: The new directory. + """ + + if not os.path.exists(directory): + os.makedirs(directory) + + return directory + + +def parser_args(): + parser = argparse.ArgumentParser(description='Options to run the network.') + parser.add_argument('-g', '--gpu', type=str, default='0', + help='the device id of gpu.') + parser.add_argument('-i', '--iters', type=int, default=1, + help='set the number of iterations, default is 1') + parser.add_argument('-b', '--batch', type=int, default=4, + help='set the batch size, default is 4.') + parser.add_argument('--num_his', type=int, default=4, + help='set the time steps, default is 4.') + + parser.add_argument('-d', '--dataset', type=str, + help='the name of dataset.') + parser.add_argument('--train_folder', type=str, default='', + help='set the training folder path.') + parser.add_argument('--test_folder', type=str, default='', + help='set the testing folder path.') + + parser.add_argument('--config', type=str, default='training_hyper_params/hyper_params.ini', + help='the path of training_hyper_params, default is training_hyper_params/hyper_params.ini') + + parser.add_argument('--snapshot_dir', type=str, default='', + help='if it is folder, then it is the directory to save models, ' + 'if it is a specific model.ckpt-xxx, then the system will load it for testing.') + parser.add_argument('--summary_dir', type=str, default='', help='the directory to save summaries.') + parser.add_argument('--psnr_dir', type=str, default='', help='the directory to save psnrs results in testing.') + + parser.add_argument('--evaluate', type=str, default='compute_auc', + help='the evaluation metric, default is compute_auc') + + return parser.parse_args() + + +class Const(object): + class ConstError(TypeError): + pass + + class ConstCaseError(ConstError): + pass + + def __setattr__(self, name, value): + if name in self.__dict__: + raise self.ConstError("Can't change const.{}".format(name)) + if not name.isupper(): + raise self.ConstCaseError('const name {} is not all uppercase'.format(name)) + + self.__dict__[name] = value + + def __str__(self): + _str = '<================ Constants information ================>\n' + for name, value in self.__dict__.items(): + print(name, value) + _str += '\t{}\t{}\n'.format(name, value) + + return _str + + +args = parser_args() +const = Const() + +# inputs constants +const.DATASET = args.dataset +const.TRAIN_FOLDER = args.train_folder +const.TEST_FOLDER = args.test_folder + +const.GPU = args.gpu + +const.BATCH_SIZE = args.batch +const.NUM_HIS = args.num_his +const.ITERATIONS = args.iters + +const.EVALUATE = args.evaluate + +# network constants +const.HEIGHT = 256 +const.WIDTH = 256 +const.FLOWNET_CHECKPOINT = 'flownet2/checkpoints/FlowNetSD/flownet-SD.ckpt-0' +const.FLOW_HEIGHT = 384 +const.FLOW_WIDTH = 512 + +# set training hyper-parameters of different datasets +config = configparser.ConfigParser() +assert config.read(args.config) + +# for lp loss. 
e.g, 1 or 2 for l1 and l2 loss, respectively) +const.L_NUM = config.getint(const.DATASET, 'L_NUM') +# the power to which each gradient term is raised in GDL loss +const.ALPHA_NUM = config.getint(const.DATASET, 'ALPHA_NUM') +# the percentage of the adversarial loss to use in the combined loss +const.LAM_ADV = config.getfloat(const.DATASET, 'LAM_ADV') +# the percentage of the lp loss to use in the combined loss +const.LAM_LP = config.getfloat(const.DATASET, 'LAM_LP') +# the percentage of the GDL loss to use in the combined loss +const.LAM_GDL = config.getfloat(const.DATASET, 'LAM_GDL') +# the percentage of the different frame loss +const.LAM_FLOW = config.getfloat(const.DATASET, 'LAM_FLOW') + +# Learning rate of generator +const.LRATE_G = eval(config.get(const.DATASET, 'LRATE_G')) +const.LRATE_G_BOUNDARIES = eval(config.get(const.DATASET, 'LRATE_G_BOUNDARIES')) + +# Learning rate of discriminator +const.LRATE_D = eval(config.get(const.DATASET, 'LRATE_D')) +const.LRATE_D_BOUNDARIES = eval(config.get(const.DATASET, 'LRATE_D_BOUNDARIES')) + + +const.SAVE_DIR = '{dataset}_l_{L_NUM}_alpha_{ALPHA_NUM}_lp_{LAM_LP}_' \ + 'adv_{LAM_ADV}_gdl_{LAM_GDL}_flow_{LAM_FLOW}'.format(dataset=const.DATASET, + L_NUM=const.L_NUM, + ALPHA_NUM=const.ALPHA_NUM, + LAM_LP=const.LAM_LP, LAM_ADV=const.LAM_ADV, + LAM_GDL=const.LAM_GDL, LAM_FLOW=const.LAM_FLOW) + +if args.snapshot_dir: + # if the snapshot_dir is model.ckpt-xxx, which means it is the single model for testing. + if os.path.exists(args.snapshot_dir + '.meta') or os.path.exists(args.snapshot_dir + '.data-00000-of-00001') or \ + os.path.exists(args.snapshot_dir + '.index'): + const.SNAPSHOT_DIR = args.snapshot_dir + print(const.SNAPSHOT_DIR) + else: + const.SNAPSHOT_DIR = get_dir(os.path.join('models', const.SAVE_DIR + '_' + args.snapshot_dir)) +else: + const.SNAPSHOT_DIR = get_dir(os.path.join('models', const.SAVE_DIR)) + +if args.summary_dir: + const.SUMMARY_DIR = get_dir(os.path.join('summary', const.SAVE_DIR + '_' + args.summary_dir)) +else: + const.SUMMARY_DIR = get_dir(os.path.join('summary', const.SAVE_DIR)) + +if args.psnr_dir: + const.PSNR_DIR = get_dir(os.path.join('psnrs', const.SAVE_DIR + '_' + args.psnr_dir)) +else: + const.PSNR_DIR = get_dir(os.path.join('psnrs', const.SAVE_DIR)) + + diff --git a/Codes/evaluate.py b/Codes/evaluate.py new file mode 100644 index 0000000..2bce871 --- /dev/null +++ b/Codes/evaluate.py @@ -0,0 +1,576 @@ +import numpy as np +import scipy.io as scio +import os +import argparse +import pickle +from sklearn import metrics +import json +import socket + + +# data folder contain all datasets, such as ped1, ped2, avenue, shanghaitech, etc +# DATA_DIR = '../Data' +hostname = socket.gethostname() +if hostname == 'dl-T8520-G10': # 119 + DATA_DIR = '/home/liuwen/ssd/datasets' +elif hostname == 'admin' or hostname == 'compute101' or hostname == 'compute113' or hostname == 'compute106' \ + or hostname == 'compute107' or hostname == 'compute114': # node02 + DATA_DIR = '/home/luowx/liuwen/datasets' +elif hostname == 'gpu13' or 'gpu14': + DATA_DIR = '/public/home/gaoshenghua/liuwen/datasets' +else: + # raise NotImplementedError('Not found this machine {}!'.format(hostname)) + DATA_DIR = '../Data' + + +# normalize scores in each sub video +NORMALIZE = True + +# number of history frames, since in prediction based method, the first 4 frames can not be predicted, so that +# the first 4frames are undecidable, we just ignore the first 4 frames +DECIDABLE_IDX = 4 + + +def parser_args(): + parser = 
argparse.ArgumentParser(description='evaluating the model, computing the roc/auc.') + + parser.add_argument('-f', '--file', type=str, help='the path of loss file.') + parser.add_argument('-t', '--type', type=str, default='compute_auc', + help='the type of evaluation, choosing type is: plot_roc, compute_auc, ' + 'test_func\n, the default type is compute_auc') + return parser.parse_args() + + +class RecordResult(object): + def __init__(self, fpr=None, tpr=None, auc=-np.inf, dataset=None, loss_file=None): + self.fpr = fpr + self.tpr = tpr + self.auc = auc + self.dataset = dataset + self.loss_file = loss_file + + def __lt__(self, other): + return self.auc < other.auc + + def __gt__(self, other): + return self.auc > other.auc + + def __str__(self): + return 'dataset = {}, loss file = {}, auc = {}'.format(self.dataset, self.loss_file, self.auc) + + +class GroundTruthLoader(object): + AVENUE = 'avenue' + PED1 = 'ped1' + PED1_PIXEL_SUBSET = 'ped1_pixel_subset' + PED2 = 'ped2' + ENTRANCE = 'enter' + EXIT = 'exit' + SHANGHAITECH = 'shanghaitech' + SHANGHAITECH_LABEL_PATH = os.path.join(DATA_DIR, 'shanghaitech/testing/test_frame_mask') + TOY_DATA = 'toydata' + TOY_DATA_LABEL_PATH = os.path.join(DATA_DIR, TOY_DATA, 'toydata.json') + + NAME_MAT_MAPPING = { + AVENUE: os.path.join(DATA_DIR, 'avenue/avenue.mat'), + PED1: os.path.join(DATA_DIR, 'ped1/ped1.mat'), + PED2: os.path.join(DATA_DIR, 'ped2/ped2.mat'), + ENTRANCE: os.path.join(DATA_DIR, 'enter/enter.mat'), + EXIT: os.path.join(DATA_DIR, 'exit/exit.mat') + } + + NAME_FRAMES_MAPPING = { + AVENUE: os.path.join(DATA_DIR, 'avenue/testing/frames'), + PED1: os.path.join(DATA_DIR, 'ped1/testing/frames'), + PED2: os.path.join(DATA_DIR, 'ped2/testing/frames'), + ENTRANCE: os.path.join(DATA_DIR, 'enter/testing/frames'), + EXIT: os.path.join(DATA_DIR, 'exit/testing/frames') + } + + def __init__(self, mapping_json=None): + """ + Initial a ground truth loader, which loads the ground truth with given dataset name. + + :param mapping_json: the mapping from dataset name to the path of ground truth. + """ + + if mapping_json is not None: + with open(mapping_json, 'rb') as json_file: + self.mapping = json.load(json_file) + else: + self.mapping = GroundTruthLoader.NAME_MAT_MAPPING + + def __call__(self, dataset): + """ get the ground truth by provided the name of dataset. + + :type dataset: str + :param dataset: the name of dataset. + :return: np.ndarray, shape(#video) + np.array[0] contains all the start frame and end frame of abnormal events of video 0, + and its shape is (#frapsnr, ) + """ + + if dataset == GroundTruthLoader.SHANGHAITECH: + gt = self.__load_shanghaitech_gt() + elif dataset == GroundTruthLoader.TOY_DATA: + gt = self.__load_toydata_gt() + else: + gt = self.__load_ucsd_avenue_subway_gt(dataset) + return gt + + def __load_ucsd_avenue_subway_gt(self, dataset): + assert dataset in self.mapping, 'there is no dataset named {} \n Please check {}' \ + .format(dataset, GroundTruthLoader.NAME_MAT_MAPPING.keys()) + + mat_file = self.mapping[dataset] + abnormal_events = scio.loadmat(mat_file, squeeze_me=True)['gt'] + + if abnormal_events.ndim == 2: + abnormal_events = abnormal_events.reshape(-1, abnormal_events.shape[0], abnormal_events.shape[1]) + + num_video = abnormal_events.shape[0] + dataset_video_folder = GroundTruthLoader.NAME_FRAMES_MAPPING[dataset] + video_list = os.listdir(dataset_video_folder) + video_list.sort() + + assert num_video == len(video_list), 'ground true does not match the number of testing videos. 
{} != {}' \ + .format(num_video, len(video_list)) + + # get the total frames of sub video + def get_video_length(sub_video_number): + # video_name = video_name_template.format(sub_video_number) + video_name = os.path.join(dataset_video_folder, video_list[sub_video_number]) + assert os.path.isdir(video_name), '{} is not directory!'.format(video_name) + + length = len(os.listdir(video_name)) + + return length + + # need to test [].append, or np.array().append(), which one is faster + gt = [] + for i in range(num_video): + length = get_video_length(i) + + sub_video_gt = np.zeros((length,), dtype=np.int8) + sub_abnormal_events = abnormal_events[i] + if sub_abnormal_events.ndim == 1: + sub_abnormal_events = sub_abnormal_events.reshape((sub_abnormal_events.shape[0], -1)) + + _, num_abnormal = sub_abnormal_events.shape + + for j in range(num_abnormal): + # (start - 1, end - 1) + start = sub_abnormal_events[0, j] - 1 + end = sub_abnormal_events[1, j] + + sub_video_gt[start: end] = 1 + + gt.append(sub_video_gt) + + return gt + + @staticmethod + def __load_shanghaitech_gt(): + video_path_list = os.listdir(GroundTruthLoader.SHANGHAITECH_LABEL_PATH) + video_path_list.sort() + + gt = [] + for video in video_path_list: + # print(os.path.join(GroundTruthLoader.SHANGHAITECH_LABEL_PATH, video)) + gt.append(np.load(os.path.join(GroundTruthLoader.SHANGHAITECH_LABEL_PATH, video))) + + return gt + + @staticmethod + def __load_toydata_gt(): + with open(GroundTruthLoader.TOY_DATA_LABEL_PATH, 'r') as gt_file: + gt_dict = json.load(gt_file) + + gt = [] + for video, video_info in gt_dict.items(): + length = video_info['length'] + video_gt = np.zeros((length,), dtype=np.uint8) + sub_gt = np.array(np.matrix(video_info['gt'])) + + for anomaly in sub_gt: + start = anomaly[0] + end = anomaly[1] + 1 + video_gt[start: end] = 1 + gt.append(video_gt) + return gt + + @staticmethod + def get_pixel_masks_file_list(dataset): + # pixel mask folder + pixel_mask_folder = os.path.join(DATA_DIR, dataset, 'pixel_masks') + pixel_mask_file_list = os.listdir(pixel_mask_folder) + pixel_mask_file_list.sort() + + # get all testing videos + dataset_video_folder = GroundTruthLoader.NAME_FRAMES_MAPPING[dataset] + video_list = os.listdir(dataset_video_folder) + video_list.sort() + + # get all testing video names with pixel masks + pixel_video_ids = [] + ids = 0 + for pixel_mask_name in pixel_mask_file_list: + while ids < len(video_list): + if video_list[ids] + '.npy' == pixel_mask_name: + pixel_video_ids.append(ids) + ids += 1 + break + else: + ids += 1 + + assert len(pixel_video_ids) == len(pixel_mask_file_list) + + for i in range(len(pixel_mask_file_list)): + pixel_mask_file_list[i] = os.path.join(pixel_mask_folder, pixel_mask_file_list[i]) + + return pixel_mask_file_list, pixel_video_ids + + +def load_psnr_gt(loss_file): + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... 
+ # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + + dataset = results['dataset'] + psnr_records = results['psnr'] + + num_videos = len(psnr_records) + + # load ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + assert num_videos == len(gt), 'the number of saved videos does not match the ground truth, {} != {}' \ + .format(num_videos, len(gt)) + + return dataset, psnr_records, gt + + +def load_psnr_gt_flow(loss_file): + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... + # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + + dataset = results['dataset'] + psnrs = results['psnr'] + flows = results['flow'] + + num_videos = len(psnrs) + + # load ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + assert num_videos == len(gt), 'the number of saved videos does not match the ground truth, {} != {}' \ + .format(num_videos, len(gt)) + + return dataset, psnrs, flows, gt + + +def load_psnr(loss_file): + """ + load image psnr or optical flow psnr. + :param loss_file: loss file path + :return: + """ + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... + # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + psnrs = results['psnr'] + return psnrs + + +def get_scores_labels(loss_file): + # the name of dataset, loss, and ground truth + dataset, psnr_records, gt = load_psnr_gt(loss_file=loss_file) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + if NORMALIZE: + distance -= distance.min() # distances = (distance - min) / (max - min) + distance /= distance.max() + # distance = 1 - distance + + scores = np.concatenate((scores[:], distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels[:], gt[i][DECIDABLE_IDX:]), axis=0) + return dataset, scores, labels + + +def precision_recall_auc(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult() + for sub_loss_file in loss_file_list: + dataset, scores, labels = get_scores_labels(sub_loss_file) + precision, recall, thresholds = metrics.precision_recall_curve(labels, scores, pos_label=0) + auc = metrics.auc(recall, precision) + + results = RecordResult(recall, precision, auc, dataset, sub_loss_file) + + if optimal_results < results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def cal_eer(fpr, tpr): + # makes fpr + tpr = 1 + eer = fpr[np.nanargmin(np.absolute((fpr + tpr - 1)))] + return eer + + +def compute_eer(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list 
= os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult(auc=np.inf) + for sub_loss_file in loss_file_list: + dataset, scores, labels = get_scores_labels(sub_loss_file) + fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=0) + eer = cal_eer(fpr, tpr) + + results = RecordResult(fpr, tpr, eer, dataset, sub_loss_file) + + if optimal_results > results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def compute_auc(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult() + for sub_loss_file in loss_file_list: + # the name of dataset, loss, and ground truth + dataset, psnr_records, gt = load_psnr_gt(loss_file=sub_loss_file) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + if NORMALIZE: + distance -= distance.min() # distances = (distance - min) / (max - min) + distance /= distance.max() + # distance = 1 - distance + + scores = np.concatenate((scores, distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels, gt[i][DECIDABLE_IDX:]), axis=0) + + fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=0) + auc = metrics.auc(fpr, tpr) + + results = RecordResult(fpr, tpr, auc, dataset, sub_loss_file) + + if optimal_results < results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def average_psnr(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + max_avg_psnr = -np.inf + max_file = '' + for file in loss_file_list: + psnr_records = load_psnr(file) + + psnr_records = np.concatenate(psnr_records, axis=0) + avg_psnr = np.mean(psnr_records) + if max_avg_psnr < avg_psnr: + max_avg_psnr = avg_psnr + max_file = file + print('{}, average psnr = {}'.format(file, avg_psnr)) + + print('max average psnr file = {}, psnr = {}'.format(max_file, max_avg_psnr)) + + +def calculate_psnr(loss_file): + optical_result = compute_auc(loss_file) + print('##### optimal result and model = {}'.format(optical_result)) + + mean_psnr = [] + for file in os.listdir(loss_file): + file = os.path.join(loss_file, file) + dataset, psnr_records, gt = load_psnr_gt(file) + + psnr_records = np.concatenate(psnr_records, axis=0) + gt = np.concatenate(gt, axis=0) + + mean_normal_psnr = np.mean(psnr_records[gt == 0]) + mean_abnormal_psnr = np.mean(psnr_records[gt == 1]) + mean = np.mean(psnr_records) + print('mean normal psrn = {}, mean abnormal psrn = {}, mean = {}'.format( + mean_normal_psnr, + mean_abnormal_psnr, + mean) + ) + mean_psnr.append(mean) + print('max mean psnr = {}'.format(np.max(mean_psnr))) + + +def calculate_score(loss_file): + if not os.path.isdir(loss_file): + loss_file_path = loss_file + else: + optical_result = compute_auc(loss_file) + loss_file_path = optical_result.loss_file + 
print('##### optimal result and model = {}'.format(optical_result)) + dataset, psnr_records, gt = load_psnr_gt(loss_file=loss_file_path) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + distance = (distance - distance.min()) / (distance.max() - distance.min()) + + scores = np.concatenate((scores, distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels, gt[i][DECIDABLE_IDX:]), axis=0) + + mean_normal_scores = np.mean(scores[labels == 0]) + mean_abnormal_scores = np.mean(scores[labels == 1]) + print('mean normal scores = {}, mean abnormal scores = {}, ' + 'delta = {}'.format(mean_normal_scores, mean_abnormal_scores, mean_normal_scores - mean_abnormal_scores)) + + +def test_func(*args): + # simulate testing on CUHK AVENUE dataset + dataset = GroundTruthLoader.AVENUE + + # load the ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + num_videos = len(gt) + + simulated_results = { + 'dataset': dataset, + 'psnr': [] + } + + simulated_psnr = [] + for i in range(num_videos): + sub_video_length = gt[i].shape[0] + simulated_psnr.append(np.random.random(size=sub_video_length)) + + simulated_results['psnr'] = simulated_psnr + + # writing to file, 'generated_loss.bin' + with open('generated_loss.bin', 'wb') as writer: + pickle.dump(simulated_results, writer, pickle.HIGHEST_PROTOCOL) + + print(file_path.name) + result = compute_auc(file_path.name) + + print('optimal = {}'.format(result)) + + +eval_type_function = { + 'compute_auc': compute_auc, + 'compute_eer': compute_eer, + 'precision_recall_auc': precision_recall_auc, + 'calculate_psnr': calculate_psnr, + 'calculate_score': calculate_score, + 'average_psnr': average_psnr, + 'average_psnr_sample': average_psnr +} + + +def evaluate(eval_type, save_file): + assert eval_type in eval_type_function, 'there is no type of evaluation {}, please check {}' \ + .format(eval_type, eval_type_function.keys()) + eval_func = eval_type_function[eval_type] + optimal_results = eval_func(save_file) + return optimal_results + + +if __name__ == '__main__': + args = parser_args() + + eval_type = args.type + file_path = args.file + + print('Evaluate type = {}'.format(eval_type)) + print('File path = {}'.format(file_path)) + + if eval_type == 'test_func': + test_func() + else: + evaluate(eval_type, file_path)
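The scoring logic in `compute_auc()` above can be sanity-checked without any dataset on disk. Below is a minimal, self-contained sketch (the PSNR values and frame labels are fabricated for illustration, not taken from a real loss file or ground truth) of the per-video min-max normalization, the dropping of the first `DECIDABLE_IDX` frames, and the frame-level ROC/AUC computation with normal frames as `pos_label=0`:

```
# Self-contained sketch of the compute_auc() scoring steps; all data here is fake.
import numpy as np
from sklearn import metrics

DECIDABLE_IDX = 4  # the first 4 frames of each video are ignored, as above

# three fake test videos: 200 frames each, last 50 frames marked abnormal (label 1)
psnr_records = [np.random.uniform(20.0, 40.0, size=200) for _ in range(3)]
gt = [np.concatenate([np.zeros(150, dtype=np.int8), np.ones(50, dtype=np.int8)])
      for _ in range(3)]

scores = np.array([], dtype=np.float32)
labels = np.array([], dtype=np.int8)
for distance, video_gt in zip(psnr_records, gt):
    # per-video min-max normalization of the PSNR curve
    distance = (distance - distance.min()) / (distance.max() - distance.min())
    scores = np.concatenate((scores, distance[DECIDABLE_IDX:]))
    labels = np.concatenate((labels, video_gt[DECIDABLE_IDX:]))

# high PSNR means normal, so normal frames (label 0) are the positive class here
fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=0)
print('frame-level AUC =', metrics.auc(fpr, tpr))
```

With a real loss file produced by testing, the same metric is obtained with `python evaluate.py --file <loss_file> --type compute_auc` (or any of the other `--type` choices listed in `eval_type_function`).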
\ No newline at end of file diff --git a/Codes/flownet2/.gitignore b/Codes/flownet2/.gitignore new file mode 100644 index 0000000..31abf4e --- /dev/null +++ b/Codes/flownet2/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +*$py.class +*.o +*.so +*.so.dSYM +checkpoints/ +!checkpoints/download.sh +!checkpoints/README.md diff --git a/Codes/flownet2/LICENSE b/Codes/flownet2/LICENSE new file mode 100644 index 0000000..d2cc224 --- /dev/null +++ b/Codes/flownet2/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Sam Pepose + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Codes/flownet2/Makefile b/Codes/flownet2/Makefile new file mode 100644 index 0000000..073c011 --- /dev/null +++ b/Codes/flownet2/Makefile @@ -0,0 +1,82 @@ +# Makefile + +TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` + +ifndef CUDA_HOME + CUDA_HOME := /usr/local/cuda +endif + +CC = gcc -O2 -pthread +CXX = g++ +GPUCC = nvcc +CFLAGS = -std=c++11 -I$(TF_INC) -I"$(CUDA_HOME)/include" -DGOOGLE_CUDA=1 +GPUCFLAGS = -c +LFLAGS = -pthread -shared -fPIC +GPULFLAGS = -x cu -Xcompiler -fPIC +CGPUFLAGS = -L$(CUDA_HOME)/lib -L$(CUDA_HOME)/lib64 -lcudart + +OUT_DIR = src/ops/build +PREPROCESSING_SRC = "src/ops/preprocessing/preprocessing.cc" "src/ops/preprocessing/kernels/flow_augmentation.cc" "src/ops/preprocessing/kernels/augmentation_base.cc" "src/ops/preprocessing/kernels/data_augmentation.cc" +GPU_SRC_DATA_AUG = src/ops/preprocessing/kernels/data_augmentation.cu.cc +GPU_SRC_FLOW = src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc +GPU_PROD_DATA_AUG = $(OUT_DIR)/data_augmentation.o +GPU_PROD_FLOW = $(OUT_DIR)/flow_augmentation_gpu.o +PREPROCESSING_PROD = $(OUT_DIR)/preprocessing.so + +DOWNSAMPLE_SRC = "src/ops/downsample/downsample_kernel.cc" "src/ops/downsample/downsample_op.cc" +GPU_SRC_DOWNSAMPLE = src/ops/downsample/downsample_kernel_gpu.cu.cc +GPU_PROD_DOWNSAMPLE = $(OUT_DIR)/downsample_kernel_gpu.o +DOWNSAMPLE_PROD = $(OUT_DIR)/downsample.so + +CORRELATION_SRC = "src/ops/correlation/correlation_kernel.cc" "src/ops/correlation/correlation_grad_kernel.cc" "src/ops/correlation/correlation_op.cc" +GPU_SRC_CORRELATION = src/ops/correlation/correlation_kernel.cu.cc +GPU_SRC_CORRELATION_GRAD = src/ops/correlation/correlation_grad_kernel.cu.cc +GPU_SRC_PAD = src/ops/correlation/pad.cu.cc +GPU_PROD_CORRELATION = $(OUT_DIR)/correlation_kernel_gpu.o +GPU_PROD_CORRELATION_GRAD = $(OUT_DIR)/correlation_grad_kernel_gpu.o +GPU_PROD_PAD = $(OUT_DIR)/correlation_pad_gpu.o +CORRELATION_PROD = 
$(OUT_DIR)/correlation.so + +FLOWWARP_SRC = "src/ops/flow_warp/flow_warp_op.cc" "src/ops/flow_warp/flow_warp.cc" "src/ops/flow_warp/flow_warp_grad.cc" +GPU_SRC_FLOWWARP = "src/ops/flow_warp/flow_warp.cu.cc" +GPU_SRC_FLOWWARP_GRAD = "src/ops/flow_warp/flow_warp_grad.cu.cc" +GPU_PROD_FLOWWARP = "$(OUT_DIR)/flow_warp_gpu.o" +GPU_PROD_FLOWWARP_GRAD = "$(OUT_DIR)/flow_warp_grad_gpu.o" +FLOWWARP_PROD = "$(OUT_DIR)/flow_warp.so" + +ifeq ($(OS),Windows_NT) + detected_OS := Windows +else + detected_OS := $(shell sh -c 'uname -s 2>/dev/null || echo not') +endif +ifeq ($(detected_OS),Darwin) # Mac OS X + CGPUFLAGS += -undefined dynamic_lookup +endif +ifeq ($(detected_OS),Linux) + CFLAGS += -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__ -D_GLIBCXX_USE_CXX11_ABI=0 +endif + +all: preprocessing downsample correlation flowwarp + +preprocessing: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_DATA_AUG) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_DATA_AUG) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOW) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOW) + $(CXX) -g $(CFLAGS) $(PREPROCESSING_SRC) $(GPU_PROD_DATA_AUG) $(GPU_PROD_FLOW) $(LFLAGS) $(CGPUFLAGS) -o $(PREPROCESSING_PROD) + +downsample: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_DOWNSAMPLE) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_DOWNSAMPLE) + $(CXX) -g $(CFLAGS) $(DOWNSAMPLE_SRC) $(GPU_PROD_DOWNSAMPLE) $(LFLAGS) $(CGPUFLAGS) -o $(DOWNSAMPLE_PROD) + +correlation: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_CORRELATION) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_CORRELATION) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_CORRELATION_GRAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_CORRELATION_GRAD) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_PAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_PAD) + $(CXX) -g $(CFLAGS) $(CORRELATION_SRC) $(GPU_PROD_CORRELATION) $(GPU_PROD_CORRELATION_GRAD) $(GPU_PROD_PAD) $(LFLAGS) $(CGPUFLAGS) -o $(CORRELATION_PROD) + +flowwarp: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOWWARP) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOWWARP) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOWWARP_GRAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOWWARP_GRAD) + $(CXX) -g $(CFLAGS) $(FLOWWARP_SRC) $(GPU_PROD_FLOWWARP) $(GPU_PROD_FLOWWARP_GRAD) $(LFLAGS) $(CGPUFLAGS) -o $(FLOWWARP_PROD) + +clean: + rm -f $(PREPROCESSING_PROD) $(GPU_PROD_FLOW) $(GPU_PROD_DATA_AUG) $(DOWNSAMPLE_PROD) $(GPU_PROD_DOWNSAMPLE) diff --git a/Codes/flownet2/README.md b/Codes/flownet2/README.md new file mode 100644 index 0000000..8647723 --- /dev/null +++ b/Codes/flownet2/README.md @@ -0,0 +1,66 @@ +## FlowNet2 (TensorFlow) + +This repo contains FlowNet2[1] for TensorFlow. It includes FlowNetC, S, CS, CSS, CSS-ft-sd, SD, and 2. + +### Installation +``` +pip install enum +pip install pypng +pip install matplotlib +pip install image +pip install scipy +pip install numpy +pip install tensorflow +``` + +Linux: +`sudo apt-get install python-tk` + +You must have CUDA installed: +`make all` + +### Download weights +To download the weights for all models (4.4GB), run the `download.sh` script in the `checkpoints` directory. All test scripts rely on these checkpoints to work properly. 
+ + +### Flow Generation (1 image pair) + +``` +python -m src.flownet2.test --input_a data/samples/0img0.ppm --input_b data/samples/0img1.ppm --out ./ +``` + +Available models: +* `flownet2` +* `flownet_s` +* `flownet_c` +* `flownet_cs` +* `flownet_css` (can edit test.py to use css-ft-sd weights) +* `flownet_sd` + +If installation is successful, you should predict the following flow from samples/0img0.ppm: + + +### Training +If you would like to train any of the networks from scratch (replace `flownet2` with the appropriate model): +``` +python -m src.flownet2.train +``` +For stacked networks, previous network weights will be loaded and fixed. For example, if training CS, the C weights are loaded and fixed and the S weights are randomly initialized. + + +### Fine-tuning +TODO + +### Benchmarks +Benchmarks are for a forward pass with each model of two 512x384 images. All benchmarks were tested with a K80 GPU and Intel Xeon CPU E5-2682 v4 @ 2.30GHz. Code was executed with TensorFlow-1.2.1 and python 2.7.12 on Ubuntu 16.04. Resulting times were averaged over 10 runs. The first run is always slower as it sets up the Tensorflow Session. + +| | S | C | CS | CSS | SD | 2 +| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | +| First Run | 681.039ms | 898.792ms | 998.584ms | 1063.357ms | 933.806ms | 1882.003ms | +| Subsequent Runs | 38.067ms | 78.789ms | 123.300ms | 161.186ms | 62.061ms | 276.641ms | + + +### Sources +[1] E. Ilg, N. Mayer, T. Saikia, M. Keuper, A. Dosovitskiy, T. Brox +FlowNet 2.0: Evolution of Optical Flow Estimation with Deep Networks, +IEEE Conference in Computer Vision and Pattern Recognition (CVPR), 2017. diff --git a/Codes/flownet2/__init__.py b/Codes/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/__init__.py diff --git a/Codes/flownet2/corr.py b/Codes/flownet2/corr.py new file mode 100644 index 0000000..3301d8c --- /dev/null +++ b/Codes/flownet2/corr.py @@ -0,0 +1,45 @@ +import tensorflow as tf +import numpy as np +import math + +BATCH_SIZE = 8 +HEIGHT = 30 +WIDTH = 60 +CHANNELS = 3 + +NEIGHBORHOOD_SIZE = 41 +MAX_DISPLACEMENT = int(math.ceil(NEIGHBORHOOD_SIZE / 2.0)) +STRIDE_2 = 2 + +assert(STRIDE_2 <= NEIGHBORHOOD_SIZE) + +# Define two feature maps +fmA = tf.ones((BATCH_SIZE, HEIGHT, WIDTH, CHANNELS), dtype=tf.int32) +fmB = tf.convert_to_tensor(np.random.randint(5, size=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS)), dtype=tf.int32) + +depth = int(math.floor((2.0 * MAX_DISPLACEMENT + 1) / STRIDE_2) ** 2) + +print('Output should be size:', (BATCH_SIZE, HEIGHT, WIDTH, depth)) +print('Striding at values: ', [e for e in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2)]) + +def main(): + out = [] + for i in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2): # height + for j in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2): # width + padded_a = tf.pad(fmA, [[0,0], [0, abs(i)], [0, abs(j)], [0, 0]]) + padded_b = tf.pad(fmB, [[0, 0], [abs(i), 0], [abs(j), 0], [0, 0]]) + m = padded_a * padded_b + + height_start_idx = 0 if i <= 0 else i + height_end_idx = height_start_idx + HEIGHT + width_start_idx = 0 if j <= 0 else j + width_end_idx = width_start_idx + WIDTH + cut = m[:, height_start_idx:height_end_idx, width_start_idx:width_end_idx, :] + + final = tf.reduce_sum(cut, 3) + out.append(final) + corr = tf.stack(out, 3) + print('Output size: ', corr.shape) + + +main() diff --git a/Codes/flownet2/src/__init__.py b/Codes/flownet2/src/__init__.py new file 
mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/__init__.py diff --git a/Codes/flownet2/src/correlation.py b/Codes/flownet2/src/correlation.py new file mode 100644 index 0000000..60a5c37 --- /dev/null +++ b/Codes/flownet2/src/correlation.py @@ -0,0 +1,35 @@ +import tensorflow as tf + +_correlation_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/correlation.so")) + + +def correlation(input_a, input_b, kernel_size, max_displacement, stride_1, stride_2, padding): + return _correlation_ops.correlation(input_a, + input_b, + kernel_size, + max_displacement, + stride_1, + stride_2, + padding) + + +@tf.RegisterGradient("Correlation") +def _correlation_grad(corr_op, gradients): + kernel_size = corr_op.get_attr("kernel_size") + max_displacement = corr_op.get_attr("max_displacement") + stride_1 = corr_op.get_attr("stride_1") + stride_2 = corr_op.get_attr("stride_2") + pad = corr_op.get_attr("pad") + + corr_grads = _correlation_ops.correlation_grad(gradients, + corr_op.inputs[0], + corr_op.inputs[1], + kernel_size, + max_displacement, + stride_1, + stride_2, + pad) + + # Return the gradients with respect to input_a and input_b + return corr_grads.backprops_a, corr_grads.backprops_b diff --git a/Codes/flownet2/src/dataloader.py b/Codes/flownet2/src/dataloader.py new file mode 100644 index 0000000..22a6ddb --- /dev/null +++ b/Codes/flownet2/src/dataloader.py @@ -0,0 +1,329 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +import copy +slim = tf.contrib.slim + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/preprocessing.so")) + + +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +class Image(slim.tfexample_decoder.ItemHandler): + """An ItemHandler that decodes a parsed Tensor as an image.""" + + def __init__(self, + image_key=None, + format_key=None, + shape=None, + channels=3, + dtype=tf.uint8, + repeated=False): + """Initializes the image. + Args: + image_key: the name of the TF-Example feature in which the encoded image + is stored. + shape: the output shape of the image as 1-D `Tensor` + [height, width, channels]. If provided, the image is reshaped + accordingly. If left as None, no reshaping is done. A shape should + be supplied only if all the stored images have the same shape. + channels: the number of channels in the image. + dtype: images will be decoded at this bit depth. Different formats + support different bit depths. + See tf.image.decode_image, + tf.decode_raw, + repeated: if False, decodes a single image. If True, decodes a + variable number of image strings from a 1D tensor of strings. + """ + if not image_key: + image_key = 'image/encoded' + + super(Image, self).__init__([image_key]) + self._image_key = image_key + self._shape = shape + self._channels = channels + self._dtype = dtype + self._repeated = repeated + + def tensors_to_item(self, keys_to_tensors): + """See base class.""" + image_buffer = keys_to_tensors[self._image_key] + + if self._repeated: + return functional_ops.map_fn(lambda x: self._decode(x), + image_buffer, dtype=self._dtype) + else: + return self._decode(image_buffer) + + def _decode(self, image_buffer): + """Decodes the image buffer. + Args: + image_buffer: The tensor representing the encoded image tensor. + Returns: + A tensor that represents decoded image of self._shape, or + (?, ?, self._channels) if self._shape is not specified. 
+ """ + def decode_raw(): + """Decodes a raw image.""" + return tf.decode_raw(image_buffer, out_type=self._dtype) + + image = decode_raw() + # image.set_shape([None, None, self._channels]) + if self._shape is not None: + image = tf.reshape(image, self._shape) + + return image + + +def __get_dataset(dataset_config, split_name): + """ + dataset_config: A dataset_config defined in datasets.py + split_name: 'train'/'validate' + """ + with tf.name_scope('__get_dataset'): + if split_name not in dataset_config['SIZES']: + raise ValueError('split name %s not recognized' % split_name) + + IMAGE_HEIGHT, IMAGE_WIDTH = dataset_config['IMAGE_HEIGHT'], dataset_config['IMAGE_WIDTH'] + reader = tf.TFRecordReader + keys_to_features = { + 'image_a': tf.FixedLenFeature((), tf.string), + 'image_b': tf.FixedLenFeature((), tf.string), + 'flow': tf.FixedLenFeature((), tf.string), + } + items_to_handlers = { + 'image_a': Image( + image_key='image_a', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'image_b': Image( + image_key='image_b', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'flow': Image( + image_key='flow', + dtype=tf.float32, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 2], + channels=2), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + return slim.dataset.Dataset( + data_sources=dataset_config['PATHS'][split_name], + reader=reader, + decoder=decoder, + num_samples=dataset_config['SIZES'][split_name], + items_to_descriptions=dataset_config['ITEMS_TO_DESCRIPTIONS']) + + +def config_to_arrays(dataset_config): + output = { + 'name': [], + 'rand_type': [], + 'exp': [], + 'mean': [], + 'spread': [], + 'prob': [], + 'coeff_schedule': [], + } + config = copy.deepcopy(dataset_config) + + if 'coeff_schedule_param' in config: + del config['coeff_schedule_param'] + + # Get all attributes + for (name, value) in config.iteritems(): + if name == 'coeff_schedule_param': + output['coeff_schedule'] = [value['half_life'], + value['initial_coeff'], + value['final_coeff']] + else: + output['name'].append(name) + output['rand_type'].append(value['rand_type']) + output['exp'].append(value['exp']) + output['mean'].append(value['mean']) + output['spread'].append(value['spread']) + output['prob'].append(value['prob']) + + return output + + +# https://github.com/tgebru/transform/blob/master/src/caffe/layers/data_augmentation_layer.cpp#L34 +def _generate_coeff(param, discount_coeff=tf.constant(1.0), default_value=tf.constant(0.0)): + if not all(name in param for name in ['rand_type', 'exp', 'mean', 'spread', 'prob']): + raise RuntimeError('Expected rand_type, exp, mean, spread, prob in `param`') + + rand_type = param['rand_type'] + exp = float(param['exp']) + mean = tf.convert_to_tensor(param['mean'], dtype=tf.float32) + spread = float(param['spread']) # AKA standard deviation + prob = float(param['prob']) + + # Multiply spread by our discount_coeff so it changes over time + spread = spread * discount_coeff + + if rand_type == 'uniform': + value = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'gaussian': + value = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'bernoulli': + if prob > 0.0: + value = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + value = 0.0 + elif rand_type == 'uniform_bernoulli': + tmp1 = 0.0 + tmp2 
= 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + elif rand_type == 'gaussian_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + else: + raise ValueError('Unknown distribution type %s.' % rand_type) + return value + + +def load_batch(dataset_config, split_name, global_step): + num_threads = 32 + reader_kwargs = {'options': tf.python_io.TFRecordOptions( + tf.python_io.TFRecordCompressionType.ZLIB)} + + with tf.name_scope('load_batch'): + dataset = __get_dataset(dataset_config, split_name) + data_provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=num_threads, + common_queue_capacity=2048, + common_queue_min=1024, + reader_kwargs=reader_kwargs) + image_a, image_b, flow = data_provider.get(['image_a', 'image_b', 'flow']) + image_a, image_b, flow = map(tf.to_float, [image_a, image_b, flow]) + + if dataset_config['PREPROCESS']['scale']: + image_a = image_a / 255.0 + image_b = image_b / 255.0 + + crop = [dataset_config['PREPROCESS']['crop_height'], + dataset_config['PREPROCESS']['crop_width']] + config_a = config_to_arrays(dataset_config['PREPROCESS']['image_a']) + config_b = config_to_arrays(dataset_config['PREPROCESS']['image_b']) + + image_as, image_bs, flows = map(lambda x: tf.expand_dims(x, 0), [image_a, image_b, flow]) + + # Perform data augmentation on GPU + with tf.device('/cpu:0'): + image_as, image_bs, transforms_from_a, transforms_from_b = \ + _preprocessing_ops.data_augmentation(image_as, + image_bs, + global_step, + crop, + config_a['name'], + config_a['rand_type'], + config_a['exp'], + config_a['mean'], + config_a['spread'], + config_a['prob'], + config_a['coeff_schedule'], + config_b['name'], + config_b['rand_type'], + config_b['exp'], + config_b['mean'], + config_b['spread'], + config_b['prob'], + config_b['coeff_schedule']) + + noise_coeff_a = None + noise_coeff_b = None + + # Generate and apply noise coeff for A if defined in A params + if 'noise' in dataset_config['PREPROCESS']['image_a']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_a']: + initial_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + + noise_coeff_a = _generate_coeff( + dataset_config['PREPROCESS']['image_a']['noise'], discount_coeff) + noise_a = tf.random_normal(shape=tf.shape(image_as), + mean=0.0, stddev=noise_coeff_a, + dtype=tf.float32) + image_as = tf.clip_by_value(image_as + noise_a, 0.0, 1.0) + + # Generate noise coeff for B if defined in B params + if 'noise' in dataset_config['PREPROCESS']['image_b']: + discount_coeff = tf.constant(1.0) + if 
'coeff_schedule_param' in dataset_config['PREPROCESS']['image_b']: + initial_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + noise_coeff_b = _generate_coeff( + dataset_config['PREPROCESS']['image_b']['noise'], discount_coeff) + + # Combine coeff from a with coeff from b + if noise_coeff_a is not None: + if noise_coeff_b is not None: + noise_coeff_b = noise_coeff_a * noise_coeff_b + else: + noise_coeff_b = noise_coeff_a + + # Add noise to B if needed + if noise_coeff_b is not None: + noise_b = tf.random_normal(shape=tf.shape(image_bs), + mean=0.0, stddev=noise_coeff_b, + dtype=tf.float32) + image_bs = tf.clip_by_value(image_bs + noise_b, 0.0, 1.0) + + # Perform flow augmentation using spatial parameters from data augmentation + flows = _preprocessing_ops.flow_augmentation( + flows, transforms_from_a, transforms_from_b, crop) + + return tf.train.batch([image_as, image_bs, flows], + enqueue_many=True, + batch_size=dataset_config['BATCH_SIZE'], + capacity=dataset_config['BATCH_SIZE'] * 4, + num_threads=num_threads, + allow_smaller_final_batch=False) diff --git a/Codes/flownet2/src/dataset_configs.py b/Codes/flownet2/src/dataset_configs.py new file mode 100644 index 0000000..fbda5d0 --- /dev/null +++ b/Codes/flownet2/src/dataset_configs.py @@ -0,0 +1,153 @@ +""" +Add dataset configurations here. Each dataset must have the following structure: + +NAME = { + IMAGE_HEIGHT: int, + IMAGE_WIDTH: int, + ITEMS_TO_DESCRIPTIONS: { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + SIZES: { + 'train': int, + 'validate': int, (optional) + ... + }, + BATCH_SIZE: int, + PATHS: { + 'train': '', + 'validate': '', (optional) + ... 
+ } +} +""" + +""" +note that one step = one batch of data processed, ~not~ an entire epoch +'coeff_schedule_param': { + 'half_life': 50000, after this many steps, the value will be i + (f - i)/2 + 'initial_coeff': 0.5, initial value + 'final_coeff': 1, final value +}, +""" + +FLYING_CHAIRS_DATASET_CONFIG = { + 'IMAGE_HEIGHT': 384, + 'IMAGE_WIDTH': 512, + 'ITEMS_TO_DESCRIPTIONS': { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + 'SIZES': { + 'train': 22232, + 'validate': 640, + 'sample': 8, + }, + 'BATCH_SIZE': 8, + 'PATHS': { + 'train': './data/tfrecords/fc_train.tfrecords', + 'validate': './data/tfrecords/fc_val.tfrecords', + 'sample': './data/tfrecords/fc_sample.tfrecords', + }, + 'PREPROCESS': { + 'scale': False, + 'crop_height': 320, + 'crop_width': 448, + 'image_a': { + 'translate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0.2, + 'spread': 0.4, + 'prob': 1.0, + }, + 'squeeze': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.3, + 'prob': 1.0, + }, + 'noise': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0.03, + 'spread': 0.03, + 'prob': 1.0, + }, + }, + # All preprocessing to image A will be applied to image B in addition to the following. + 'image_b': { + 'translate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'gamma': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'brightness': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'contrast': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'color': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'coeff_schedule_param': { + 'half_life': 50000, + 'initial_coeff': 0.5, + 'final_coeff': 1, + }, + } + }, +} diff --git a/Codes/flownet2/src/downsample.py b/Codes/flownet2/src/downsample.py new file mode 100644 index 0000000..5e6fc95 --- /dev/null +++ b/Codes/flownet2/src/downsample.py @@ -0,0 +1,8 @@ +import tensorflow as tf + +_downsample = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/downsample.so")) + + +def downsample(tensor, size): + return _downsample.downsample(tensor, size) diff --git a/Codes/flownet2/src/flow_warp.py b/Codes/flownet2/src/flow_warp.py new file mode 100644 index 0000000..fe5fd4d --- /dev/null +++ b/Codes/flownet2/src/flow_warp.py @@ -0,0 +1,15 @@ +import tensorflow as tf + +_flow_warp_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/flow_warp.so")) + + +def flow_warp(image, flow): + return _flow_warp_ops.flow_warp(image, flow) + + +@tf.RegisterGradient("FlowWarp") +def _flow_warp_grad(flow_warp_op, gradients): + return _flow_warp_ops.flow_warp_grad(flow_warp_op.inputs[0], + flow_warp_op.inputs[1], + gradients) diff --git a/Codes/flownet2/src/flowlib.py 
b/Codes/flownet2/src/flowlib.py new file mode 100644 index 0000000..36c56d4 --- /dev/null +++ b/Codes/flownet2/src/flowlib.py @@ -0,0 +1,554 @@ +#!/usr/bin/python +""" +# ============================== +# flowlib.py +# library for optical flow processing +# Author: Ruoteng Li +# Date: 6th Aug 2016 +# ============================== +""" +import png +import numpy as np +import matplotlib.colors as cl +import matplotlib.pyplot as plt +from PIL import Image +import tensorflow as tf + + +UNKNOWN_FLOW_THRESH = 1e7 +SMALLFLOW = 0.0 +LARGEFLOW = 1e8 + +""" +============= +Flow Section +============= +""" + + +def show_flow(filename): + """ + visualize optical flow map using matplotlib + :param filename: optical flow file + :return: None + """ + flow = read_flow(filename) + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + + +def visualize_flow(flow, mode='Y'): + """ + this function visualize the input flow + :param flow: input flow in array + :param mode: choose which color mode to visualize the flow (Y: Ccbcr, RGB: RGB color) + :return: None + """ + if mode == 'Y': + # Ccbcr color wheel + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + elif mode == 'RGB': + (h, w) = flow.shape[0:2] + du = flow[:, :, 0] + dv = flow[:, :, 1] + valid = flow[:, :, 2] + max_flow = max(np.max(du), np.max(dv)) + img = np.zeros((h, w, 3), dtype=np.float64) + # angle layer + img[:, :, 0] = np.arctan2(dv, du) / (2 * np.pi) + # magnitude layer, normalized to 1 + img[:, :, 1] = np.sqrt(du * du + dv * dv) * 8 / max_flow + # phase layer + img[:, :, 2] = 8 - img[:, :, 1] + # clip to [0,1] + small_idx = img[:, :, 0:3] < 0 + large_idx = img[:, :, 0:3] > 1 + img[small_idx] = 0 + img[large_idx] = 1 + # convert to rgb + img = cl.hsv_to_rgb(img) + # remove invalid point + img[:, :, 0] = img[:, :, 0] * valid + img[:, :, 1] = img[:, :, 1] * valid + img[:, :, 2] = img[:, :, 2] * valid + # show + plt.imshow(img) + plt.show() + + return None + + +def read_flow(filename): + """ + read optical flow from Middlebury .flo file + :param filename: name of the flow file + :return: optical flow data in matrix + """ + f = open(filename, 'rb') + magic = np.fromfile(f, np.float32, count=1) + data2d = None + + if 202021.25 != magic: + print('Magic number incorrect. 
Invalid .flo file') + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + print("Reading %d x %d flo file" % (h, w)) + data2d = np.fromfile(f, np.float32, count=2 * w * h) + # reshape data into 3D array (columns, rows, channels) + data2d = np.resize(data2d, (h[0], w[0], 2)) + f.close() + return data2d + + +def read_flow_png(flow_file): + """ + Read optical flow from KITTI .png file + :param flow_file: name of the flow file + :return: optical flow data in matrix + """ + flow_object = png.Reader(filename=flow_file) + flow_direct = flow_object.asDirect() + flow_data = list(flow_direct[2]) + (w, h) = flow_direct[3]['size'] + flow = np.zeros((h, w, 3), dtype=np.float64) + for i in range(len(flow_data)): + flow[i, :, 0] = flow_data[i][0::3] + flow[i, :, 1] = flow_data[i][1::3] + flow[i, :, 2] = flow_data[i][2::3] + + invalid_idx = (flow[:, :, 2] == 0) + flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0 + flow[invalid_idx, 0] = 0 + flow[invalid_idx, 1] = 0 + return flow + + +def write_flow(flow, filename): + """ + write optical flow in Middlebury .flo format + :param flow: optical flow map + :param filename: optical flow file path to be saved + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = flow.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + flow.tofile(f) + f.close() + + +def segment_flow(flow): + h = flow.shape[0] + w = flow.shape[1] + u = flow[:, :, 0] + v = flow[:, :, 1] + + idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW)) + idx2 = (abs(u) == SMALLFLOW) + class0 = (v == 0) & (u == 0) + u[idx2] = 0.00001 + tan_value = v / u + + class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0) + class2 = (tan_value >= 1) & (u >= 0) & (v >= 0) + class3 = (tan_value < -1) & (u <= 0) & (v >= 0) + class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0) + class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0) + class7 = (tan_value < -1) & (u >= 0) & (v <= 0) + class6 = (tan_value >= 1) & (u <= 0) & (v <= 0) + class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0) + + seg = np.zeros((h, w)) + + seg[class1] = 1 + seg[class2] = 2 + seg[class3] = 3 + seg[class4] = 4 + seg[class5] = 5 + seg[class6] = 6 + seg[class7] = 7 + seg[class8] = 8 + seg[class0] = 0 + seg[idx] = 0 + + return seg + + +def flow_error(tu, tv, u, v): + """ + Calculate average end point error + :param tu: ground-truth horizontal flow map + :param tv: ground-truth vertical flow map + :param u: estimated horizontal flow map + :param v: estimated vertical flow map + :return: End point error of the estimated flow + """ + smallflow = 0.0 + ''' + stu = tu[bord+1:end-bord,bord+1:end-bord] + stv = tv[bord+1:end-bord,bord+1:end-bord] + su = u[bord+1:end-bord,bord+1:end-bord] + sv = v[bord+1:end-bord,bord+1:end-bord] + ''' + stu = tu[:] + stv = tv[:] + su = u[:] + sv = v[:] + + idxUnknow = (abs(stu) > UNKNOWN_FLOW_THRESH) | (abs(stv) > UNKNOWN_FLOW_THRESH) + stu[idxUnknow] = 0 + stv[idxUnknow] = 0 + su[idxUnknow] = 0 + sv[idxUnknow] = 0 + + ind2 = [(np.absolute(stu) > smallflow) | (np.absolute(stv) > smallflow)] + index_su = su[ind2] + index_sv = sv[ind2] + an = 1.0 / np.sqrt(index_su ** 2 + index_sv ** 2 + 1) + un = index_su * an + vn = index_sv * an + + index_stu = stu[ind2] + index_stv = stv[ind2] + tn = 1.0 / np.sqrt(index_stu ** 2 + index_stv ** 2 + 1) + tun = index_stu * tn + tvn = index_stv * tn + + ''' + 
angle = un * tun + vn * tvn + (an * tn) + index = [angle == 1.0] + angle[index] = 0.999 + ang = np.arccos(angle) + mang = np.mean(ang) + mang = mang * 180 / np.pi + ''' + + epe = np.sqrt((stu - su) ** 2 + (stv - sv) ** 2) + epe = epe[ind2] + mepe = np.mean(epe) + return mepe + + +def flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, 0] + v = flow[:, :, 1] + + maxu = -999. + maxv = -999. + minu = 999. + minv = 999. + + idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) + u[idxUnknow] = 0 + v[idxUnknow] = 0 + + maxu = max(maxu, np.max(u)) + minu = min(minu, np.min(u)) + + maxv = max(maxv, np.max(v)) + minv = min(minv, np.min(v)) + + rad = np.sqrt(u ** 2 + v ** 2) + maxrad = max(-1, np.max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. %.3f" % (maxrad, minu,maxu, minv, maxv)) + + u = u/(maxrad + np.finfo(float).eps) + v = v/(maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + img[idx] = 0 + + return np.uint8(img) + + +def tf_flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, :, 0] + v = flow[:, :, :, 1] + + maxu = tf.constant(-999.) + maxv = tf.constant(-999.) + minu = tf.constant(999.) + minv = tf.constant(999.) + + zeros = tf.zeros_like(u, dtype=tf.float32) + u = tf.where(tf.greater(u, UNKNOWN_FLOW_THRESH), zeros, u) + v = tf.where(tf.greater(v, UNKNOWN_FLOW_THRESH), zeros, v) + + rad = tf.sqrt(u ** 2 + v ** 2) + maxrad = tf.reduce_max(-1, tf.reduce_max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. 
%.3f" % (maxrad, minu, maxu, minv, maxv)) + + u = u / (maxrad + np.finfo(float).eps) + v = v / (maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + # idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + # img[idx] = 0 + + return np.uint8(img) + + +def evaluate_flow_file(gt, pred): + """ + evaluate the estimated optical flow end point error according to ground truth provided + :param gt: ground truth file path + :param pred: estimated optical flow file path + :return: end point error, float32 + """ + # Read flow files and calculate the errors + gt_flow = read_flow(gt) # ground truth flow + eva_flow = read_flow(pred) # predicted flow + # Calculate errors + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], eva_flow[:, :, 0], eva_flow[:, :, 1]) + return average_pe + + +def evaluate_flow(gt_flow, pred_flow): + """ + gt: ground-truth flow + pred: estimated flow + """ + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], pred_flow[:, :, 0], pred_flow[:, :, 1]) + return average_pe + + +""" +============== +Disparity Section +============== +""" + + +def read_disp_png(file_name): + """ + Read optical flow from KITTI .png file + :param file_name: name of the flow file + :return: optical flow data in matrix + """ + image_object = png.Reader(filename=file_name) + image_direct = image_object.asDirect() + image_data = list(image_direct[2]) + (w, h) = image_direct[3]['size'] + channel = len(image_data[0]) / w + flow = np.zeros((h, w, channel), dtype=np.uint16) + for i in range(len(image_data)): + for j in range(channel): + flow[i, :, j] = image_data[i][j::channel] + return flow[:, :, 0] / 256 + + +def disp_to_flowfile(disp, filename): + """ + Read KITTI disparity file in png format + :param disp: disparity matrix + :param filename: the flow file name to save + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = disp.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + empty_map = np.zeros((height, width), dtype=np.float32) + data = np.dstack((disp, empty_map)) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + data.tofile(f) + f.close() + + +""" +============== +Image Section +============== +""" + + +def read_image(filename): + """ + Read normal image of any format + :param filename: name of the image file + :return: image data in matrix uint8 type + """ + img = Image.open(filename) + im = np.array(img) + return im + + +def warp_image(im, flow): + """ + Use optical flow to warp image to the next + :param im: image to warp + :param flow: optical flow + :return: warped image + """ + from scipy import interpolate + image_height = im.shape[0] + image_width = im.shape[1] + flow_height = flow.shape[0] + flow_width = flow.shape[1] + n = image_height * image_width + (iy, ix) = np.mgrid[0:image_height, 0:image_width] + (fy, fx) = np.mgrid[0:flow_height, 0:flow_width] + fx += flow[:,:,0] + fy += flow[:,:,1] + mask = np.logical_or(fx <0 , fx > flow_width) + mask = np.logical_or(mask, fy < 0) + mask = np.logical_or(mask, fy > flow_height) + fx = np.minimum(np.maximum(fx, 0), flow_width) + fy = np.minimum(np.maximum(fy, 0), flow_height) + points = np.concatenate((ix.reshape(n,1), iy.reshape(n,1)), axis=1) + xi = np.concatenate((fx.reshape(n, 1), fy.reshape(n,1)), axis=1) + warp = np.zeros((image_height, image_width, im.shape[2])) + for i in range(im.shape[2]): + channel = im[:, :, i] + plt.imshow(channel, cmap='gray') + values = channel.reshape(n, 1) + new_channel = 
interpolate.griddata(points, values, xi, method='cubic') + new_channel = np.reshape(new_channel, [flow_height, flow_width]) + new_channel[mask] = 1 + warp[:, :, i] = new_channel.astype(np.uint8) + + return warp.astype(np.uint8) + + +""" +============== +Others +============== +""" + + +def scale_image(image, new_range): + """ + Linearly scale the image into desired range + :param image: input image + :param new_range: the new range to be aligned + :return: image normalized in new range + """ + min_val = np.min(image).astype(np.float32) + max_val = np.max(image).astype(np.float32) + min_val_new = np.array(min(new_range), dtype=np.float32) + max_val_new = np.array(max(new_range), dtype=np.float32) + scaled_image = (image - min_val) / (max_val - min_val) * (max_val_new - min_val_new) + min_val_new + return scaled_image.astype(np.uint8) + + +def compute_color(u, v): + """ + compute optical flow color map + :param u: optical flow horizontal map + :param v: optical flow vertical map + :return: optical flow in color code + """ + [h, w] = u.shape + img = np.zeros([h, w, 3]) + nanIdx = np.isnan(u) | np.isnan(v) + u[nanIdx] = 0 + v[nanIdx] = 0 + + colorwheel = make_color_wheel() + # ncols = np.size(colorwheel, 0) + ncols = colorwheel.shape[0] + + rad = np.sqrt(u**2+v**2) + + a = np.arctan2(-v, -u) / np.pi + + fk = (a+1) / 2 * (ncols - 1) + 1 + + k0 = np.floor(fk).astype(int) + + k1 = k0 + 1 + k1[k1 == ncols+1] = 1 + f = fk - k0 + + for i in range(0, np.size(colorwheel, 1)): + tmp = colorwheel[:, i] + col0 = tmp[k0-1] / 255 + col1 = tmp[k1-1] / 255 + col = (1-f) * col0 + f * col1 + + idx = rad <= 1 + col[idx] = 1-rad[idx]*(1-col[idx]) + notidx = np.logical_not(idx) + + col[notidx] *= 0.75 + img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) + + return img + + +def make_color_wheel(): + """ + Generate color wheel according Middlebury color code + :return: Color wheel + """ + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + + colorwheel = np.zeros([ncols, 3]) + + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) + col += RY + + # YG + colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) + colorwheel[col:col+YG, 1] = 255 + col += YG + + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) + col += GC + + # CB + colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) + colorwheel[col:col+CB, 2] = 255 + col += CB + + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) + col += + BM + + # MR + colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) + colorwheel[col:col+MR, 0] = 255 + + return colorwheel diff --git a/Codes/flownet2/src/flownet2/__init__.py b/Codes/flownet2/src/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet2/__init__.py diff --git a/Codes/flownet2/src/flownet2/flownet2.py b/Codes/flownet2/src/flownet2/flownet2.py new file mode 100644 index 0000000..d44ed10 --- /dev/null +++ b/Codes/flownet2/src/flownet2/flownet2.py @@ -0,0 +1,118 @@ +from ..net import Net, Mode +from ..flownet_css.flownet_css import FlowNetCSS +from ..flownet_sd.flownet_sd import FlowNetSD +from ..flow_warp import flow_warp +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import 
downsample +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNet2(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_css = FlowNetCSS(mode, debug) + self.net_sd = FlowNetSD(mode, debug) + super(FlowNet2, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNet2'): + # Forward pass through FlowNetCSS and FlowNetSD with weights frozen + net_css_predictions = self.net_css.model(inputs, training_schedule, trainable=True) + net_sd_predictions = self.net_sd.model(inputs, training_schedule, trainable=True) + + def ChannelNorm(tensor): + sq = tf.square(tensor) + r_sum = tf.reduce_sum(sq, keep_dims=True, axis=3) + return tf.sqrt(r_sum) + + sd_flow_norm = ChannelNorm(net_sd_predictions['flow']) + css_flow_norm = ChannelNorm(net_css_predictions['flow']) + + flow_warp_sd = flow_warp(inputs['input_b'], net_sd_predictions['flow']) + img_diff_sd = inputs['input_a'] - flow_warp_sd + img_diff_sd_norm = ChannelNorm(img_diff_sd) + + flow_warp_css = flow_warp(inputs['input_b'], net_css_predictions['flow']) + img_diff_css = inputs['input_a'] - flow_warp_css + img_diff_css_norm = ChannelNorm(img_diff_css) + + input_to_fusion = tf.concat([inputs['input_a'], + net_sd_predictions['flow'], + net_css_predictions['flow'], + sd_flow_norm, + css_flow_norm, + img_diff_sd_norm, + img_diff_css_norm], axis=3) + + # Fusion Network + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + fuse_conv0 = slim.conv2d(pad(input_to_fusion), 64, 3, scope='fuse_conv0') + fuse_conv1 = slim.conv2d(pad(fuse_conv0), 64, 3, stride=2, scope='fuse_conv1') + fuse_conv1_1 = slim.conv2d(pad(fuse_conv1), 128, 3, scope='fuse_conv1_1') + fuse_conv2 = slim.conv2d(pad(fuse_conv1_1), 128, 3, + stride=2, scope='fuse_conv2') + fuse_conv2_1 = slim.conv2d(pad(fuse_conv2), 128, 3, scope='fuse_conv2_1') + + predict_flow2 = slim.conv2d(pad(fuse_conv2_1), 2, 3, + scope='predict_flow2', + activation_fn=None) + fuse_deconv1 = antipad(slim.conv2d_transpose(fuse_conv2_1, 32, 4, + stride=2, + scope='fuse_deconv1')) + fuse_upsample_flow2to1 = antipad(slim.conv2d_transpose(predict_flow2, 2, 4, + stride=2, + scope='fuse_upsample_flow2to1', + activation_fn=None)) + concat1 = tf.concat([fuse_conv1_1, fuse_deconv1, + fuse_upsample_flow2to1], axis=3) + fuse_interconv1 = slim.conv2d(pad(concat1), 32, 3, + activation_fn=None, scope='fuse_interconv1') + + predict_flow1 = slim.conv2d(pad(fuse_interconv1), 2, 3, + scope='predict_flow1', + activation_fn=None) + fuse_deconv0 = antipad(slim.conv2d_transpose(concat1, 16, 4, + stride=2, + scope='fuse_deconv0')) + fuse_upsample_flow1to0 = antipad(slim.conv2d_transpose(predict_flow1, 2, 4, + stride=2, + scope='fuse_upsample_flow1to0', + activation_fn=None)) + concat0 = tf.concat([fuse_conv0, fuse_deconv0, fuse_upsample_flow1to0], axis=3) + fuse_interconv0 = slim.conv2d(pad(concat0), 16, 3, + activation_fn=None, scope='fuse_interconv0') + + predict_flow0 = slim.conv2d(pad(fuse_interconv0), 2, + 3, activation_fn=None, 
scope='predict_flow0') + + flow = tf.image.resize_bilinear( + predict_flow0, tf.stack([height, width]), align_corners=True) + print(predict_flow0) + print(flow) + return { + 'predict_flow0': predict_flow0, + 'flow': flow, + } + + def loss(self, flow, predictions): + # L2 loss between predict_flow0, true flow (weighted w/ 0.005) + predict_flow0 = predictions['predict_flow0'] + size = [predict_flow0.shape[1], predict_flow0.shape[2]] + downsampled_flow0 = downsample(flow, size) + loss = average_endpoint_error(downsampled_flow0, predict_flow0) + tf.losses.add_loss(loss) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet2/test.py b/Codes/flownet2/src/flownet2/test.py new file mode 100644 index 0000000..3177614 --- /dev/null +++ b/Codes/flownet2/src/flownet2/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet2 import FlowNet2 + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNet2(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNet2/flownet-2.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet2/train.py b/Codes/flownet2/src/flownet2/train.py new file mode 100644 index 0000000..40c028d --- /dev/null +++ b/Codes/flownet2/src/flownet2/train.py @@ -0,0 +1,24 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet2 import FlowNet2 + +# Create a new network +net = FlowNet2() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_2', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CSS and SD parts of network + checkpoints={ + './checkpoints/FlowNetCSS-ft-sd/flownet-CSS-ft-sd.ckpt-0': ('FlowNet2/FlowNetCSS', 'FlowNet2'), + './checkpoints/FlowNetSD/flownet-SD.ckpt-0': ('FlowNet2/FlowNetSD', 'FlowNet2') + } +) diff --git a/Codes/flownet2/src/flownet_c/__init__.py b/Codes/flownet2/src/flownet_c/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/__init__.py diff --git a/Codes/flownet2/src/flownet_c/flownet_c.py b/Codes/flownet2/src/flownet_c/flownet_c.py new file mode 100644 index 0000000..d333ee2 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/flownet_c.py @@ -0,0 +1,167 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..correlation import correlation +from ..downsample import downsample +import math +import tensorflow as tf +slim = 
tf.contrib.slim + + +class FlowNetC(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetC, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetC'): + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_a_1 = slim.conv2d(pad(inputs['input_a'], 3), 64, 7, scope='conv1') + conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5, scope='conv2') + conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5, scope='conv3') + + conv_b_1 = slim.conv2d(pad(inputs['input_b'], 3), + 64, 7, scope='conv1', reuse=True) + conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5, scope='conv2', reuse=True) + conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5, scope='conv3', reuse=True) + + # Compute cross correlation with leaky relu activation + cc = correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20) + cc_relu = LeakyReLU(cc) + + # Combine cross correlation results with convolution of feature map A + netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir') + # Concatenate along the channels axis + net = tf.concat([netA_conv, cc_relu], axis=3) + + conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + 
activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_a_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_c/test.py b/Codes/flownet2/src/flownet_c/test.py new file mode 100644 index 0000000..692f22d --- /dev/null +++ b/Codes/flownet2/src/flownet_c/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_c import FlowNetC + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetC(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + 
required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_c/train.py b/Codes/flownet2/src/flownet_c/train.py new file mode 100644 index 0000000..9296ac7 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_c import FlowNetC + +# Create a new network +net = FlowNetC() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_c', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/flownet_cs/__init__.py b/Codes/flownet2/src/flownet_cs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/__init__.py diff --git a/Codes/flownet2/src/flownet_cs/flownet_cs.py b/Codes/flownet2/src/flownet_cs/flownet_cs.py new file mode 100644 index 0000000..aeaea47 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/flownet_cs.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_c.flownet_c import FlowNetC +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_c = FlowNetC(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCS'): + # Forward pass through FlowNetC with weights frozen + net_c_predictions = self.net_c.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_c_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_c_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git a/Codes/flownet2/src/flownet_cs/test.py b/Codes/flownet2/src/flownet_cs/test.py new file mode 100644 index 0000000..ae00ff4 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_cs import FlowNetCS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCS/flownet-CS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + 
out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_cs/train.py b/Codes/flownet2/src/flownet_cs/train.py new file mode 100644 index 0000000..9376132 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/train.py @@ -0,0 +1,21 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_cs import FlowNetCS + +# Create a new network +net = FlowNetCS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_cs', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for C part of network + checkpoints={'./checkpoints/FlowNetC/flownet-C.ckpt-0': ('FlowNetCS/FlowNetC', 'FlowNetCS')} +) diff --git a/Codes/flownet2/src/flownet_css/__init__.py b/Codes/flownet2/src/flownet_css/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/__init__.py diff --git a/Codes/flownet2/src/flownet_css/flownet_css.py b/Codes/flownet2/src/flownet_css/flownet_css.py new file mode 100644 index 0000000..93d9db2 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/flownet_css.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_cs.flownet_cs import FlowNetCS +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCSS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_cs = FlowNetCS(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCSS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCSS'): + # Forward pass through FlowNetCS with weights frozen + net_cs_predictions = self.net_cs.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_cs_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_cs_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git 
a/Codes/flownet2/src/flownet_css/test.py b/Codes/flownet2/src/flownet_css/test.py new file mode 100644 index 0000000..9d1249e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_css import FlowNetCSS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCSS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCSS/flownet-CSS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_css/train.py b/Codes/flownet2/src/flownet_css/train.py new file mode 100644 index 0000000..2964f3e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/train.py @@ -0,0 +1,22 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_css import FlowNetCSS + +# Create a new network +net = FlowNetCSS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_css', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CS part of network + checkpoints={ + './checkpoints/FlowNetCS/flownet-CS.ckpt-0': ('FlowNetCSS/FlowNetCS', 'FlowNetCSS')} +) diff --git a/Codes/flownet2/src/flownet_s/__init__.py b/Codes/flownet2/src/flownet_s/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/__init__.py diff --git a/Codes/flownet2/src/flownet_s/flownet_s.py b/Codes/flownet2/src/flownet_s/flownet_s.py new file mode 100644 index 0000000..f6704b1 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/flownet_s.py @@ -0,0 +1,161 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + stacked = False + with tf.variable_scope('FlowNetS'): + if 'warped' in inputs and 'flow' in inputs and 'brightness_error' in inputs: + stacked = True + concat_inputs = tf.concat([inputs['input_a'], + inputs['input_b'], + inputs['warped'], + inputs['flow'], + inputs['brightness_error']], axis=3) + else: + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight 
initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_1 = slim.conv2d(pad(concat_inputs, 3), 64, 7, scope='conv1') + conv_2 = slim.conv2d(pad(conv_1, 2), 128, 5, scope='conv2') + conv_3 = slim.conv2d(pad(conv_2, 2), 256, 5, scope='conv3') + + conv3_1 = slim.conv2d(pad(conv_3), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + 
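+ # The ground-truth flow (scaled by 0.05 above to match the network's output range) is
+ # downsampled to each prediction's resolution and compared via average endpoint error;
+ # the five per-scale errors are combined below with weights 0.32, 0.08, 0.02, 0.01 and
+ # 0.005, from the coarsest prediction (predict_flow6) to the finest (predict_flow2).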
predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_s/test.py b/Codes/flownet2/src/flownet_s/test.py new file mode 100644 index 0000000..ae1b2f3 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_s import FlowNetS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetS/flownet-S.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_s/train.py b/Codes/flownet2/src/flownet_s/train.py new file mode 100644 index 0000000..13a792a --- /dev/null +++ b/Codes/flownet2/src/flownet_s/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_s import FlowNetS + +# Create a new network +net = FlowNetS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_s_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff 
--git a/Codes/flownet2/src/flownet_sd/__init__.py b/Codes/flownet2/src/flownet_sd/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/__init__.py diff --git a/Codes/flownet2/src/flownet_sd/flownet_sd.py b/Codes/flownet2/src/flownet_sd/flownet_sd.py new file mode 100644 index 0000000..2f5c9e4 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/flownet_sd.py @@ -0,0 +1,160 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +# from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetSD(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetSD, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True, reuse=None): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetSD', reuse=reuse): + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + conv0 = slim.conv2d(pad(concat_inputs), 64, 3, scope='conv0') + conv1 = slim.conv2d(pad(conv0), 64, 3, stride=2, scope='conv1') + conv1_1 = slim.conv2d(pad(conv1), 128, 3, scope='conv1_1') + conv2 = slim.conv2d(pad(conv1_1), 128, 3, stride=2, scope='conv2') + conv2_1 = slim.conv2d(pad(conv2), 128, 3, scope='conv2_1') + conv3 = slim.conv2d(pad(conv2_1), 256, 3, stride=2, scope='conv3') + conv3_1 = slim.conv2d(pad(conv3), 256, 3, scope='conv3_1') + conv4 = slim.conv2d(pad(conv3_1), 512, 3, stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), 512, 3, scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), 512, 3, stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), 512, 3, scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + interconv5 = slim.conv2d(pad(concat5), 512, 3, + activation_fn=None, scope='interconv5') + + predict_flow5 = slim.conv2d(pad(interconv5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + interconv4 = slim.conv2d(pad(concat4), 256, 3, + activation_fn=None, scope='interconv4') + + predict_flow4 = slim.conv2d(pad(interconv4), 2, 3, + scope='predict_flow4', + 
activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + interconv3 = slim.conv2d(pad(concat3), 128, 3, + activation_fn=None, scope='interconv3') + + predict_flow3 = slim.conv2d(pad(interconv3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv2, deconv2, upsample_flow3to2], axis=3) + interconv2 = slim.conv2d(pad(concat2), 64, 3, + activation_fn=None, scope='interconv2') + + predict_flow2 = slim.conv2d(pad(interconv2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 0.05 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + # def loss(self, flow, predictions): + # flow = flow * 20.0 + # + # losses = [] + # INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + # + # # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + # predict_flow6 = predictions['predict_flow6'] + # size = [predict_flow6.shape[1], predict_flow6.shape[2]] + # downsampled_flow6 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + # + # # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + # predict_flow5 = predictions['predict_flow5'] + # size = [predict_flow5.shape[1], predict_flow5.shape[2]] + # downsampled_flow5 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + # + # # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + # predict_flow4 = predictions['predict_flow4'] + # size = [predict_flow4.shape[1], predict_flow4.shape[2]] + # downsampled_flow4 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + # + # # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + # predict_flow3 = predictions['predict_flow3'] + # size = [predict_flow3.shape[1], predict_flow3.shape[2]] + # downsampled_flow3 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + # + # # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + # predict_flow2 = predictions['predict_flow2'] + # size = [predict_flow2.shape[1], predict_flow2.shape[2]] + # downsampled_flow2 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + # + # loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + # + # # Return the 'total' loss: loss fns + regularization terms defined in the model + # return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_sd/test.py b/Codes/flownet2/src/flownet_sd/test.py new file mode 100644 index 0000000..b2ac285 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/test.py @@ -0,0 +1,51 @@ +import 
argparse +import os +from ..net import Mode +from .flownet_sd import FlowNetSD + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetSD(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetSD/flownet-SD.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_sd/train.py b/Codes/flownet2/src/flownet_sd/train.py new file mode 100644 index 0000000..86c64e5 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_sd import FlowNetSD + +# Create a new network +net = FlowNetSD() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_sd_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/net.py b/Codes/flownet2/src/net.py new file mode 100644 index 0000000..43b2193 --- /dev/null +++ b/Codes/flownet2/src/net.py @@ -0,0 +1,177 @@ +import abc +from enum import Enum +import os +import tensorflow as tf +from .flowlib import flow_to_image, write_flow +import numpy as np +# from scipy.misc import imread, imsave, imresize +import cv2 +import uuid +from .training_schedules import LONG_SCHEDULE +slim = tf.contrib.slim + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +class Mode(Enum): + TRAIN = 1 + TEST = 2 + + +class Net(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.global_step = slim.get_or_create_global_step() + self.mode = mode + self.debug = debug + + @abc.abstractmethod + def model(self, inputs, training_schedule, trainable=True): + """ + Defines the model and returns a tuple of Tensors needed for calculating the loss. + """ + return + + @abc.abstractmethod + def loss(self, **kwargs): + """ + Accepts prediction Tensors from the output of `model`. + Returns a single Tensor representing the total loss of the model. 
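+ Concrete networks in this package build it from average endpoint errors between the
+ (downsampled) ground-truth flow and the predicted flow(s), plus any regularization
+ terms collected while building the model.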
+ """ + return + """ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_6.png \ + --input_b /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_7.png \ + --out ./ + python -m src.flownet_sd.test --input_a 006.png --input_b 007.png --out ./ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/ped1/frames/testing/01/006.jpg \ + --input_b /home/liuwen/ssd/videogan/ped1/frames/testing/01/007.jpg \ + --out ./ + """ + def test(self, checkpoint, input_a_path, input_b_path, out_path, save_image=True, save_flo=False): + input_a = cv2.imread(input_a_path) + input_b = cv2.imread(input_b_path) + + input_a = cv2.resize(input_a, (512, 384)) + input_b = cv2.resize(input_b, (512, 384)) + print(input_a.shape, input_b.shape) + + # Convert from RGB -> BGR + # input_a = input_a[..., [2, 1, 0]] + # input_b = input_b[..., [2, 1, 0]] + + # Scale from [0, 255] -> [0.0, 1.0] if needed + if input_a.max() > 1.0: + input_a = input_a / 255.0 + if input_b.max() > 1.0: + input_b = input_b / 255.0 + + # TODO: This is a hack, we should get rid of this + training_schedule = LONG_SCHEDULE + + inputs = { + 'input_a': tf.expand_dims(tf.constant(input_a, dtype=tf.float32), 0), + 'input_b': tf.expand_dims(tf.constant(input_b, dtype=tf.float32), 0), + } + predictions = self.model(inputs, training_schedule) + pred_flow = predictions['flow'] + + saver = tf.train.Saver() + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + saver.restore(sess, checkpoint) + pred_flow = sess.run(pred_flow)[0, :, :, :] + + np.save('temporal_ped1', pred_flow) + + unique_name = 'flow-' + str(uuid.uuid4()) + if save_image: + flow_img = flow_to_image(pred_flow) + full_out_path = os.path.join(out_path, unique_name + '.png') + cv2.imwrite(full_out_path, flow_img) + + if save_flo: + full_out_path = os.path.join(out_path, unique_name + '.flo') + write_flow(pred_flow, full_out_path) + + def train(self, log_dir, training_schedule, input_a, input_b, flow, checkpoints=None): + tf.summary.image("image_a", input_a, max_outputs=2) + tf.summary.image("image_b", input_b, max_outputs=2) + + self.learning_rate = tf.train.piecewise_constant( + self.global_step, + [tf.cast(v, tf.int64) for v in training_schedule['step_values']], + training_schedule['learning_rates']) + + optimizer = tf.train.AdamOptimizer( + self.learning_rate, + training_schedule['momentum'], + training_schedule['momentum2']) + + inputs = { + 'input_a': input_a, + 'input_b': input_b, + } + predictions = self.model(inputs, training_schedule) + total_loss = self.loss(flow, predictions) + tf.summary.scalar('loss', total_loss) + + if checkpoints: + for (checkpoint_path, (scope, new_scope)) in checkpoints.iteritems(): + variables_to_restore = slim.get_variables(scope=scope) + renamed_variables = { + var.op.name.split(new_scope + '/')[1]: var + for var in variables_to_restore + } + restorer = tf.train.Saver(renamed_variables) + with tf.Session() as sess: + restorer.restore(sess, checkpoint_path) + + # Show the generated flow in TensorBoard + if 'flow' in predictions: + pred_flow_0 = predictions['flow'][0, :, :, :] + pred_flow_0 = tf.py_func(flow_to_image, [pred_flow_0], tf.uint8) + pred_flow_1 = predictions['flow'][1, :, :, :] + pred_flow_1 = tf.py_func(flow_to_image, [pred_flow_1], tf.uint8) + pred_flow_img = tf.stack([pred_flow_0, pred_flow_1], 0) + tf.summary.image('pred_flow', pred_flow_img, max_outputs=2) + + 
true_flow_0 = flow[0, :, :, :] + true_flow_0 = tf.py_func(flow_to_image, [true_flow_0], tf.uint8) + true_flow_1 = flow[1, :, :, :] + true_flow_1 = tf.py_func(flow_to_image, [true_flow_1], tf.uint8) + true_flow_img = tf.stack([true_flow_0, true_flow_1], 0) + tf.summary.image('true_flow', true_flow_img, max_outputs=2) + + train_op = slim.learning.create_train_op( + total_loss, + optimizer, + summarize_gradients=True) + + if self.debug: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf.train.start_queue_runners(sess) + slim.learning.train_step( + sess, + train_op, + self.global_step, + { + 'should_trace': tf.constant(1), + 'should_log': tf.constant(1), + 'logdir': log_dir + '/debug', + } + ) + else: + slim.learning.train( + train_op, + log_dir, + # session_config=tf.ConfigProto(allow_soft_placement=True), + global_step=self.global_step, + save_summaries_secs=60, + number_of_steps=training_schedule['max_iter'] + ) diff --git a/Codes/flownet2/src/ops/build/.gitkeep b/Codes/flownet2/src/ops/build/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/ops/build/.gitkeep diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc new file mode 100644 index 0000000..4e92f45 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc @@ -0,0 +1,160 @@ +#define EIGEN_USE_THREADS + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class CorrelationGradKernel : public OpKernel { + public: + explicit CorrelationGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and verify their dimensions + const Tensor& gradients_t = ctx->input(0); + const Tensor& input_a_t = ctx->input(1); + const Tensor& input_b_t = ctx->input(2); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input + const int batch_size = input_a_t.dim_size(0); + const int in_height = input_a_t.dim_size(1); + const int in_width = input_a_t.dim_size(2); + const int in_channels = input_a_t.dim_size(3); + const int in_count_per_sample = in_height * in_width * in_channels; + const int padded_height = in_height + 2 * pad; + const int padded_width = in_width + 2 * pad; + + // The size of unreachable border region on each side + const int kernel_radius = (kernel_size - 1) / 2; + const int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + const int out_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + const int out_width = ceil((float)(padded_width - border_size * 
2) / (float)stride_1); + + const int neighborhood_grid_radius = max_displacement / stride_2; + const int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + const int out_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the outputs + Tensor *output_a_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_a_t.shape(), &output_a_gradient_t)); + Tensor *output_b_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, input_b_t.shape(), &output_b_gradient_t)); + + // Get the tensors + auto gradients = gradients_t.tensor<float, 4>(); + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output_a_gradient = output_a_gradient_t->tensor<float, 4>(); + auto output_b_gradient = output_b_gradient_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_b.data()); + + CorrelationGradA(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_b.data(), + gradients.data(), + output_a_gradient.data()); + + CorrelationGradB(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_a.data(), + gradients.data(), + output_b_gradient.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("CorrelationGrad") + .Device(DEVICE_GPU), + CorrelationGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc new file mode 100644 index 0000000..19e3a40 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc @@ -0,0 +1,262 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define ROUND_OFF 50000 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void 
CorrelateDataBackward0(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_a_gradient, + const float *input_b, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + + // Get X,Y ranges and clamp + // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + // We add round_off before_s1 the int division and subtract round_off after + // it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + int ymin = (y - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + + // Same here: + int xmax = (x - max_displacement + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement + round_off_s1) / stride_1 - round_off; + + float sum = 0; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get input_b data: + int s2o = stride_2 * o; + int s2p = stride_2 * p; + int idx_input_b = ((item * padded_in_height + (y + s2p)) * padded_in_width + (x + s2o)) * + in_channels + k; + float input_b_tmp = input_b[idx_input_b]; // input_b[x+s2o,y+s2p,k] + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + // gradient[x,y,o,p] + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_b_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_a_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_a_gradient[input_a_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +__global__ void CorrelateDataBackward1(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_b_gradient, + const float *input_a, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + 
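+    // Gradient w.r.t. input_b: for this element of input_b at padded position (x, y) and
+    // channel k, every neighborhood offset (o, p) is visited; the output locations whose
+    // shifted correlation windows cover (x, y) each contribute
+    // gradient * input_a[x - s2o, y - s2p, k], and the accumulated sum is normalized by
+    // the correlation patch size (kernel area * channels) at the end.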
+ // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + float sum = 0; + + // Height (y) + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + // Width (x) + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + int s2o = stride_2 * o; + int s2p = stride_2 * p; + + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off + // after it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride_1 + + 1 - round_off; + int ymin = (y - 2 * kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride_1 + + 1 - round_off; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + // Same here: + int xmax = (x - max_displacement - s2o + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement - s2p + round_off_s1) / stride_1 - round_off; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + // Get input_a data: + int idx_input_a = ((item * padded_in_height + (y - s2p)) * padded_in_width + (x - s2o)) * + in_channels + k; + float input_a_tmp = input_a[idx_input_a]; + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_a_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_b_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_b_gradient[input_b_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, // h * w * ch + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward0 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_a_gradient, input_b, gradient); + } +} + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const 
int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward1 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_b_gradient, input_a, gradient); + } +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc new file mode 100644 index 0000000..f8a5193 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc @@ -0,0 +1,137 @@ +#define EIGEN_USE_THREADS + +#include <utility> + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<typename Device> +class CorrelationKernel : public OpKernel { + public: + explicit CorrelationKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input (already padded) + int batch_size = input_a_t.dim_size(0); + int input_height = input_a_t.dim_size(1); + int input_width = input_a_t.dim_size(2); + int input_channels = input_a_t.dim_size(3); + int padded_height = input_height + 2 * pad; + int padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int output_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int output_width = ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + OP_REQUIRES(ctx, output_height >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input height.")); + OP_REQUIRES(ctx, output_width >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input width.")); + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = 
neighborhood_grid_radius * 2 + 1; + int output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ batch_size, output_height, output_width, output_channels }), + &output_t)); + + // Get the tensors + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_b.data()); + + // Perform cross correlation + Correlation(ctx->eigen_device<Device>(), + padded_input_a.data(), + padded_input_b.data(), + batch_size, + output_height, + output_width, + output_channels, + output_height * output_width * output_channels, + padded_height, + padded_width, + input_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + kernel_size, + stride_1, + stride_2, + output.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("Correlation") + .Device(DEVICE_GPU), + CorrelationKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc new file mode 100644 index 0000000..c63e489 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc @@ -0,0 +1,153 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void CorrelateData(int batch_size, + int out_width, + int out_height, + int out_channels, + int out_count, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + int in_width_padded, + int in_height_padded, + int in_channels, + const float *input_a, + const float *input_b, + float *output) { + extern __shared__ char patch_data_char[]; + + float *patch_data = (float *)patch_data_char; + + // First (upper left) position of kernel upper-left corner in current center + // position of neighborhood in image 1 + int x1 = blockIdx.x * stride_1 + max_displacement; + int y1 = 
blockIdx.y * stride_1 + max_displacement; + int item = blockIdx.z; + int ch_off = threadIdx.x; + + // Load 3D patch into shared shared memory + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idx1 = ((item * in_height_padded + y1 + j) * in_width_padded + x1 + i) * + in_channels + ch; + int idxPatchData = ji_off + ch; + patch_data[idxPatchData] = input_a[idx1]; + } + } + } + + __syncthreads(); + + __shared__ float sum[WARPS_PER_BLOCK * THREADS_PER_WARP]; + + // Compute correlation + for (int out_channel = 0; out_channel < out_channels; out_channel++) { + sum[ch_off] = 0; + + int s2o = (out_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int s2p = (out_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int x2 = x1 + s2o; + int y2 = y1 + s2p; + + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idxPatchData = ji_off + ch; + int idx2 = ((item * in_height_padded + y2 + j) * in_width_padded + x2 + i) * + in_channels + ch; + + sum[ch_off] += patch_data[idxPatchData] * input_b[idx2]; + } + } + } + + __syncthreads(); + + if (ch_off == 0) { + float total_sum = 0; + + for (int idx = 0; idx < WARPS_PER_BLOCK * THREADS_PER_WARP; idx++) { + total_sum += sum[idx]; + } + const int sumelems = kernel_size * kernel_size * in_channels; + const int index = (blockIdx.y * out_width + blockIdx.x) * out_channels + out_channel; + + /* from Caffe: const int index = ((out_channel * out_height + + blockIdx.y) * out_width) + blockIdx.x; */ + output[index + item * out_count] = total_sum / (float)sumelems; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + // n = 0 + // caffe: ((k * H + h) * W + w) + n * K * H * W + // tf: (h * W + w) * K + k + n * H * W * K + } + } +} + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output) { + dim3 totalBlocksCorr(out_width, out_height, batch_size); + dim3 threadsPerBlock(THREADS_PER_WARP *WARPS_PER_BLOCK); + const int shared_memory_per_block = (kernel_size * kernel_size) * in_channels; + + CorrelateData << < totalBlocksCorr, threadsPerBlock, shared_memory_per_block * sizeof(float), + device.stream() >> > ( + batch_size, out_width, out_height, out_channels, out_count, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + kernel_size, stride_1, stride_2, in_width_padded, in_height_padded, in_channels, + input_a, input_b, output); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.h b/Codes/flownet2/src/ops/correlation/correlation_kernel.h new file mode 100644 index 0000000..a1dfb62 --- /dev/null +++ 
b/Codes/flownet2/src/ops/correlation/correlation_kernel.h @@ -0,0 +1,77 @@ +#ifndef FLOWNET_CORRELATION_H_ +#define FLOWNET_CORRELATION_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output); + + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient); + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient); +} // end namespace tensorflow + +#endif // FLOWNET_CORRELATION_H_ diff --git a/Codes/flownet2/src/ops/correlation/correlation_op.cc b/Codes/flownet2/src/ops/correlation/correlation_op.cc new file mode 100644 index 0000000..4f420f0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_op.cc @@ -0,0 +1,83 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +Status SetOutput(InferenceContext *c) { + ShapeHandle input_a, input_b, input; + + // Get shapes of both inputs and verify they are rank 4 + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_b)); + + // Verify inputs are same dimensions + TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input)); + + // Get the attributes + int kernel_size, max_displacement, stride_1, stride_2, pad; + TF_RETURN_IF_ERROR(c->GetAttr("kernel_size", &kernel_size)); + TF_RETURN_IF_ERROR(c->GetAttr("max_displacement", &max_displacement)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_1", &stride_1)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_2", &stride_2)); + TF_RETURN_IF_ERROR(c->GetAttr("pad", &pad)); + + // Get dimensions of input (already padded) + int64 batch = c->Value(c->Dim(input, 0)); + int64 input_height = c->Value(c->Dim(input, 1)); + int64 input_width = c->Value(c->Dim(input, 2)); + int64 padded_height = 
input_height + 2 * pad; + int64 padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int64 output_height = (int64)ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int64 output_width = (int64)ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + // TODO: Verify output size >= 1 + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + int64 output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Set output shape + c->set_output(0, c->MakeShape({ batch, output_height, output_width, output_channels })); + return Status::OK(); +} + +REGISTER_OP("Correlation") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("output: float32") +.SetShapeFn(SetOutput); + +REGISTER_OP("CorrelationGrad") +.Input("gradients: float32") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("backprops_a: float32") +.Output("backprops_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Output gradients should be the same dimensions as the inputs + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &out)); + c->set_output(0, out); + c->set_output(1, out); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/pad.cu.cc b/Codes/flownet2/src/ops/correlation/pad.cu.cc new file mode 100644 index 0000000..0b6c93d --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.cu.cc @@ -0,0 +1,76 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "pad.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void PadData( + const float *in, + int in_widthheight, + int in_width, + int in_height, + int out_width, + int out_height, + int channels, + int padding, + float *out) { + int xy = blockIdx.x * blockDim.x + threadIdx.x; + + int x = xy % in_width; + int y = xy / in_width; + int ch = blockIdx.y; + int n = blockIdx.z; + + if (xy >= in_widthheight) { + out[((n * out_height + y) * out_width + x) * channels + ch] = 0.0; + return; + } + + float value = in[((n * in_height + y) * in_width + x) * channels + ch]; + + __syncthreads(); + + int xpad = x + padding; + int ypad = y + padding; + + out[((n * out_height + ypad) * out_width + xpad) * channels + ch] = value; +} + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output) { + int in_widthheight = input_width * input_height; + int threads_per_block = 16; + dim3 totalBlocks((in_widthheight - 1) / threads_per_block + 1, input_channels, batch_size); + + cudaMemset(output, 0, batch_size * output_height * output_width * input_channels * sizeof(float)); + + int padding = (output_height - input_height) / 2; + + // LAUNCH KERNEL + PadData << < totalBlocks, threads_per_block, 0, device.stream() >> > ( + input, + in_widthheight, + input_width, + input_height, + output_width, + 
output_height, + input_channels, + padding, + output); +} +} +#endif // if GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/pad.h b/Codes/flownet2/src/ops/correlation/pad.h new file mode 100644 index 0000000..afb4df0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.h @@ -0,0 +1,20 @@ +#ifndef FLOWNET_PAD_H_ +#define FLOWNET_PAD_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output); +} // end namespace tensorflow + +#endif // ifndef FLOWNET_PAD_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc new file mode 100644 index 0000000..eefe247 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc @@ -0,0 +1,47 @@ +#define EIGEN_USE_THREADS + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +class DownsampleKernel : public OpKernel { + public: + explicit DownsampleKernel(OpKernelConstruction* ctx) : OpKernel(ctx) { + // Get the size [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("size", &size_)); + OP_REQUIRES(ctx, size_.size() == 2, errors::InvalidArgument("size must be 2 dimensions")); + } + + void Compute(OpKernelContext* ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_t = ctx->input(0); + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + + // Allocate the memory for the output + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({input_t.dim_size(0), size_[0], size_[1], input_t.dim_size(3)}), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + Downsample(ctx->eigen_gpu_device(), input, output); + } + + private: + std::vector<int32> size_; +}; + +REGISTER_KERNEL_BUILDER(Name("Downsample") + .Device(DEVICE_GPU), + DownsampleKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.h b/Codes/flownet2/src/ops/downsample/downsample_kernel.h new file mode 100644 index 0000000..bcc4e3f --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.h @@ -0,0 +1,18 @@ +#ifndef FLOWNET_DOWNSAMPLE_H_ +#define FLOWNET_DOWNSAMPLE_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output); + +} // end namespace tensorflow + +#endif // FLOWNET_DOWNSAMPLE_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc new file mode 100644 index 0000000..b7629a0 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc @@ -0,0 +1,108 
@@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define CUDART_NAN_F __int_as_float(0x7fffffff) + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ void DownsampleKernel( + const int32 nthreads, + const float* input_ptr, + float* output_ptr, + const int in_width, + const int in_height, + const int out_width, + const int out_height, + const int channels, + const float width_scale, + const float height_scale, + const int wradius, + const int hradius) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int c = index % channels; + const int destx = (index / channels) % out_width; + const int desty = (index / channels / out_width) % out_height; + const int n = (index / channels / out_width) / out_height; + + const float srcx = ((float)destx / (float)(out_width - 1)) * (float)(in_width - 1); + const float srcy = ((float)desty / (float)(out_height - 1)) * (float)(in_height - 1); + + const int isrcx = round(srcx); + const int isrcy = round(srcy); + + float accum_value = 0; + float accum_weight = 0; + float accum_nan = 0; + + for (int dy = -hradius; dy <= hradius; dy++) { + int yoff = isrcy + dy; + // + for (int dx = -wradius; dx <= wradius; dx++) { + int xoff = isrcx + dx; + + if (xoff >= 0 && yoff >= 0 && xoff < in_width && yoff < in_height) { + int idx = ((n * in_height + yoff) * in_width + xoff) * channels + c; + float sample = input_ptr[idx]; + float weight = fmaxf(0.0f, 1.0f - (fabsf((float)xoff - srcx) / width_scale)) + * fmaxf(0.0f, 1.0f - (fabsf((float)yoff - srcy) / height_scale)); + if (sample != sample) { // isnan + accum_nan += weight; + sample = 0; + weight = 0; + } + accum_value += sample * weight; + accum_weight += weight; + } + } + } + + if (accum_nan / accum_weight > 0.5) { + output_ptr[index] = CUDART_NAN_F; + } else { + output_ptr[index] = accum_value / accum_weight; + } + } +} + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int out_channels = output.dimension(3); + const int total_count = batch_size * out_height * out_width * out_channels; + + const int in_height = input.dimension(1); + const int in_width = input.dimension(2); + + const float width_scale = (float)(in_width - 1) / (float)(out_width - 1); + const float height_scale = (float)(in_height - 1) / (float)(out_height - 1); + + const int wradius = ceil(width_scale); + const int hradius = ceil(height_scale); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, device); + DownsampleKernel<<<config.block_count, config.thread_per_block, 0, + device.stream()>>>(total_count, input.data(), output.data(), + in_width, in_height, out_width, out_height, out_channels, + width_scale, height_scale, wradius, hradius); + return device.ok(); +} + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/downsample/downsample_op.cc b/Codes/flownet2/src/ops/downsample/downsample_op.cc new file mode 100644 index 0000000..6980dc7 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_op.cc @@ -0,0 +1,30 @@ 
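+// Registers the "Downsample" op and its shape function: the output keeps the input's
+// batch and channel dimensions and takes its height and width from the "size" attribute.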
+#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> size_; + c->GetAttr("size", &size_); + DimensionHandle height = c->MakeDim(size_[0]); + DimensionHandle width = c->MakeDim(size_[1]); + c->set_output(0, c->MakeShape({batch, height, width, depth})); + return Status::OK(); +} + +REGISTER_OP("Downsample") + .Input("input: float32") + .Attr("size: list(int) >= 2") + .Output("output: float32") + .SetShapeFn(SetOutputToSizedImage); + +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc new file mode 100644 index 0000000..b5d9602 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc @@ -0,0 +1,48 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpKernel : public OpKernel { + public: + explicit FlowWarpKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& input_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + input_t.dim_size(0) == flow_t.dim_size(0) && input_t.dim_size( + 1) == flow_t.dim_size(1) && input_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_t.shape(), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + FlowWarp(ctx->eigen_gpu_device(), input, flow, output); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarp") + .Device(DEVICE_GPU), + FlowWarpKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc new file mode 100644 index 0000000..2007151 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc @@ -0,0 +1,130 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_warp.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define RA_TILE 32 +#define RA_ROWS 8 + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpKernel( + const float 
*image, + const float *flow, + float *warped, + const int batch_size, + const int channels, + const int cblocks, + const int width, + const int wblocks, + const int height, + const int width_height) { + int y = blockIdx.y; + int n = blockIdx.z; + + __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X]; + __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1]; + + int x; + int c; + + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((threadIdx.y == 0) && (x < width)) { + const int idx = ((n * height + y) * width + x) * 2; + x2_buf[threadIdx.x] = float(x) + flow[idx]; + y2_buf[threadIdx.x] = float(y) + flow[idx + 1]; + } + + __syncthreads(); + + float x2 = x2_buf[threadIdx.y]; + float y2 = y2_buf[threadIdx.y]; + + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels; + int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels; + int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels; + int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels; + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + float coeffTL = (1 - alpha) * (1 - beta); + float coeffTR = alpha * (1 - beta); + float coeffBL = (1 - alpha) * beta; + float coeffBR = alpha * beta; + + for (int cb = 0; cb < cblocks; cb++) { + __syncthreads(); + + buffer[threadIdx.y][threadIdx.x] = 0.0; + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.x; + + if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) { + buffer[threadIdx.y][threadIdx.x] = // buffer [x][c] + coeffTL * image[off_TL + c] + + coeffTR * image[off_TR + c] + + coeffBL * image[off_BL + c] + + coeffBR * image[off_BR + c]; + } + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.y; + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((c < channels) && (x < width)) { + warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y]; + } + } +} + +void FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + const int width_height = width * height; + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, FW_TILE_C); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + input.data(), + flow.data(), + output.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.h b/Codes/flownet2/src/ops/flow_warp/flow_warp.h new file mode 100644 index 0000000..2780316 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.h @@ -0,0 +1,28 @@ +#ifndef FLOWNET_FLOWWARP_H_ +#define FLOWNET_FLOWWARP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +#define FW_THREADS 32 +#define FW_TILE_X FW_THREADS +#define FW_TILE_C FW_THREADS + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void 
FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output); + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad); +} // end namespace tensorflow + +#endif // FLOWNET_FLOWWARP_H_ diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc new file mode 100644 index 0000000..9f3e7ea --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc @@ -0,0 +1,57 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpGradKernel : public OpKernel { + public: + explicit FlowWarpGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& image_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + const Tensor& grad_t = ctx->input(2); + + OP_REQUIRES(ctx, image_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + image_t.dim_size(0) == flow_t.dim_size(0) && image_t.dim_size( + 1) == flow_t.dim_size(1) && image_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *image_grad_t; + Tensor *flow_grad_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, image_t.shape(), &image_grad_t)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, flow_t.shape(), &flow_grad_t)); + + auto image = image_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto gradient = grad_t.tensor<float, 4>(); + auto image_grad = image_grad_t->tensor<float, 4>(); + auto flow_grad = flow_grad_t->tensor<float, 4>(); + + FlowWarpGrad(ctx->eigen_gpu_device(), + image, + flow, + gradient, + image_grad, + flow_grad); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarpGrad") + .Device(DEVICE_GPU), + FlowWarpGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc new file mode 100644 index 0000000..25248c8 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc @@ -0,0 +1,126 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "flow_warp.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpGradKernel( + const float *image, + float *image_grad, + const float *flow, + float *flow_grad, + const float *gradient, + int batch_size, + int channels, + int cblocks, + int width, + int wblocks, + int height, + int widthheight) { + int x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if (x >= width) return; + + int y = blockIdx.y; + int n = blockIdx.z; + + const int flow_idx = ((n * height + y) * width + x) * 2; + float x2 = float(x) + flow[flow_idx]; + float y2 = float(y) + 
flow[flow_idx + 1]; + + if ((x2 >= 0.f) && (y2 >= 0.f) && (x2 < width) && (y2 < height)) { + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + + for (int c = 0; c < channels; c++) { + float warped_diff_value = gradient[((n * height + y) * width + x) * channels + c]; + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_R) * channels + c], + warped_diff_value * alpha * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * beta); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_R) * channels + c], + warped_diff_value * alpha * beta); + } + + float gamma = iy2_B - y2; + float bot_diff = 0; + + for (int c = 0; c < channels; c++) { + int ch_off = (n * channels + c) * height; + float temp = 0; + temp += gamma * + (image[((n * height + iy2_T) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_B) * width + ix2_L) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2] = bot_diff; + + gamma = ix2_R - x2; + bot_diff = 0; + + for (int c = 0; c < channels; c++) { + float temp = 0; + temp += gamma * + (image[((n * height + iy2_B) * width + ix2_L) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_R) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2 + 1] = bot_diff; + } +} + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad) { + const int batch_size = image.dimension(0); + const int height = image.dimension(1); + const int width = image.dimension(2); + const int channels = image.dimension(3); + const int width_height = width * height; + + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, 1); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(image_grad.data(), 0, batch_size * height * width * channels * sizeof(float)); + cudaMemset(flow_grad.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpGradKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + image.data(), + image_grad.data(), + flow.data(), + flow_grad.data(), + gradient.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc new file mode 100644 index 0000000..aef9c74 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc @@ -0,0 +1,23 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include 
"tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +REGISTER_OP("FlowWarp") +.Input("image: float32") +.Input("flow: float32") +.Output("output: float32") +.SetShapeFn(::tensorflow::shape_inference::UnchangedShape); + +REGISTER_OP("FlowWarpGrad") +.Input("image: float32") +.Input("flow: float32") +.Input("gradient: float32") +.Output("image_grad: float32") +.Output("flow_grad: float32") +.SetShapeFn([](shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc new file mode 100644 index 0000000..b93dfa6 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc @@ -0,0 +1,420 @@ +#include "augmentation_base.h" + +#include <math.h> +#include <random> + +namespace tensorflow { +/** TransMat Functions **/ +void AugmentationLayerBase::TransMat::fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height) { + leftMultiply(1, 0, -0.5 * out_width, + 0, 1, -0.5 * out_height); + + if (coeff->angle) { + leftMultiply(cos(coeff->angle()), -sin(coeff->angle()), 0, + sin(coeff->angle()), cos(coeff->angle()), 0); + } + + if (coeff->dx || coeff->dy) { + leftMultiply(1, 0, coeff->dx() * out_width, + 0, 1, coeff->dy() * out_height); + } + + if (coeff->zoom_x || coeff->zoom_y) { + leftMultiply(1.0 / coeff->zoom_x(), 0, 0, + 0, 1.0 / coeff->zoom_y(), 0); + } + + leftMultiply(1, 0, 0.5 * src_width, + 0, 1, 0.5 * src_height); +} + +void AugmentationLayerBase::TransMat::fromTensor(const float *tensor_data) { + t0 = tensor_data[0]; + t1 = tensor_data[1]; + t2 = tensor_data[2]; + t3 = tensor_data[3]; + t4 = tensor_data[4]; + t5 = tensor_data[5]; +} + +AugmentationLayerBase::TransMat AugmentationLayerBase::TransMat::inverse() { + float a = this->t0, b = this->t1, c = this->t2; + float d = this->t3, e = this->t4, f = this->t5; + + float denom = a * e - b * d; + + TransMat result; + + result.t0 = e / denom; + result.t1 = b / -denom; + result.t2 = (c * e - b * f) / -denom; + result.t3 = d / -denom; + result.t4 = a / denom; + result.t5 = (c * d - a * f) / denom; + + return result; +} + +void AugmentationLayerBase::TransMat::leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5) { + float t0 = this->t0, t1 = this->t1, t2 = this->t2; + float t3 = this->t3, t4 = this->t4, t5 = this->t5; + + this->t0 = t0 * u0 + t3 * u1; + this->t1 = t1 * u0 + t4 * u1; + this->t2 = t2 * u0 + t5 * u1 + u2; + this->t3 = t0 * u3 + t3 * u4; + this->t4 = t1 * u3 + t4 * u4; + this->t5 = t2 * u3 + t5 * u4 + u5; +} + +void AugmentationLayerBase::TransMat::toIdentity() { + t0 = 1; t1 = 0; t2 = 0; + t3 = 0; t4 = 1; t5 = 0; +} + +/** AugmentationCoeff Functions **/ +void AugmentationCoeff::clear() { + // Spatial variables + dx.clear(); + dy.clear(); + angle.clear(); + zoom_x.clear(); + zoom_y.clear(); + + // Chromatic variables + gamma.clear(); + brightness.clear(); + contrast.clear(); + color1.clear(); + color2.clear(); + color3.clear(); +} + +void AugmentationCoeff::combine_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = dx() * coeff.dx(); + } + + if (coeff.dy) { + dy = dy() * coeff.dy(); + } + + if (coeff.angle) { + angle = angle() * coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = zoom_x() * 
coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = zoom_y() * coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = brightness() * coeff.brightness(); + } + + if (coeff.contrast) { + contrast = contrast() * coeff.contrast(); + } + + if (coeff.color1) { + color1 = color1() * coeff.color1(); + } + + if (coeff.color2) { + color2 = color2() * coeff.color2(); + } + + if (coeff.color3) { + color3 = color3() * coeff.color3(); + } +} + +void AugmentationCoeff::replace_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = coeff.dx(); + } + + if (coeff.dy) { + dy = coeff.dy(); + } + + if (coeff.angle) { + angle = coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = coeff.brightness(); + } + + if (coeff.contrast) { + contrast = coeff.contrast(); + } + + if (coeff.color1) { + color1 = coeff.color1(); + } + + if (coeff.color2) { + color2 = coeff.color2(); + } + + if (coeff.color3) { + color3 = coeff.color3(); + } +} + +/** AugmentationLayerBase Functions **/ +float AugmentationLayerBase::rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value) { + std::random_device rd; // Will be used to obtain a seed for the random number + // engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + + float spread = param.spread * discount_coeff; + + if (param.rand_type == "uniform_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (param.spread > 0.0) { + std::uniform_real_distribution<> uniform(param.mean - spread, + param.mean + spread); + tmp1 = uniform(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else if (param.rand_type == "gaussian_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (spread > 0.0) { + std::normal_distribution<> normal(param.mean, spread); + tmp1 = normal(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else { + throw "Unknown random type: " + param.rand_type; + } +} + +void AugmentationLayerBase::generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.gamma) { + coeff.gamma = rng_generate(aug.gamma(), discount_coeff, coeff.gamma.get_default()); + } + + if (aug.brightness) { + coeff.brightness = + rng_generate(aug.brightness(), discount_coeff, coeff.brightness.get_default()); + } + + if (aug.contrast) { + coeff.contrast = rng_generate(aug.contrast(), discount_coeff, coeff.contrast.get_default()); + } + + if (aug.color) { + coeff.color1 = rng_generate(aug.color(), discount_coeff, coeff.color1.get_default()); + coeff.color2 = rng_generate(aug.color(), discount_coeff, coeff.color2.get_default()); + coeff.color3 = rng_generate(aug.color(), discount_coeff, coeff.color3.get_default()); + } +} + +void AugmentationLayerBase::generate_spatial_coeffs(float discount_coeff, + const 
AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.translate) { + coeff.dx = rng_generate(aug.translate(), discount_coeff, coeff.dx.get_default()); + coeff.dy = rng_generate(aug.translate(), discount_coeff, coeff.dy.get_default()); + } + + if (aug.rotate) { + coeff.angle = rng_generate(aug.rotate(), discount_coeff, coeff.angle.get_default()); + } + + if (aug.zoom) { + coeff.zoom_x = rng_generate(aug.zoom(), discount_coeff, coeff.zoom_x.get_default()); + coeff.zoom_y = coeff.zoom_x(); + } + + if (aug.squeeze) { + float squeeze_coeff = rng_generate(aug.squeeze(), discount_coeff, 1.0); + coeff.zoom_x = coeff.zoom_x() * squeeze_coeff; + coeff.zoom_y = coeff.zoom_y() * squeeze_coeff; + } +} + +void AugmentationLayerBase::generate_valid_spatial_coeffs( + float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height) { + int x, y; + float x1, y1, x2, y2; + int counter = 0; + int good_params = 0; + AugmentationCoeff incoming_coeff(coeff); + + while (good_params < 4 && counter < 50) { + coeff.clear(); + AugmentationLayerBase::generate_spatial_coeffs(discount_coeff, aug, coeff); + coeff.combine_with(incoming_coeff); + + // Check if all 4 corners of the transformed image fit into the original + // image + good_params = 0; + + for (x = 0; x < out_width; x += out_width - 1) { + for (y = 0; y < out_height; y += out_height - 1) { + // move the origin + x1 = x - 0.5 * out_width; + y1 = y - 0.5 * out_height; + + // rotate + x2 = cos(coeff.angle()) * x1 - sin(coeff.angle()) * y1; + y2 = sin(coeff.angle()) * x1 + sin(coeff.angle()) * y1; + + // translate + x2 = x2 + coeff.dx() * out_width; + y2 = y2 + coeff.dy() * out_height; + + // zoom + x2 = x2 / coeff.zoom_x(); + y2 = y2 / coeff.zoom_y(); + + // move the origin back + x2 = x2 + 0.5 * src_width; + y2 = y2 + 0.5 * src_height; + + if (!((floor(x2) < 0) || (floor(x2) > src_width - 2.0) || + (floor(y2) < 0) || (floor(y2) > src_height - 2.0))) { + good_params++; + } + } + } + counter++; + } + + if (counter >= 50) { + printf("Warning: No suitable spatial transformation after %d attempts.\n", counter); + coeff.clear(); + coeff.replace_with(incoming_coeff); + } +} + +void AugmentationLayerBase::copy_chromatic_coeffs_to_tensor( + const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out) +{ + float *out_ptr = out.data(); + int counter = 0; + + for (AugmentationCoeff coeff : coeff_arr) { + out_ptr[counter + 0] = coeff.gamma(); + out_ptr[counter + 1] = coeff.brightness(); + out_ptr[counter + 2] = coeff.contrast(); + out_ptr[counter + 3] = coeff.color1(); + out_ptr[counter + 4] = coeff.color2(); + out_ptr[counter + 5] = coeff.color3(); + counter += 6; + } +} + +void AugmentationLayerBase::copy_spatial_coeffs_to_tensor( + const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert) +{ + float *out_ptr = out.data(); + int counter = 0; + TransMat t; + + for (AugmentationCoeff coeff : coeff_arr) { + t.toIdentity(); + t.fromCoeff(&coeff, out_width, out_height, src_width, src_height); + + if (invert) { + t = t.inverse(); + } + + out_ptr[counter + 0] = t.t0; + out_ptr[counter + 1] = t.t1; + out_ptr[counter + 2] = t.t2; + out_ptr[counter + 3] = t.t3; + out_ptr[counter + 4] = t.t4; + out_ptr[counter + 5] = t.t5; + counter += 6; + } +} +} diff --git 
a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h new file mode 100644 index 0000000..d2aba2c --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h @@ -0,0 +1,228 @@ +#ifndef AUGMENTATION_LAYER_BASE_H_ +#define AUGMENTATION_LAYER_BASE_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +#include <iostream> +#include <string> +#include <vector> + +namespace tensorflow { +template<typename T> +class OptionalType { + public: + OptionalType(const T default_value) : default_value(default_value), has_value(false) {} + + operator bool() const { + return has_value; + } + + OptionalType& operator=(T val) { + has_value = true; + value = val; + return *this; + } + + const T operator()() const { + return has_value ? value : default_value; + } + + void clear() { + has_value = false; + } + + const T get_default() { + return default_value; + } + + private: + T value; + bool has_value; + const T default_value; +}; + +class AugmentationCoeff { + public: + // Spatial Types + OptionalType<float>dx; + OptionalType<float>dy; + OptionalType<float>angle; + OptionalType<float>zoom_x; + OptionalType<float>zoom_y; + + // Chromatic Types + OptionalType<float>gamma; + OptionalType<float>brightness; + OptionalType<float>contrast; + OptionalType<float>color1; + OptionalType<float>color2; + OptionalType<float>color3; + + AugmentationCoeff() : dx(0.0), dy(0.0), angle(0.0), zoom_x(1.0), zoom_y(1.0), gamma(1.0), + brightness(0.0), contrast(1.0), color1(1.0), color2(1.0), color3(1.0) {} + + AugmentationCoeff(const AugmentationCoeff& coeff) : AugmentationCoeff() { + replace_with(coeff); + } + + void clear(); + + void combine_with(const AugmentationCoeff& coeff); + + void replace_with(const AugmentationCoeff& coeff); +}; + +typedef struct AugmentationParam { + std::string rand_type; + bool should_exp; + float mean; + float spread; + float prob; +} AugmentationParam; + +class AugmentationParams { + public: + int crop_height; + int crop_width; + + // Spatial options + OptionalType<struct AugmentationParam>translate; + OptionalType<struct AugmentationParam>rotate; + OptionalType<struct AugmentationParam>zoom; + OptionalType<struct AugmentationParam>squeeze; + + // Chromatic options + OptionalType<struct AugmentationParam>gamma; + OptionalType<struct AugmentationParam>brightness; + OptionalType<struct AugmentationParam>contrast; + OptionalType<struct AugmentationParam>color; + + inline AugmentationParams(int crop_height, + int crop_width, + std::vector<std::string>params_name, + std::vector<std::string>params_rand_type, + std::vector<bool> params_exp, + std::vector<float> params_mean, + std::vector<float> params_spread, + std::vector<float> params_prob) : + crop_height(crop_height), + crop_width(crop_width), + translate(AugmentationParam()), + rotate(AugmentationParam()), + zoom(AugmentationParam()), + squeeze(AugmentationParam()), + gamma(AugmentationParam()), + brightness(AugmentationParam()), + contrast(AugmentationParam()), + color(AugmentationParam()) { + for (int i = 0; i < params_name.size(); i++) { + const std::string name = params_name[i]; + const std::string rand_type = params_rand_type[i]; + const bool should_exp = params_exp[i]; + const float mean = params_mean[i]; + const float spread = params_spread[i]; + const float prob = params_prob[i]; + + struct AugmentationParam param = { rand_type, should_exp, mean, spread, prob }; + + if (name == "translate") { + this->translate = param; + } else if 
(name == "rotate") { + this->rotate = param; + } else if (name == "zoom") { + this->zoom = param; + } else if (name == "squeeze") { + this->squeeze = param; + } else if (name == "noise") { + // NoOp: We handle noise on the Python side + } else if (name == "gamma") { + this->gamma = param; + } else if (name == "brightness") { + this->brightness = param; + } else if (name == "contrast") { + this->contrast = param; + } else if (name == "color") { + this->color = param; + } else { + std::cout << "Ignoring unknown augmentation parameter: " << name << std::endl; + } + } + } + + bool should_do_spatial_transform() { + return this->translate || this->rotate || this->zoom || this->squeeze; + } + + bool should_do_chromatic_transform() { + return this->gamma || this->brightness || this->contrast || this->color; + } +}; + +class AugmentationLayerBase { + public: + class TransMat { + /** + * Translation matrix class for spatial augmentation + * | 0 1 2 | + * | 3 4 5 | + */ + + public: + float t0, t1, t2; + float t3, t4, t5; + + + void fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height); + + void fromTensor(const float *tensor_data); + + TransMat inverse(); + + void leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5); + + void toIdentity(); + }; + + // TODO: Class ChromaticCoeffs + + static float rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value); + + static void clear_spatial_coeffs(AugmentationCoeff& coeff); + static void generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_valid_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height); + + static void copy_chromatic_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out); + static void copy_spatial_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert = false); +}; +} // namespace tensorflow + +#endif // AUGMENTATION_LAYER_BASE_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc new file mode 100644 index 0000000..77b8c83 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc @@ -0,0 +1,461 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <iostream> +#include <random> +#include <vector> + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline float clamp(float f, float a, float b) { + return 
fmaxf(a, fminf(f, b)); +} + +template<> +void Augment(OpKernelContext *context, + const CPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int64 channel_count = batch_size * out_height * out_width; + const int kCostPerChannel = 10; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + + Shard(worker_threads.num_threads, + worker_threads.workers, + channel_count, + kCostPerChannel, + [batch_size, channels, src_width, + src_height, src_count, out_width, out_height, src_data, + out_data, transMats, chromatic_coeffs]( + int64 start_channel, int64 end_channel) { + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + for (int index = start_channel; index < end_channel; index++) { + int x = index % out_width; + int y = (index / out_width) % out_height; + int n = index / out_width / out_height; + + const float *transMat = transMats + n * 6; + + float gamma, brightness, contrast; + + if (chromatic_coeffs) { + gamma = chromatic_coeffs[n * 6 + 0]; + brightness = chromatic_coeffs[n * 6 + 1]; + contrast = chromatic_coeffs[n * 6 + 2]; + } + + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + int srcTLIdxOffset = ((n * src_height + (int)tly) * src_width + (int)tlx) * channels; + + // ((n * src_height + tly) * src_width + (tlx + 1)) * channels + int srcTRIdxOffset = srcTLIdxOffset + channels; + + // ((n * src_height + (tly + 1)) * src_width + tlx) * channels + int srcBLIdxOffset = srcTLIdxOffset + channels * src_width; + + // ((n * src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + int srcBRIdxOffset = srcTLIdxOffset + channels + channels * src_width; + + // Variables for chromatic transform + int data_index[3]; + float rgb[3]; + float mean_in = 0; + float mean_out = 0; + + for (int c = 0; c < channels; c++) { + // Bilinear interpolation + int srcTLIdx = srcTLIdxOffset + c; + int srcTRIdx = std::min(srcTRIdxOffset + c, src_count); + int srcBLIdx = std::min(srcBLIdxOffset + c, src_count); + int srcBRIdx = std::min(srcBRIdxOffset + c, src_count); + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + if (chromatic_coeffs) { + // Gather data for chromatic transform + data_index[c] = index * channels + c; + rgb[c] = dest; + mean_in += rgb[c]; + + // Note: coeff[3] == color1, coeff[4] == color2, ... 
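                  // Coefficient layout (see copy_chromatic_coeffs_to_tensor): each batch item
                  // owns six floats, in order [gamma, brightness, contrast, color1, color2, color3].
                  // The per-channel color factor is applied inside this loop; gamma, brightness
                  // and contrast are applied once per pixel after the loop.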
+ rgb[c] *= chromatic_coeffs[n * 6 + (3 + c)]; + + mean_out += rgb[c]; + } else { + out_data[index * channels + c] = dest; + } + } + + float brightness_coeff = mean_in / (mean_out + 0.01f); + + if (chromatic_coeffs) { + // Chromatic transformation + for (int c = 0; c < channels; c++) { + // compensate brightness + rgb[c] = clamp(rgb[c] * brightness_coeff, 0.0f, 1.0f); + + // gamma change + rgb[c] = pow(rgb[c], gamma); + + // brightness change + rgb[c] = rgb[c] + brightness; + + // contrast change + rgb[c] = 0.5f + (rgb[c] - 0.5f) * contrast; + + out_data[data_index[c]] = clamp(rgb[c], 0.0f, 1.0f); + } + } + } + }); +} + +template<typename Device> +class DataAugmentation : public OpKernel { + public: + explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + + // TODO: Verify params are all the same length + + // Get the tensors for params_a and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_a_rand_type", ¶ms_a_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", ¶ms_a_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_coeff_schedule", ¶ms_a_coeff_schedule_)); + + // Get the tensors for params_b and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_b_rand_type", ¶ms_b_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", ¶ms_b_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_coeff_schedule", ¶ms_b_coeff_schedule_)); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + // Get the global step value + const Tensor& global_step_t = ctx->input(2); + auto global_step_eigen = global_step_t.tensor<int64, 0>(); + const int64 global_step = global_step_eigen.data()[0]; + + // Dimension constants + const int batch_size = input_a_t.dim_size(0); + const int src_height = input_a_t.dim_size(1); + const int src_width = input_a_t.dim_size(2); + const int channels = input_a_t.dim_size(3); + const int src_count = batch_size * src_height * src_width * channels; + const int out_height = crop_[0]; + const int out_width = crop_[1]; + const int out_count = batch_size * out_height * out_width * channels; + + // All tensors for this op + Tensor chromatic_coeffs_a_t; + Tensor chromatic_coeffs_b_t; + + // Allocate the memory for the output images + Tensor *output_a_t; + Tensor *output_b_t; + + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(1, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_b_t)); + + // Allocate the memory for the output spatial transforms + Tensor *spat_transform_a_t; + Tensor *spat_transform_b_t; + + 
OP_REQUIRES_OK(ctx, + ctx->allocate_output(2, TensorShape({ batch_size, 6 }), + &spat_transform_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(3, TensorShape({ batch_size, 6 }), + &spat_transform_b_t)); + + // Compute discount for coefficients if using a schedule + float discount_coeff_a = 1.0; + float discount_coeff_b = 1.0; + + if (params_a_coeff_schedule_.size() == 3) { + float half_life = params_a_coeff_schedule_[0]; + float initial_coeff = params_a_coeff_schedule_[1]; + float final_coeff = params_a_coeff_schedule_[2]; + discount_coeff_a = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + + if (params_b_coeff_schedule_.size() == 3) { + if (params_a_coeff_schedule_.size() == 3) { + discount_coeff_b = discount_coeff_a; + } else { + float half_life = params_b_coeff_schedule_[0]; + float initial_coeff = params_b_coeff_schedule_[1]; + float final_coeff = params_b_coeff_schedule_[2]; + discount_coeff_b = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + } + + /*** BEGIN AUGMENTATION TO IMAGE A ***/ + auto input_a = input_a_t.tensor<float, 4>(); + auto output_a = output_a_t->tensor<float, 4>(); + + // Load augmentation parameters for image A + AugmentationParams aug_a = AugmentationParams(out_height, out_width, + params_a_name_, + params_a_rand_type_, + params_a_exp_, + params_a_mean_, + params_a_spread_, + params_a_prob_); + + std::vector<AugmentationCoeff> coeffs_a; + + + bool gen_spatial_transform = aug_a.should_do_spatial_transform(); + bool gen_chromatic_transform = aug_a.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff; + + if (gen_spatial_transform) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_a, aug_a, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_a, aug_a, coeff); + } + + coeffs_a.push_back(coeff); + } + + // Copy spatial coefficients A to the output Tensor on the CPU + // (output for FlowAugmentation) + auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, + out_width, out_height, + src_width, src_height, + spat_transform_a); + + float *chromatic_coeffs_a_data = NULL; + + if (gen_chromatic_transform) { + // Allocate a temporary tensor to hold the chromatic coefficients + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_a_t)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_a = chromatic_coeffs_a_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_a, chromatic_coeffs_a); + chromatic_coeffs_a_data = chromatic_coeffs_a.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_a.data(), + output_a.data(), + spat_transform_a.data(), + chromatic_coeffs_a_data); + + /*** END AUGMENTATION TO IMAGE A ***/ + + /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + AugmentationParams aug_b = AugmentationParams(out_height, out_width, + params_b_name_, + params_b_rand_type_, + params_b_exp_, + params_b_mean_, + params_b_spread_, + params_b_prob_); + + 
std::vector<AugmentationCoeff> coeffs_b; + + bool gen_spatial_transform_b = aug_b.should_do_spatial_transform(); + bool gen_chromatic_transform_b = aug_b.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff(coeffs_a[n]); + + // If we did a spatial transform on image A, we need to do the same one + // (+ possibly more) on image B + if (gen_spatial_transform_b) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_b, aug_b, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform_b) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_b, aug_b, coeff); + } + + coeffs_b.push_back(coeff); + } + + /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + + /*** BEGIN AUGMENTATION TO IMAGE B ***/ + auto input_b = input_b_t.tensor<float, 4>(); + auto output_b = output_b_t->tensor<float, 4>(); + + // Copy spatial coefficients B to the output Tensor on the CPU + auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b); + + float *chromatic_coeffs_b_data = NULL; + + if (gen_chromatic_transform || gen_chromatic_transform_b) { + // Allocate a temporary tensor to hold the chromatic coefficients + tensorflow::AllocatorAttributes pinned_allocator; + pinned_allocator.set_on_host(true); + pinned_allocator.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_b_t, pinned_allocator)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_b = chromatic_coeffs_b_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_b, chromatic_coeffs_b); + chromatic_coeffs_b_data = chromatic_coeffs_b.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_b.data(), + output_b.data(), + spat_transform_b.data(), + chromatic_coeffs_b_data); + + // FlowAugmentation needs the inverse + // TODO: To avoid rewriting, can we invert when we read on the + // FlowAugmentation side? 
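      // At this point spat_transform_b holds the forward (output -> source) mapping that
      // Augment() used to warp image B; the call below overwrites it with the inverted
      // matrix, since FlowAugmentation needs the source -> augmented-B mapping to
      // re-express flow endpoints in the cropped frame.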
+ AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b, + true); + + /*** END AUGMENTATION TO IMAGE B ***/ + } + + private: + std::vector<int32>crop_; + + // Params A + std::vector<string>params_a_name_; + std::vector<string>params_a_rand_type_; + std::vector<bool>params_a_exp_; + std::vector<float>params_a_mean_; + std::vector<float>params_a_spread_; + std::vector<float>params_a_prob_; + std::vector<float>params_a_coeff_schedule_; + + // Params B + std::vector<string>params_b_name_; + std::vector<string>params_b_rand_type_; + std::vector<bool>params_b_exp_; + std::vector<float>params_b_mean_; + std::vector<float>params_b_spread_; + std::vector<float>params_b_prob_; + std::vector<float>params_b_coeff_schedule_; +}; + + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_CPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<CPUDevice>) + +#if GOOGLE_CUDA + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_GPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc new file mode 100644 index 0000000..7a2101d --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc @@ -0,0 +1,348 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +inline __device__ __host__ float clamp(float f, float a, float b) { + return fmaxf(a, fminf(f, b)); +} + +__global__ void SpatialAugmentation( + const int32 nthreads, + const int src_width, + const int src_height, + const int channels, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + int c = index % channels; + int x = (index / channels) % out_width; + int y = (index / channels / out_width) % out_height; + int n = index / channels / out_width / out_height; + + const float *transMat = transMats + n * 6; + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + // Bilinear interpolation + int srcTLIdx = ((n * src_height + tly) * src_width + tlx) * channels + c; + int srcTRIdx = min((int)(((n * src_height + tly) * src_width + (tlx + 1)) * channels + c), + src_count); + int srcBLIdx = min((int)(((n * src_height + (tly + 1)) * src_width + tlx) * channels + c), + src_count); + int srcBRIdx = min((int)(((n 
* src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + c), + src_count); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + out_data[index] = dest; + } +} + +typedef Eigen::GpuDevice GPUDevice; + +template<> +void Augment(OpKernelContext *context, + const GPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int out_count = batch_size * out_height * out_width * channels; + CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d); + + printf("Chromatic transform not yet implemented on GPU, ignoring."); + + SpatialAugmentation << < config.block_count, config.thread_per_block, 0, d.stream() >> > ( + config.virtual_thread_count, src_width, src_height, channels, src_count, + out_width, out_height, + src_data, out_data, transMats); +} + +// +// template<typename Device> +// class DataAugmentation : public OpKernel { +// public: +// explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { +// // Get the crop [height, width] tensor and verify its dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); +// OP_REQUIRES(ctx, crop_.size() == 2, +// errors::InvalidArgument("crop must be 2 dimensions")); +// +// // TODO: Verify params are all the same length +// +// // Get the tensors for params_a and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_a_rand_type", +// ¶ms_a_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", +// ¶ms_a_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); +// +// // Get the tensors for params_b and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_b_rand_type", +// ¶ms_b_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", +// ¶ms_b_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); +// } +// +// void Compute(OpKernelContext *ctx) override { +// const GPUDevice& device = ctx->eigen_gpu_device(); +// +// // Get the input images +// const Tensor& input_a_t = ctx->input(0); +// const Tensor& input_b_t = ctx->input(1); +// +// // Dimension constants +// const int batch_size = input_a_t.dim_size(0); +// const int src_height = input_a_t.dim_size(1); +// const int src_width = input_a_t.dim_size(2); +// const int channels = input_a_t.dim_size(3); +// const int src_count = batch_size * src_height * src_width * channels; +// const int out_height = crop_[0]; +// const int out_width = crop_[1]; +// const int out_count = batch_size * out_height * out_width * channels; +// +// // Allocate the memory for the output images +// Tensor *output_a_t; +// Tensor *output_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(0, TensorShape({ batch_size, +// crop_[0], crop_[1], +// 
channels }), +// &output_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(1, TensorShape({ batch_size, +// crop_[0], crop_[1], +// channels }), +// &output_b_t)); +// +// // Allocate the memory for the output spatial transforms +// Tensor *spat_transform_a_t; +// Tensor *spat_transform_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(2, TensorShape({ batch_size, 6 }), +// &spat_transform_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(3, TensorShape({ batch_size, 6 }), +// &spat_transform_b_t)); +// +// // Allocate temporary pinned memory for the spatial transforms to be +// used +// // on the GPU +// tensorflow::AllocatorAttributes pinned_allocator; +// pinned_allocator.set_on_host(true); +// pinned_allocator.set_gpu_compatible(true); +// +// Tensor spat_transform_a_pinned_t; +// Tensor spat_transform_b_pinned_t; +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_a_pinned_t, +// pinned_allocator)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_b_pinned_t, +// pinned_allocator)); +// auto spat_transform_a_pinned = spat_transform_a_pinned_t.tensor<float, +// 2>(); +// auto spat_transform_b_pinned = spat_transform_b_pinned_t.tensor<float, +// 2>(); +// +// /*** BEGIN AUGMENTATION TO IMAGE A ***/ +// auto input_a = input_a_t.tensor<float, 4>(); +// auto output_a = output_a_t->tensor<float, 4>(); +// +// // Load augmentation parameters for image A +// AugmentationParams aug_a = AugmentationParams(out_height, out_width, +// params_a_name_, +// params_a_rand_type_, +// params_a_exp_, +// params_a_mean_, +// params_a_spread_, +// params_a_prob_); +// +// std::vector<AugmentationCoeff> coeffs_a; +// +// bool gen_spatial_transform = aug_a.should_do_spatial_transform(); +// +// for (int n = 0; n < batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_a, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_a.push_back(coeff); +// } +// +// // Copy spatial coefficients A to the output Tensor on the CPU (output +// for +// // FlowAugmentation) +// auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_a); +// +// // ...as well as a Tensor going to the GPU +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_a_pinned); +// +// CudaLaunchConfig config = GetCudaLaunchConfig(out_count, device); +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_a.data(), output_a.data(), spat_transform_a_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE A ***/ +// +// /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// AugmentationParams aug_b = AugmentationParams(out_height, out_width, +// params_b_name_, +// params_b_rand_type_, +// params_b_exp_, +// params_b_mean_, +// params_b_spread_, +// params_b_prob_); +// +// std::vector<AugmentationCoeff> coeffs_b; +// +// gen_spatial_transform = aug_b.should_do_spatial_transform(); +// +// for (int n = 0; n < 
batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_b, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_b.push_back(coeff); +// } +// +// /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// +// /*** BEGIN AUGMENTATION TO IMAGE B ***/ +// auto input_b = input_b_t.tensor<float, 4>(); +// auto output_b = output_b_t->tensor<float, 4>(); +// +// // Copy spatial coefficients B to the output Tensor on the CPU +// auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_b, +// true); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_b_pinned); +// +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_b.data(), output_b.data(), spat_transform_b_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE B ***/ +// } +// +// private: +// std::vector<int32>crop_; +// +// // Params A +// std::vector<string>params_a_name_; +// std::vector<string>params_a_rand_type_; +// std::vector<bool>params_a_exp_; +// std::vector<float>params_a_mean_; +// std::vector<float>params_a_spread_; +// std::vector<float>params_a_prob_; +// +// // Params B +// std::vector<string>params_b_name_; +// std::vector<string>params_b_rand_type_; +// std::vector<bool>params_b_exp_; +// std::vector<float>params_b_mean_; +// std::vector<float>params_b_spread_; +// std::vector<float>params_b_prob_; +// }; +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h new file mode 100644 index 0000000..545b8a0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h @@ -0,0 +1,22 @@ +#ifndef FLOWNET_DATA_AUGMENTATION_H_ +#define FLOWNET_DATA_AUGMENTATION_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<class Device> +void Augment(OpKernelContext *context, + const Device & d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs); +} // namespace tensorflow +#endif // FLOWNET_DATA_AUGMENTATION_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc new file mode 100644 index 0000000..b5cc11f --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc @@ -0,0 +1,129 @@ +#define EIGEN_USE_THREADS + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline int clamp(int f, int a, int b) { + return std::max(a, std::min(f, b)); +} + +template<> +void FillFlowAugmentation(const 
CPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int src_height = flows.dimension(1); + const int src_width = flows.dimension(2); + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + float *output_ptr = output.data(); + const float *flow_ptr = flows.data(); + + for (int n = 0; n < batch_size; n++) { + const float *transMatA = transforms_from_a.data() + n * 6; + const float *transMatB = transforms_from_b.data() + n * 6; + + for (int y = 0; y < out_height; y++) { + int outputIdxOffset = (n * out_height + y) * out_width; + + for (int x = 0; x < out_width; x++) { + // Apply transformation matrix applied to first image + const float xpos1 = x * transMatA[0] + y * transMatA[1] + transMatA[2]; + const float ypos1 = x * transMatA[3] + y * transMatA[4] + transMatA[5]; + + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to second image + const float xpos3 = xpos2 * transMatB[0] + ypos2 * transMatB[1] + transMatB[2]; + const float ypos3 = xpos2 * transMatB[3] + ypos2 * transMatB[4] + transMatB[5]; + + output_ptr[(outputIdxOffset + x) * 2 + 0] = xpos3 - (float)x; + output_ptr[(outputIdxOffset + x) * 2 + 1] = ypos3 - (float)y; + } + } + } +} + +template<typename Device> +class FlowAugmentation : public OpKernel { + public: + explicit FlowAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& flows_t = ctx->input(0); + const Tensor& transforms_from_a_t = ctx->input(1); + const Tensor& transforms_from_b_t = ctx->input(2); + + OP_REQUIRES(ctx, flows_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_a_t.shape()) && + transforms_from_a_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_a_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_a should be num_images x 6")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_b_t.shape()) && + transforms_from_b_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_b_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_b should be num_images x 6")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ flows_t.dim_size(0), crop_[0], crop_[1], + flows_t.dim_size(3) }), &output_t)); + + // Perform flow augmentation + auto flows = flows_t.tensor<float, 4>(); + auto transforms_from_a = transforms_from_a_t.tensor<float, 2>(); + auto transforms_from_b = transforms_from_b_t.tensor<float, 2>(); + auto 
output = output_t->tensor<float, 4>(); + + FillFlowAugmentation(ctx->eigen_device<Device>(), + output, + flows, + transforms_from_a, + transforms_from_b); + } + + private: + std::vector<int32>crop_; +}; + +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_CPU), + FlowAugmentation<CPUDevice>) + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_GPU), + FlowAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h new file mode 100644 index 0000000..7795991 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h @@ -0,0 +1,19 @@ +#ifndef FLOWNET_FLOW_AUG_H_ +#define FLOWNET_FLOW_AUG_H_ + +// See docs in ../ops/image_ops.cc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template<class Device> +void FillFlowAugmentation(const Device& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b); +} // end namespace tensorflow + +#endif // FLOWNET_FLOW_AUG_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc new file mode 100644 index 0000000..7e10864 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc @@ -0,0 +1,95 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +inline __device__ __host__ int clamp(int f, int a, int b) { + return max(a, min(f, b)); +} + +__global__ void FillFlowAugmentationKernel( + const int32 nthreads, + const float *flow_ptr, + const float *transforms_from_a, + const float *inv_transforms_from_b, + const int src_total_count, const int src_height, const int src_width, + const int batch_size, const int out_height, + const int out_width, float *output_ptr) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const float x = (float)(index % out_width); + const float y = (float)((index / out_width) % out_height); + const int n = (index / out_width / out_height); + + const int transformIdx = n * 6; + + // Apply transformation matrix applied to second image + const float xpos1 = x * transforms_from_a[transformIdx + 0] + + y * transforms_from_a[transformIdx + 1] + + transforms_from_a[transformIdx + 2]; + const float ypos1 = x * transforms_from_a[transformIdx + 3] + + y * transforms_from_a[transformIdx + 4] + + transforms_from_a[transformIdx + 5]; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * + 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float 
ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to first image + const float xpos3 = xpos2 * inv_transforms_from_b[transformIdx + 0] + + ypos2 * inv_transforms_from_b[transformIdx + 1] + + inv_transforms_from_b[transformIdx + 2]; + const float ypos3 = xpos2 * inv_transforms_from_b[transformIdx + 3] + + ypos2 * inv_transforms_from_b[transformIdx + 4] + + inv_transforms_from_b[transformIdx + 5]; + + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 0] = xpos3 - + x; + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 1] = ypos3 - + y; + } +} + +template<> +void FillFlowAugmentation(const GPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<const float, 2>::ConstTensor transforms_from_a, + typename TTypes<const float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int depth = 2; + const int total_count = batch_size * out_height * out_width * depth; + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count / 2, device); + + FillFlowAugmentationKernel << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + total_count / 2, flows.data(), transforms_from_a.data(), + transforms_from_b.data(), + src_total_count, flows.dimension(1), flows.dimension(2), batch_size, + out_height, out_width, output.data()); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/preprocessing.cc b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc new file mode 100644 index 0000000..086a0d0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext *c) { + ShapeHandle input; + + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> crop_; + c->GetAttr("crop", &crop_); + DimensionHandle height = c->MakeDim(crop_[0]); + DimensionHandle width = c->MakeDim(crop_[1]); + c->set_output(0, c->MakeShape({ batch, height, width, depth })); + return Status::OK(); +} + +REGISTER_OP("DataAugmentation") +.Input("image_a: float32") +.Input("image_b: float32") +.Input("global_step: int64") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float)") +.Attr("params_a_spread: list(float)") +.Attr("params_a_prob: list(float)") +.Attr("params_a_coeff_schedule: list(float)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float)") +.Attr("params_b_spread: list(float)") +.Attr("params_b_prob: list(float)") +.Attr("params_b_coeff_schedule: list(float)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") 
+.Output("transforms_from_a: float32") +.Output("transforms_from_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Verify input A and input B both have 4 dimensions + ShapeHandle input_shape_a, input_shape_b; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_shape_b)); + + // TODO: Verify params vectors all have the same length + + // TODO: Move this out of here and into Compute + // Verify input A and input B are the same shape + DimensionHandle batch_size, unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 0), + c->Value(c->Dim(input_shape_b, 0)), + &batch_size)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 1), + c->Value(c->Dim(input_shape_b, 1)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 2), + c->Value(c->Dim(input_shape_b, 2)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 3), + c->Value(c->Dim(input_shape_b, 3)), &unused)); + + // Get cropping dimensions + std::vector<int32>crop_; + TF_RETURN_IF_ERROR(c->GetAttr("crop", &crop_)); + + // Reshape input shape to cropped shape + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 1, c->MakeDim(crop_[0]), + &input_shape_a)); + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 2, c->MakeDim(crop_[1]), + &input_shape_a)); + + // Set output images shapes + c->set_output(0, input_shape_a); + c->set_output(1, input_shape_a); + + // Set output spatial transforms shapes + c->set_output(2, c->MakeShape({ batch_size, 6 })); + c->set_output(3, c->MakeShape({ batch_size, 6 })); + + return Status::OK(); + }); + +REGISTER_OP("FlowAugmentation") +.Input("flows: float32") +.Input("transforms_from_a: float32") +.Input("transforms_from_b: float32") +.Attr("crop: list(int) >= 2") +.Output("transformed_flows: float32") +.SetShapeFn(SetOutputToSizedImage); +} // namespace tensorflow diff --git a/Codes/flownet2/src/training_schedules.py b/Codes/flownet2/src/training_schedules.py new file mode 100644 index 0000000..4db5aab --- /dev/null +++ b/Codes/flownet2/src/training_schedules.py @@ -0,0 +1,12 @@ +LONG_SCHEDULE = { + 'step_values': [400000, 600000, 800000, 1000000], + 'learning_rates': [0.0001, 0.00005, 0.000025, 0.0000125, 0.00000625], + 'momentum': 0.9, + 'momentum2': 0.999, + 'weight_decay': 0.0004, + 'max_iter': 1200000, +} + +FINETUNE_SCHEDULE = { + # TODO: Finetune schedule +} diff --git a/Codes/flownet2/src/utils.py b/Codes/flownet2/src/utils.py new file mode 100644 index 0000000..f6abe18 --- /dev/null +++ b/Codes/flownet2/src/utils.py @@ -0,0 +1,46 @@ +import tensorflow as tf + + +# Thanks, https://github.com/tensorflow/tensorflow/issues/4079 +def LeakyReLU(x, leak=0.1, name="lrelu"): + with tf.variable_scope(name): + f1 = 0.5 * (1.0 + leak) + f2 = 0.5 * (1.0 - leak) + return f1 * x + f2 * abs(x) + + +def average_endpoint_error(labels, predictions): + """ + Given labels and predictions of size (N, H, W, 2), calculates average endpoint error: + sqrt[sum_across_channels{(X - Y)^2}] + """ + num_samples = predictions.shape.as_list()[0] + with tf.name_scope(None, "average_endpoint_error", (predictions, labels)) as scope: + predictions = tf.to_float(predictions) + labels = tf.to_float(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + squared_difference = tf.square(tf.subtract(predictions, labels)) + # sum across channels: sum[(X - Y)^2] -> N, H, W, 1 + loss = tf.reduce_sum(squared_difference, 3, keep_dims=True) + loss = tf.sqrt(loss) + return tf.reduce_sum(loss) / num_samples + + 
+def pad(tensor, num=1): + """ + Pads the given tensor along the height and width dimensions with `num` 0s on each side + """ + return tf.pad(tensor, [[0, 0], [num, num], [num, num], [0, 0]], "CONSTANT") + + +def antipad(tensor, num=1): + """ + Performs a crop. "padding" for a deconvolutional layer (conv2d tranpose) removes + padding from the output rather than adding it to the input. + """ + batch, h, w, c = tensor.get_shape().as_list() + # print(batch, h, w, c) + # print(type(batch), type(h), type(w), type(c)) + # return tf.slice(tensor, begin=[0, num, num, 0], size=[batch, h - 2 * num, w - 2 * num, c]) + return tensor[:, num: num + h - 2 * num, num: num + w - 2 * num, :] diff --git a/Codes/flownet2/test.py b/Codes/flownet2/test.py new file mode 100644 index 0000000..2fcb380 --- /dev/null +++ b/Codes/flownet2/test.py @@ -0,0 +1,163 @@ +import os +import tensorflow as tf +import numpy as np +from scipy.misc import imread +import matplotlib +from src.flowlib import read_flow, flow_to_image +matplotlib.use('TKAgg') +import matplotlib.pyplot as plt + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./src/ops/build/preprocessing.so")) + + +def display(img, c): + plt.subplot(int('22' + str(c + 1))) + plt.imshow(img[0, :, :, :]) + + +def main(): + """ +.Input("image_a: float32") +.Input("image_b: float32") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float32)") +.Attr("params_a_spread: list(float32)") +.Attr("params_a_prob: list(float32)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float32)") +.Attr("params_b_spread: list(float32)") +.Attr("params_b_prob: list(float32)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") +.Output("spatial_transform_a: float32") +.Output("inv_spatial_transform_b: float32") + """ + + crop = [364, 492] + params_a_name = ['translate_x', 'translate_y'] + params_a_rand_type = ['uniform_bernoulli', 'uniform_bernoulli'] + params_a_exp = [False, False] + params_a_mean = [0.0, 0.0] + params_a_spread = [0.4, 0.4] + params_a_prob = [1.0, 1.0] + params_b_name = [] + params_b_rand_type = [] + params_b_exp = [] + params_b_mean = [] + params_b_spread = [] + params_b_prob = [] + + with tf.Session() as sess: + with tf.device('/gpu:0'): + image_a = imread('./img0.ppm') / 255.0 + image_b = imread('./img1.ppm') / 255.0 + flow = read_flow('./flow.flo') + + image_a_tf = tf.expand_dims(tf.to_float(tf.constant(image_a, dtype=tf.float64)), 0) + image_b_tf = tf.expand_dims(tf.to_float(tf.constant(image_b, dtype=tf.float64)), 0) + + preprocess = _preprocessing_ops.data_augmentation(image_a_tf, + image_b_tf, + crop, + params_a_name, + params_a_rand_type, + params_a_exp, + params_a_mean, + params_a_spread, + params_a_prob, + params_b_name, + params_b_rand_type, + params_b_exp, + params_b_mean, + params_b_spread, + params_b_prob) + + out = sess.run(preprocess) + trans = out.spatial_transform_a + inv_trans = out.inv_spatial_transform_b + + print(trans.shape) + print(inv_trans.shape) + + flow_tf = tf.expand_dims(tf.to_float(tf.constant(flow)), 0) + aug_flow_tf = _preprocessing_ops.flow_augmentation(flow_tf, trans, inv_trans, crop) + + aug_flow = sess.run(aug_flow_tf)[0, :, :, :] + + # Plot img0, img0aug + plt.subplot(321) + plt.imshow(image_a) + plt.subplot(322) + plt.imshow(out.aug_image_a[0, :, :, :]) + + # Plot 
img1, img1aug + plt.subplot(323) + plt.imshow(image_b) + plt.subplot(324) + plt.imshow(out.aug_image_b[0, :, :, :]) + + # Plot flow, flowaug + plt.subplot(325) + plt.imshow(flow_to_image(flow)) + plt.subplot(326) + plt.imshow(flow_to_image(aug_flow)) + + plt.show() + + # image_b_aug = sess.run(image_b_tf) + # + # display(np.expand_dims(image_a, 0), 0) + # display(np.expand_dims(image_b, 0), 1) + # display(image_a_aug, 2) + # display(image_b_aug, 3) + # plt.show() + + # o = _preprocessing_ops.flow_augmentation(flow, trans, inv_t, [4, 8]) + # print n[:, :, :] + # print n[0, 0, 1], n[0, 0, 0] + # print n[1, 0, 1], n[1, 0, 0] + # print n[2, 0, 1], n[2, 0, 0] + # print '---' + # print sess.run(o) + + """# Goes along width first!! + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + H=5, W=10, K=2 + n=0, h=1, w=5, k=0 + + (2 * 10) + c + + 30 49 n[0, 1, 5, 0]""" + + +print(os.getpid()) +input("Press Enter to continue...") +main() + +# Last index is channel!! + +# K + +# value 13 should be at [0, 2, 7, 1] aka batch=0, height=1, width=0, channel=0. it is at index=20. +# +# items = { +# 'N': [0, 0], +# 'H': [5, 2], +# 'W': [10, 7], +# 'K': [2, 1], +# } +# +# for (i1, v1) in items.iteritems(): +# for (i2, v2) in items.iteritems(): +# for (i3, v3) in items.iteritems(): +# for (i4, v4) in items.iteritems(): +# if ((v1[1] * v2[0] + v2[1]) * v3[0] + v3[1]) * v4[0] + v4[1] == 55: +# print 'found it: ', i1, i2, i3, i4 diff --git a/Codes/inference.py b/Codes/inference.py new file mode 100644 index 0000000..0263339 --- /dev/null +++ b/Codes/inference.py @@ -0,0 +1,149 @@ +import tensorflow as tf +import os +import time +import numpy as np +import pickle + + +from models import generator +from utils import DataLoader, load, save, psnr_error +from constant import const +import evaluate + + +slim = tf.contrib.slim + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = const.GPU + +dataset_name = const.DATASET +test_folder = const.TEST_FOLDER + +num_his = const.NUM_HIS +height, width = 256, 256 + +snapshot_dir = const.SNAPSHOT_DIR +psnr_dir = const.PSNR_DIR +evaluate_name = const.EVALUATE + +print(const) + + +# define dataset +with tf.name_scope('dataset'): + test_video_clips_tensor = tf.placeholder(shape=[1, height, width, 3 * (num_his + 1)], + dtype=tf.float32) + test_inputs = test_video_clips_tensor[..., 0:num_his*3] + test_gt = test_video_clips_tensor[..., -3:] + print('test inputs = {}'.format(test_inputs)) + print('test prediction gt = {}'.format(test_gt)) + +# define testing generator function and +# in testing, only generator networks, there is no discriminator networks and flownet. 
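# To make the slicing above concrete: the placeholder stacks num_his + 1 RGB frames along
# the channel axis, so test_inputs takes the first num_his * 3 channels (the history) and
# test_gt the last 3 channels (the frame to be predicted). Only the generator is built at
# test time; the PSNR between its prediction and test_gt is the per-frame score that
# evaluate.py later turns into an AUC.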
+with tf.variable_scope('generator', reuse=None): + print('testing = {}'.format(tf.get_variable_scope().name)) + test_outputs = generator(test_inputs, layers=4, output_channel=3) + test_psnr_error = psnr_error(gen_frames=test_outputs, gt_frames=test_gt) + + +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +with tf.Session(config=config) as sess: + # dataset + data_loader = DataLoader(test_folder, height, width) + + # initialize weights + sess.run(tf.global_variables_initializer()) + print('Init global successfully!') + + # tf saver + saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=None) + + restore_var = [v for v in tf.global_variables()] + loader = tf.train.Saver(var_list=restore_var) + + def inference_func(ckpt, dataset_name, evaluate_name): + load(loader, sess, ckpt) + + psnr_records = [] + videos_info = data_loader.videos + num_videos = len(videos_info.keys()) + total = 0 + timestamp = time.time() + + for video_name, video in videos_info.items(): + length = video['length'] + total += length + psnrs = np.empty(shape=(length,), dtype=np.float32) + + for i in range(num_his, length): + video_clip = data_loader.get_video_clips(video_name, i - num_his, i + 1) + psnr = sess.run(test_psnr_error, + feed_dict={test_video_clips_tensor: video_clip[np.newaxis, ...]}) + psnrs[i] = psnr + + print('video = {} / {}, i = {} / {}, psnr = {:.6f}'.format( + video_name, num_videos, i, length, psnr)) + + psnrs[0:num_his] = psnrs[num_his] + psnr_records.append(psnrs) + + result_dict = {'dataset': dataset_name, 'psnr': psnr_records, 'flow': [], 'names': [], 'diff_mask': []} + + used_time = time.time() - timestamp + print('total time = {}, fps = {}'.format(used_time, total / used_time)) + + # TODO specify what's the actual name of ckpt. + pickle_path = os.path.join(psnr_dir, os.path.split(ckpt)[-1]) + with open(pickle_path, 'wb') as writer: + pickle.dump(result_dict, writer, pickle.HIGHEST_PROTOCOL) + + results = evaluate.evaluate(evaluate_name, pickle_path) + print(results) + + + if os.path.isdir(snapshot_dir): + def check_ckpt_valid(ckpt_name): + is_valid = False + ckpt = '' + if ckpt_name.startswith('model.ckpt-'): + ckpt_name_splits = ckpt_name.split('.') + ckpt = str(ckpt_name_splits[0]) + '.' 
+ str(ckpt_name_splits[1]) + ckpt_path = os.path.join(snapshot_dir, ckpt) + if os.path.exists(ckpt_path + '.index') and os.path.exists(ckpt_path + '.meta') and \ + os.path.exists(ckpt_path + '.data-00000-of-00001'): + is_valid = True + + return is_valid, ckpt + + def scan_psnr_folder(): + tested_ckpt_in_psnr_sets = set() + for test_psnr in os.listdir(psnr_dir): + tested_ckpt_in_psnr_sets.add(test_psnr) + return tested_ckpt_in_psnr_sets + + def scan_model_folder(): + saved_models = set() + for ckpt_name in os.listdir(snapshot_dir): + is_valid, ckpt = check_ckpt_valid(ckpt_name) + if is_valid: + saved_models.add(ckpt) + return saved_models + + tested_ckpt_sets = scan_psnr_folder() + while True: + all_model_ckpts = scan_model_folder() + new_model_ckpts = all_model_ckpts - tested_ckpt_sets + + for ckpt_name in new_model_ckpts: + # inference + ckpt = os.path.join(snapshot_dir, ckpt_name) + inference_func(ckpt, dataset_name, evaluate_name) + + tested_ckpt_sets.add(ckpt_name) + + print('waiting for models...') + evaluate.evaluate('compute_auc', psnr_dir) + time.sleep(60) + else: + inference_func(snapshot_dir, dataset_name, evaluate_name) diff --git a/Codes/loss_functions.py b/Codes/loss_functions.py new file mode 100644 index 0000000..ca97966 --- /dev/null +++ b/Codes/loss_functions.py @@ -0,0 +1,54 @@ +import tensorflow as tf +import numpy as np + + +def flow_loss(gen_flows, gt_flows): + print(gen_flows['flow']) + return tf.reduce_mean(tf.abs(gen_flows['flow'] - gt_flows['flow'])) + + +def intensity_loss(gen_frames, gt_frames, l_num): + """ + Calculates the sum of lp losses between the predicted and ground truth frames. + + @param gen_frames: The predicted frames at each scale. + @param gt_frames: The ground truth frames at each scale + @param l_num: 1 or 2 for l1 and l2 loss, respectively). + + @return: The lp loss. + """ + return tf.reduce_mean(tf.abs((gen_frames - gt_frames) ** l_num)) + + +def gradient_loss(gen_frames, gt_frames, alpha): + """ + Calculates the sum of GDL losses between the predicted and ground truth frames. + + @param gen_frames: The predicted frames at each scale. + @param gt_frames: The ground truth frames at each scale + @param alpha: The power to which each gradient term is raised. + + @return: The GDL loss. + """ + # calculate the loss for each scale + # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively. 
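    # In symbols: with horizontal and vertical finite differences d_x I and d_y I (computed
    # by the conv2d calls below using one identity filter per channel), the value returned is
    #     mean( | |d_x gt| - |d_x gen| | ** alpha + | |d_y gt| - |d_y gen| | ** alpha )
    # i.e. the gradient difference loss averaged over all positions and channels.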
+ + channels = gen_frames.get_shape().as_list()[-1] + pos = tf.constant(np.identity(channels), dtype=tf.float32) # 3 x 3 + neg = -1 * pos + filter_x = tf.expand_dims(tf.stack([neg, pos]), 0) # [-1, 1] + filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)]) # [[1],[-1]] + strides = [1, 1, 1, 1] # stride of (1, 1) + padding = 'SAME' + + gen_dx = tf.abs(tf.nn.conv2d(gen_frames, filter_x, strides, padding=padding)) + gen_dy = tf.abs(tf.nn.conv2d(gen_frames, filter_y, strides, padding=padding)) + gt_dx = tf.abs(tf.nn.conv2d(gt_frames, filter_x, strides, padding=padding)) + gt_dy = tf.abs(tf.nn.conv2d(gt_frames, filter_y, strides, padding=padding)) + + grad_diff_x = tf.abs(gt_dx - gen_dx) + grad_diff_y = tf.abs(gt_dy - gen_dy) + + # condense into one tensor and avg + return tf.reduce_mean(grad_diff_x ** alpha + grad_diff_y ** alpha) + diff --git a/Codes/models.py b/Codes/models.py new file mode 100644 index 0000000..8c20134 --- /dev/null +++ b/Codes/models.py @@ -0,0 +1,44 @@ +import tensorflow as tf + +import unet +import pix2pix + +from flownet2.src.flowlib import flow_to_image +from flownet2.src.flownet_sd.flownet_sd import FlowNetSD # Ok +from flownet2.src.training_schedules import LONG_SCHEDULE +from flownet2.src.net import Mode + + +slim = tf.contrib.slim + + +def generator(inputs, layers, features_root=64, filter_size=3, pool_size=2, output_channel=3): + return unet.unet(inputs, layers, features_root, filter_size, pool_size, output_channel) + + +def discriminator(inputs, num_filers=(128, 256, 512, 512)): + logits, end_points = pix2pix.pix2pix_discriminator(inputs, num_filers) + return logits, end_points['predictions'] + + +def flownet(input_a, input_b, height, width, reuse=None): + net = FlowNetSD(mode=Mode.TEST) + # train preds flow + input_a = (input_a + 1.0) / 2.0 # flownet receives image with color space in [0, 1] + input_b = (input_b + 1.0) / 2.0 # flownet receives image with color space in [0, 1] + # input size is 384 x 512 + input_a = tf.image.resize_images(input_a, [height, width]) + input_b = tf.image.resize_images(input_b, [height, width]) + flows = net.model( + inputs={'input_a': input_a, 'input_b': input_b}, + training_schedule=LONG_SCHEDULE, + trainable=False, reuse=reuse + ) + return flows['flow'] + + +def initialize_flownet(sess, checkpoint): + flownet_vars = slim.get_variables_to_restore(include=['FlowNetSD']) + flownet_saver = tf.train.Saver(flownet_vars) + print('FlownetSD restore from {}!'.format(checkpoint)) + flownet_saver.restore(sess, checkpoint) diff --git a/Codes/models/download_pretrains.sh b/Codes/models/download_pretrains.sh new file mode 100644 index 0000000..08e58ec --- /dev/null +++ b/Codes/models/download_pretrains.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Downloading trained models on ped1, ped2 and avenue datasets ....." + +wget "https://ofhz9a.bn.files.1drv.com/y4mHfGdUxGoa7NnnI-eIlTqInymvmHyDOSGGw5zKM08jOGukHKdYdxmtZiEEh-rCAWK7oTDTstQ5bKazvjdyTtsIUW7zxcKnVgIsgZg6DpEb-Qdq83Zmnnw6nv7pX5HhiOkMxc42CLl65QK0A2Mv1Cmj-062Pyodm-Mt5r24Id3_glS0NT6BdvAp7-VbevkXygnmXQrcXRQU6d0y1cHlZJ2ig/pretrains.tar.gz" +tar -xvf pretrains.tar.gz +rm pretrains.tar.gz + +echo "Download pretrains successfully..." + + diff --git a/Codes/pix2pix.py b/Codes/pix2pix.py new file mode 100644 index 0000000..941c8fc --- /dev/null +++ b/Codes/pix2pix.py @@ -0,0 +1,274 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Implementation of the Image-to-Image Translation model. +This network represents a port of the following work: + Image-to-Image Translation with Conditional Adversarial Networks + Phillip Isola, Jun-Yan Zhu, Tinghui Zhou and Alexei A. Efros + Arxiv, 2017 + https://phillipi.github.io/pix2pix/ +A reference implementation written in Lua can be found at: +https://github.com/phillipi/pix2pix/blob/master/models.lua +""" +import collections +import functools + +import tensorflow as tf + +layers = tf.contrib.layers + + +def pix2pix_arg_scope(): + """Returns a default argument scope for isola_net. + Returns: + An arg scope. + """ + # These parameters come from the online port, which don't necessarily match + # those in the paper. + # TODO(nsilberman): confirm these values with Philip. + instance_norm_params = { + 'center': True, + 'scale': True, + 'epsilon': 0.00001, + } + + with tf.contrib.framework.arg_scope( + [layers.conv2d, layers.conv2d_transpose], + normalizer_fn=layers.instance_norm, + normalizer_params=instance_norm_params, + weights_initializer=tf.random_normal_initializer(0, 0.02)) as sc: + return sc + + +def upsample(net, num_outputs, kernel_size, method='nn_upsample_conv'): + """Upsamples the given inputs. + Args: + net: A `Tensor` of size [batch_size, height, width, filters]. + num_outputs: The number of output filters. + kernel_size: A list of 2 scalars or a 1x2 `Tensor` indicating the scale, + relative to the inputs, of the output dimensions. For example, if kernel + size is [2, 3], then the output height and width will be twice and three + times the input size. + method: The upsampling method. + Returns: + An `Tensor` which was upsampled using the specified method. + Raises: + ValueError: if `method` is not recognized. + """ + net_shape = tf.shape(net) + height = net_shape[1] + width = net_shape[2] + + if method == 'nn_upsample_conv': + net = tf.image.resize_nearest_neighbor( + net, [kernel_size[0] * height, kernel_size[1] * width]) + net = layers.conv2d(net, num_outputs, [4, 4], activation_fn=None) + elif method == 'conv2d_transpose': + net = layers.conv2d_transpose( + net, num_outputs, [4, 4], stride=kernel_size, activation_fn=None) + else: + raise ValueError('Unknown method: [%s]', method) + + return net + + +class Block( + collections.namedtuple('Block', ['num_filters', 'decoder_keep_prob'])): + """Represents a single block of encoder and decoder processing. + The Image-to-Image translation paper works a bit differently than the original + U-Net model. In particular, each block represents a single operation in the + encoder which is concatenated with the corresponding decoder representation. + A dropout layer follows the concatenation and convolution of the concatenated + features. + """ + pass + + +def _default_generator_blocks(): + """Returns the default generator block definitions. + Returns: + A list of generator blocks. 
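+    Each Block pairs an encoder filter count with the dropout keep probability used at the
+    mirrored decoder stage; a decoder_keep_prob of 0 disables dropout for that stage.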
+ """ + return [ + Block(64, 0.5), + Block(128, 0.5), + Block(256, 0.5), + Block(512, 0), + Block(512, 0), + Block(512, 0), + Block(512, 0), + ] + + +def pix2pix_generator(net, + num_outputs, + blocks=None, + upsample_method='nn_upsample_conv', + is_training=False): # pylint: disable=unused-argument + """Defines the network architecture. + Args: + net: A `Tensor` of size [batch, height, width, channels]. Note that the + generator currently requires square inputs (e.g. height=width). + num_outputs: The number of (per-pixel) outputs. + blocks: A list of generator blocks or `None` to use the default generator + definition. + upsample_method: The method of upsampling images, one of 'nn_upsample_conv' + or 'conv2d_transpose' + is_training: Whether or not we're in training or testing mode. + Returns: + A `Tensor` representing the model output and a dictionary of model end + points. + Raises: + ValueError: if the input heights do not match their widths. + """ + end_points = {} + + blocks = blocks or _default_generator_blocks() + + input_size = net.get_shape().as_list() + height, width = input_size[1], input_size[2] + if height != width: + raise ValueError('The input height must match the input width.') + + input_size[3] = num_outputs + + upsample_fn = functools.partial(upsample, method=upsample_method) + + encoder_activations = [] + + ########### + # Encoder # + ########### + with tf.variable_scope('encoder'): + with tf.contrib.framework.arg_scope( + [layers.conv2d], + kernel_size=[4, 4], + stride=2, + activation_fn=tf.nn.leaky_relu): + + for block_id, block in enumerate(blocks): + # No normalizer for the first encoder layers as per 'Image-to-Image', + # Section 5.1.1 + if block_id == 0: + # First layer doesn't use normalizer_fn + net = layers.conv2d(net, block.num_filters, normalizer_fn=None) + elif block_id < len(blocks) - 1: + net = layers.conv2d(net, block.num_filters) + else: + # Last layer doesn't use activation_fn nor normalizer_fn + net = layers.conv2d( + net, block.num_filters, activation_fn=None, normalizer_fn=None) + + encoder_activations.append(net) + end_points['encoder%d' % block_id] = net + + ########### + # Decoder # + ########### + reversed_blocks = list(blocks) + reversed_blocks.reverse() + + with tf.variable_scope('decoder'): + # Dropout is used at both train and test time as per 'Image-to-Image', + # Section 2.1 (last paragraph). + with tf.contrib.framework.arg_scope([layers.dropout], is_training=is_training): + + for block_id, block in enumerate(reversed_blocks): + if block_id > 0: + net = tf.concat([net, encoder_activations[-block_id - 1]], axis=3) + + # The Relu comes BEFORE the upsample op: + net = tf.nn.relu(net) + net = upsample_fn(net, block.num_filters, [2, 2]) + if block.decoder_keep_prob > 0: + net = layers.dropout(net, keep_prob=block.decoder_keep_prob) + end_points['decoder%d' % block_id] = net + + with tf.variable_scope('output'): + logits = layers.conv2d(net, num_outputs, [4, 4], activation_fn=None) + # print(logits) + # logits = tf.reshape(logits, input_size) + + end_points['logits'] = logits + end_points['predictions'] = tf.tanh(logits) + + return logits, end_points + + +def pix2pix_discriminator(net, num_filters, padding=2, is_training=False): + """Creates the Image2Image Translation Discriminator. + Args: + net: A `Tensor` of size [batch_size, height, width, channels] representing + the input. + num_filters: A list of the filters in the discriminator. The length of the + list determines the number of layers in the discriminator. 
+ padding: Amount of reflection padding applied before each convolution. + is_training: Whether or not the model is training or testing. + Returns: + A logits `Tensor` of size [batch_size, N, N, 1] where N is the number of + 'patches' we're attempting to discriminate and a dictionary of model end + points. + """ + del is_training + end_points = {} + + num_layers = len(num_filters) + + def padded(net, scope): + if padding: + with tf.variable_scope(scope): + spatial_pad = tf.constant( + [[0, 0], [padding, padding], [padding, padding], [0, 0]], + dtype=tf.int32) + return tf.pad(net, spatial_pad, 'REFLECT') + else: + return net + + with tf.contrib.framework.arg_scope( + [layers.conv2d], + kernel_size=[4, 4], + stride=2, + padding='valid', + activation_fn=tf.nn.leaky_relu): + + # No normalization on the input layer. + net = layers.conv2d( + padded(net, 'conv0'), num_filters[0], normalizer_fn=None, scope='conv0') + + end_points['conv0'] = net + + for i in range(1, num_layers - 1): + net = layers.conv2d( + padded(net, 'conv%d' % i), num_filters[i], scope='conv%d' % i) + end_points['conv%d' % i] = net + + # Stride 1 on the last layer. + net = layers.conv2d( + padded(net, 'conv%d' % (num_layers - 1)), + num_filters[-1], + stride=1, + scope='conv%d' % (num_layers - 1)) + end_points['conv%d' % (num_layers - 1)] = net + + # 1-dim logits, stride 1, no activation, no normalization. + logits = layers.conv2d( + padded(net, 'conv%d' % num_layers), + 1, + stride=1, + activation_fn=None, + normalizer_fn=None, + scope='conv%d' % num_layers) + end_points['logits'] = logits + end_points['predictions'] = tf.sigmoid(logits) + return logits, end_points diff --git a/Codes/requirements.txt b/Codes/requirements.txt new file mode 100644 index 0000000..91d2206 --- /dev/null +++ b/Codes/requirements.txt @@ -0,0 +1,9 @@ +numpy==1.14.1 +scipy==1.0.0 +matplotlib==2.1.2 +tensorflow==1.4.1 +tensorflow_gpu==1.4.1 +Pillow==5.0.0 +pypng==0.0.18 +scikit_learn==0.19.1 +opencv-python==3.2.0.6 diff --git a/Codes/runner.sh b/Codes/runner.sh new file mode 100644 index 0000000..f0b545f --- /dev/null +++ b/Codes/runner.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +/home/liuwen/ssd/datasets/ped2/training/frames +/home/liuwen/ssd/datasets/ped2/testing/frames + +python train.py --dataset ped2 \ + --train_folder ../Data/ped2/training/frames \ + --test_folder ../Data/ped2/testing/frames \ + --gpu 0 \ + --iters 80000 + + +python inference.py --dataset ped2 \ + --test_folder /home/liuwen/ssd/datasets/ped2/testing/frames \ + --gpu 3 \ + --snapshot_dir models/pretrains/ped2 + + +python train.py --dataset avenue \ + --train_folder ../Data/avenue/training/frames \ + --test_folder ../Data/avenue/testing/frames \ + --gpu 2 \ + --iters 80000 + +python inference.py --dataset avenue \ + --test_folder ../Data/avenue/testing/frames \ + --gpu 3 + + +python train.py --dataset ped1 \ + --train_folder ../Data/ped1/training/frames \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 2 \ + --iters 80000 + +python inference.py --dataset ped1 \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 3 + +python train.py --dataset ped1 \ + --train_folder ../Data/ped1/training/frames \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 0 \ + --iters 80000 \ + --config training_hyper_params/hyper_params_lp_0.ini + +python inference.py --dataset ped1 \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 1 \ + --config training_hyper_params/hyper_params_lp_0.ini + + +python inference.py --dataset ped2 \ + --test_folder /home/liuwen/ssd/datasets/ped2/testing/frames 
\ + --gpu 1 \ + --snapshot_dir models/pretrains/ped2
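+# NOTE: models/pretrains/<dataset> is assumed to hold the checkpoints unpacked by Codes/models/download_pretrains.sh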
\ No newline at end of file diff --git a/Codes/train.py b/Codes/train.py new file mode 100644 index 0000000..42a8fc9 --- /dev/null +++ b/Codes/train.py @@ -0,0 +1,215 @@ +import tensorflow as tf +import os + +from models import generator, discriminator, flownet, initialize_flownet +from loss_functions import intensity_loss, gradient_loss +from utils import DataLoader, load, save, psnr_error +from constant import const + + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = const.GPU + +dataset_name = const.DATASET +train_folder = const.TRAIN_FOLDER +test_folder = const.TEST_FOLDER + +batch_size = const.BATCH_SIZE +iterations = const.ITERATIONS +num_his = const.NUM_HIS +height, width = 256, 256 +flow_height, flow_width = const.FLOW_HEIGHT, const.FLOW_WIDTH + +l_num = const.L_NUM +alpha_num = const.ALPHA_NUM +lam_lp = const.LAM_LP +lam_gdl = const.LAM_GDL +lam_adv = const.LAM_ADV +lam_flow = const.LAM_FLOW +adversarial = (lam_adv != 0) + +summary_dir = const.SUMMARY_DIR +snapshot_dir = const.SNAPSHOT_DIR + + +print(const) + +# define dataset +with tf.name_scope('dataset'): + train_loader = DataLoader(train_folder, resize_height=height, resize_width=width) + train_dataset = train_loader(batch_size=batch_size, time_steps=num_his, num_pred=1) + + train_it = train_dataset.make_one_shot_iterator() + train_videos_clips_tensor = train_it.get_next() + train_videos_clips_tensor.set_shape([batch_size, height, width, 3*(num_his + 1)]) + + train_inputs = train_videos_clips_tensor[..., 0:num_his*3] + train_gt = train_videos_clips_tensor[..., -3:] + + print('train inputs = {}'.format(train_inputs)) + print('train prediction gt = {}'.format(train_gt)) + + test_loader = DataLoader(test_folder, resize_height=height, resize_width=width) + test_dataset = test_loader(batch_size=batch_size, time_steps=num_his, num_pred=1) + test_it = test_dataset.make_one_shot_iterator() + test_videos_clips_tensor = test_it.get_next() + test_videos_clips_tensor.set_shape([batch_size, height, width, 3*(num_his + 1)]) + + test_inputs = test_videos_clips_tensor[..., 0:num_his*3] + test_gt = test_videos_clips_tensor[..., -3:] + + print('test inputs = {}'.format(test_inputs)) + print('test prediction gt = {}'.format(test_gt)) + +# define training generator function +with tf.variable_scope('generator', reuse=None): + print('training = {}'.format(tf.get_variable_scope().name)) + train_outputs = generator(train_inputs, layers=4, output_channel=3) + train_psnr_error = psnr_error(gen_frames=train_outputs, gt_frames=train_gt) + +# define testing generator function +with tf.variable_scope('generator', reuse=True): + print('testing = {}'.format(tf.get_variable_scope().name)) + test_outputs = generator(test_inputs, layers=4, output_channel=3) + test_psnr_error = psnr_error(gen_frames=test_outputs, gt_frames=test_gt) + + +# define intensity loss +if lam_lp != 0: + lp_loss = intensity_loss(gen_frames=train_outputs, gt_frames=train_gt, l_num=l_num) +else: + lp_loss = tf.constant(0.0, dtype=tf.float32) + + +# define gdl loss +if lam_gdl != 0: + gdl_loss = gradient_loss(gen_frames=train_outputs, gt_frames=train_gt, alpha=alpha_num) +else: + gdl_loss = tf.constant(0.0, dtype=tf.float32) + + +# define flow loss +if lam_flow != 0: + train_gt_flow = flownet(input_a=train_inputs[..., -3:], input_b=train_gt, + height=flow_height, width=flow_width, reuse=None) + train_pred_flow = flownet(input_a=train_inputs[..., -3:], input_b=train_outputs, + height=flow_height, width=flow_width, reuse=True) + flow_loss = 
tf.reduce_mean(tf.abs(train_gt_flow - train_pred_flow)) +else: + flow_loss = tf.constant(0.0, dtype=tf.float32) + + +# define adversarial loss +if adversarial: + with tf.variable_scope('discriminator', reuse=None): + real_logits, real_outputs = discriminator(inputs=train_gt) + with tf.variable_scope('discriminator', reuse=True): + fake_logits, fake_outputs = discriminator(inputs=train_outputs) + + print('real_outputs = {}'.format(real_outputs)) + print('fake_outputs = {}'.format(fake_outputs)) + + adv_loss = tf.reduce_mean(tf.square(fake_outputs - 1) / 2) + dis_loss = tf.reduce_mean(tf.square(real_outputs - 1) / 2) + tf.reduce_mean(tf.square(fake_outputs) / 2) +else: + adv_loss = tf.constant(0.0, dtype=tf.float32) + dis_loss = tf.constant(0.0, dtype=tf.float32) + + +with tf.name_scope('training'): + g_loss = tf.add_n([lp_loss * lam_lp, gdl_loss * lam_gdl, adv_loss * lam_adv, flow_loss * lam_flow], name='g_loss') + + g_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='g_step') + g_lrate = tf.train.piecewise_constant(g_step, boundaries=const.LRATE_G_BOUNDARIES, values=const.LRATE_G) + g_optimizer = tf.train.AdamOptimizer(learning_rate=g_lrate, name='g_optimizer') + g_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator') + + g_train_op = g_optimizer.minimize(g_loss, global_step=g_step, var_list=g_vars, name='g_train_op') + + if adversarial: + # training discriminator + d_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='d_step') + d_lrate = tf.train.piecewise_constant(d_step, boundaries=const.LRATE_D_BOUNDARIES, values=const.LRATE_D) + d_optimizer = tf.train.AdamOptimizer(learning_rate=d_lrate, name='g_optimizer') + d_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator') + + d_train_op = d_optimizer.minimize(dis_loss, global_step=d_step, var_list=d_vars, name='d_optimizer') + else: + d_step = None + d_lrate = None + d_train_op = None + +# add all to summaries +tf.summary.scalar(tensor=train_psnr_error, name='train_psnr_error') +tf.summary.scalar(tensor=test_psnr_error, name='test_psnr_error') +tf.summary.scalar(tensor=g_loss, name='g_loss') +tf.summary.scalar(tensor=adv_loss, name='adv_loss') +tf.summary.scalar(tensor=dis_loss, name='dis_loss') +tf.summary.image(tensor=train_outputs, name='train_outputs') +tf.summary.image(tensor=train_gt, name='train_gt') +tf.summary.image(tensor=test_outputs, name='test_outputs') +tf.summary.image(tensor=test_gt, name='test_gt') +summary_op = tf.summary.merge_all() + +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +with tf.Session(config=config) as sess: + # summaries + summary_writer = tf.summary.FileWriter(summary_dir, graph=sess.graph) + + # initialize weights + sess.run(tf.global_variables_initializer()) + print('Init successfully!') + + if lam_flow != 0: + # initialize flownet + initialize_flownet(sess, const.FLOWNET_CHECKPOINT) + + # tf saver + saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=None) + restore_var = [v for v in tf.global_variables()] + loader = tf.train.Saver(var_list=restore_var) + if os.path.isdir(snapshot_dir): + ckpt = tf.train.get_checkpoint_state(snapshot_dir) + if ckpt and ckpt.model_checkpoint_path: + load(loader, sess, ckpt.model_checkpoint_path) + else: + print('No checkpoint file found.') + else: + load(loader, sess, snapshot_dir) + + _step, _loss, _summaries = 0, None, None + while _step < iterations: + try: + if adversarial: + print('Training discriminator...') + _, _d_lr, _d_step, _dis_loss = 
sess.run([d_train_op, d_lrate, d_step, dis_loss]) + else: + _d_step = 0 + _d_lr = 0 + _dis_loss = 0 + + print('Training generator...') + _, _g_lr, _step, _lp_loss, _gdl_loss, _adv_loss, _flow_loss, _g_loss, _train_psnr, _summaries = sess.run( + [g_train_op, g_lrate, g_step, lp_loss, gdl_loss, adv_loss, flow_loss, g_loss, train_psnr_error, summary_op]) + + if _step % 10 == 0: + print('DiscriminatorModel: Step {} | Global Loss: {:.6f}, lr = {:.6f}'.format(_d_step, _dis_loss, _d_lr)) + print('GeneratorModel : Step {}, lr = {:.6f}'.format(_step, _g_lr)) + print(' Global Loss : ', _g_loss) + print(' intensity Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_lp_loss, lam_lp, _lp_loss * lam_lp)) + print(' gradient Loss : ({:.4f} * {:.4f} = {:.4f})'.format( _gdl_loss, lam_gdl, _gdl_loss * lam_gdl)) + print(' adversarial Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_adv_loss, lam_adv, _adv_loss * lam_adv)) + print(' flownet Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_flow_loss, lam_flow, _flow_loss * lam_flow)) + print(' PSNR Error : ', _train_psnr) + if _step % 100 == 0: + summary_writer.add_summary(_summaries, global_step=_step) + print('Save summaries...') + + if _step % 1000 == 0: + save(saver, sess, snapshot_dir, _step) + + except tf.errors.OutOfRangeError: + print('Finish successfully!') + save(saver, sess, snapshot_dir, _step) + break diff --git a/Codes/training_hyper_params/hyper_params.ini b/Codes/training_hyper_params/hyper_params.ini new file mode 100644 index 0000000..99dbf00 --- /dev/null +++ b/Codes/training_hyper_params/hyper_params.ini @@ -0,0 +1,103 @@ +[ped2] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [7000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [7000] + +[ped1] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 0.01 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [40000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [40000] + + +[avenue] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss, +# we found in smaller lp is slightly better in avenue, but not too much difference. +LAM_LP = 0 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0002, 0.00002] +LRATE_G_BOUNDARIES = [100000] + +LRATE_D = [0.00002, 0.000002] +LRATE_D_BOUNDARIES = [100000] + + +[shanghaitech] +# for lp loss. 
e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0002, 0.00002] +LRATE_G_BOUNDARIES = [50000] + +LRATE_D = [0.00002, 0.000002] +LRATE_D_BOUNDARIES = [50000] + + +[toydata] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [7000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [7000] diff --git a/Codes/unet.py b/Codes/unet.py new file mode 100644 index 0000000..ac4c6aa --- /dev/null +++ b/Codes/unet.py @@ -0,0 +1,42 @@ +import tensorflow as tf +from tensorflow.contrib.layers import conv2d, max_pool2d, conv2d_transpose + + +def unet(inputs, layers, features_root=64, filter_size=3, pool_size=2, output_channel=1): + """ + :param inputs: input tensor, shape[None, height, width, channel] + :param layers: number of layers + :param features_root: number of features in the first layer + :param filter_size: size of each conv layer + :param pool_size: size of each max pooling layer + :param output_channel: number of channel for output tensor + :return: a tensor, shape[None, height, width, output_channel] + """ + + in_node = inputs + conv = [] + for layer in range(0, layers): + features = 2**layer*features_root + + conv1 = conv2d(inputs=in_node, num_outputs=features, kernel_size=filter_size) + conv2 = conv2d(inputs=conv1, num_outputs=features, kernel_size=filter_size) + conv.append(conv2) + + if layer < layers - 1: + in_node = max_pool2d(inputs=conv2, kernel_size=pool_size, padding='SAME') + # in_node = conv2d(inputs=conv2, num_outputs=features, kernel_size=filter_size, stride=2) + + in_node = conv[-1] + + for layer in range(layers-2, -1, -1): + features = 2**(layer+1)*features_root + + h_deconv = conv2d_transpose(inputs=in_node, num_outputs=features//2, kernel_size=pool_size, stride=pool_size) + h_deconv_concat = tf.concat([conv[layer], h_deconv], axis=3) + + conv1 = conv2d(inputs=h_deconv_concat, num_outputs=features//2, kernel_size=filter_size) + in_node = conv2d(inputs=conv1, num_outputs=features//2, kernel_size=filter_size) + + output = conv2d(inputs=in_node, num_outputs=output_channel, kernel_size=filter_size, activation_fn=None) + output = tf.tanh(output) + return output diff --git a/Codes/utils.py b/Codes/utils.py new file mode 100644 index 0000000..efeab8e --- /dev/null +++ b/Codes/utils.py @@ -0,0 +1,227 @@ +import tensorflow as tf +import numpy as np +from collections import OrderedDict +import os +import glob +import cv2 + + +rng = np.random.RandomState(2017) + + +def np_load_frame(filename, resize_height, resize_width): + image_decoded = cv2.imread(filename) + image_resized = cv2.resize(image_decoded, (resize_width, resize_height)) + image_resized = image_resized.astype(dtype=np.float32) + image_resized = (image_resized / 127.5) - 1.0 + return 
image_resized + + +class DataLoader(object): + def __init__(self, video_folder, resize_height=256, resize_width=256): + self.dir = video_folder + self.videos = {} + self._resize_height = resize_height + self._resize_width = resize_width + self.setup() + + def __call__(self, batch_size, time_steps, num_pred=1): + video_info_list = list(self.videos.values()) + num_videos = len(video_info_list) + + clip_length = time_steps + num_pred + resize_height, resize_width = self._resize_height, self._resize_width + + def video_clip_generator(): + v_id = -1 + while True: + v_id = (v_id + 1) % num_videos + + video_info = video_info_list[v_id] + start = rng.randint(0, video_info['length'] - clip_length) + video_clip = [] + for frame_id in range(start, start + clip_length): + video_clip.append(np_load_frame(video_info['frame'][frame_id], resize_height, resize_width)) + video_clip = np.concatenate(video_clip, axis=2) + + yield video_clip + + # video clip paths + dataset = tf.data.Dataset.from_generator(generator=video_clip_generator, + output_types=tf.float32, + output_shapes=[resize_height, resize_width, clip_length * 3]) + print('generator dataset, {}'.format(dataset)) + dataset = dataset.prefetch(buffer_size=1000) + dataset = dataset.shuffle(buffer_size=1000).batch(batch_size) + print('epoch dataset, {}'.format(dataset)) + + return dataset + + def __getitem__(self, video_name): + assert video_name in self.videos.keys(), 'video = {} is not in {}!'.format(video_name, self.videos.keys()) + return self.videos[video_name] + + def setup(self): + videos = glob.glob(os.path.join(self.dir, '*')) + for video in sorted(videos): + video_name = video.split('/')[-1] + self.videos[video_name] = {} + self.videos[video_name]['path'] = video + self.videos[video_name]['frame'] = glob.glob(os.path.join(video, '*.jpg')) + self.videos[video_name]['frame'].sort() + self.videos[video_name]['length'] = len(self.videos[video_name]['frame']) + + def get_video_clips(self, video, start, end): + # assert video in self.videos, 'video = {} must in {}!'.format(video, self.videos.keys()) + # assert start >= 0, 'start = {} must >=0!'.format(start) + # assert end <= self.videos[video]['length'], 'end = {} must <= {}'.format(video, self.videos[video]['length']) + + batch = [] + for i in range(start, end): + image = np_load_frame(self.videos[video]['frame'][i], self._resize_height, self._resize_width) + batch.append(image) + + return np.concatenate(batch, axis=2) + + # def get_video_clips(self, video_name, start, end): + # video_idx = np.arange(start, end) + # video_clip = np.empty(shape=[self._resize_height, self._resize_height, 3*len(video_idx)], dtype=np.float32) + # for idx, v_idx in enumerate(video_idx): + # filename = self.videos[video_name]['frame'][v_idx] + # video_clip[..., idx*3:(idx+1)*3] = np_load_frame(filename, self._resize_height, self._resize_width) + # + # return video_clip + + +def log10(t): + """ + Calculates the base-10 log of each element in t. + + @param t: The tensor from which to calculate the base-10 log. + + @return: A tensor with the base-10 log of each element in t. + """ + + numerator = tf.log(t) + denominator = tf.log(tf.constant(10, dtype=numerator.dtype)) + return numerator / denominator + + +def psnr_error(gen_frames, gt_frames): + """ + Computes the Peak Signal to Noise Ratio error between the generated images and the ground + truth images. + + @param gen_frames: A tensor of shape [batch_size, height, width, 3]. The frames generated by the + generator model. 
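+        Values are assumed to lie in [-1, 1]; both gen_frames and gt_frames are rescaled to
+        [0, 1] inside this function before the PSNR is computed.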
+ @param gt_frames: A tensor of shape [batch_size, height, width, 3]. The ground-truth frames for + each frame in gen_frames. + + @return: A scalar tensor. The mean Peak Signal to Noise Ratio error over each frame in the + batch. + """ + shape = tf.shape(gen_frames) + num_pixels = tf.to_float(shape[1] * shape[2] * shape[3]) + gt_frames = (gt_frames + 1.0) / 2.0 + gen_frames = (gen_frames + 1.0) / 2.0 + square_diff = tf.square(gt_frames - gen_frames) + + batch_errors = 10 * log10(1 / ((1 / num_pixels) * tf.reduce_sum(square_diff, [1, 2, 3]))) + return tf.reduce_mean(batch_errors) + + +def sharp_diff_error(gen_frames, gt_frames, channels=3): + """ + Computes the Sharpness Difference error between the generated images and the ground truth + images. + + @param gen_frames: A tensor of shape [batch_size, height, width, 3]. The frames generated by the + generator model. + @param gt_frames: A tensor of shape [batch_size, height, width, 3]. The ground-truth frames for + each frame in gen_frames. + @param channels: The number of channels, 3 is RGB and 1 is Gray, default is 3. + + @return: A scalar tensor. The Sharpness Difference error over each frame in the batch. + """ + shape = tf.shape(gen_frames) + num_pixels = tf.to_float(shape[1] * shape[2] * shape[3]) + + # gradient difference + # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively. + # TODO: Could this be simplified with one filter [[-1, 2], [0, -1]]? + pos = tf.constant(np.identity(channels), dtype=tf.float32) + neg = -1 * pos + filter_x = tf.expand_dims(tf.stack([neg, pos]), 0) # [-1, 1] + filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)]) # [[1],[-1]] + strides = [1, 1, 1, 1] # stride of (1, 1) + padding = 'SAME' + + gen_dx = tf.abs(tf.nn.conv2d(gen_frames, filter_x, strides, padding=padding)) + gen_dy = tf.abs(tf.nn.conv2d(gen_frames, filter_y, strides, padding=padding)) + gt_dx = tf.abs(tf.nn.conv2d(gt_frames, filter_x, strides, padding=padding)) + gt_dy = tf.abs(tf.nn.conv2d(gt_frames, filter_y, strides, padding=padding)) + + gen_grad_sum = gen_dx + gen_dy + gt_grad_sum = gt_dx + gt_dy + + grad_diff = tf.abs(gt_grad_sum - gen_grad_sum) + + batch_errors = 10 * log10(1 / ((1 / num_pixels) * tf.reduce_sum(grad_diff, [1, 2, 3]))) + return tf.reduce_mean(batch_errors) + + +def diff_mask(gen_frames, gt_frames, min_value=-1, max_value=1): + # normalize to [0, 1] + delta = max_value - min_value + gen_frames = (gen_frames - min_value) / delta + gt_frames = (gt_frames - min_value) / delta + + gen_gray_frames = tf.image.rgb_to_grayscale(gen_frames) + gt_gray_frames = tf.image.rgb_to_grayscale(gt_frames) + + diff = tf.abs(gen_gray_frames - gt_gray_frames) + return diff + + +def load(saver, sess, ckpt_path): + saver.restore(sess, ckpt_path) + print("Restored model parameters from {}".format(ckpt_path)) + + +def save(saver, sess, logdir, step): + model_name = 'model.ckpt' + checkpoint_path = os.path.join(logdir, model_name) + if not os.path.exists(logdir): + os.makedirs(logdir) + saver.save(sess, checkpoint_path, global_step=step) + print('The checkpoint has been created.') + + +# if __name__ == '__main__': +# os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +# os.environ['CUDA_VISIBLE_DEVICES'] = '0' +# +# data_loader = DataLoader('/home/liuwen/ssd/datasets/avenue/training/frames') +# dataset, epoch_size = data_loader(10, 4, 1, 3, 1) +# +# # debug +# iteration = dataset.make_one_shot_iterator() +# batch_video_clip_tensor = iteration.get_next() +# +# config = tf.ConfigProto() +# 
config.gpu_options.allow_growth = True +# with tf.Session(config=config) as sess: +# # batch_video_clip = sess.run(next(it)) +# +# for i in range(100): +# batch_video_clip = sess.run(batch_video_clip_tensor) +# # print(batch_video_clip.shape) +# +# for vid, video_clip in enumerate(batch_video_clip): +# for fid, frame in enumerate(video_clip): +# print(i, vid, fid) +# cv2.imshow('visualization', frame + 0.5) +# cv2.waitKey(100) + + +
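For reference, a minimal sketch of how the DataLoader and psnr_error utilities above can be exercised together. It assumes it is run from the Codes directory, that '../Data/ped2/training/frames' is a placeholder path containing one sub-folder of *.jpg frames per video, and it simply mirrors the clip layout used in train.py (4 history frames plus 1 predicted frame):

import tensorflow as tf
from utils import DataLoader, psnr_error

# Build the tf.data pipeline the same way train.py does, but with a tiny batch.
loader = DataLoader('../Data/ped2/training/frames', resize_height=256, resize_width=256)
dataset = loader(batch_size=2, time_steps=4, num_pred=1)

clips = dataset.make_one_shot_iterator().get_next()   # shape [2, 256, 256, 3 * 5], values in [-1, 1]
first_frame = clips[..., 0:3]                         # oldest frame of each clip
last_frame = clips[..., -3:]                          # frame the generator would have to predict

# PSNR of the trivial "copy the oldest frame" predictor against the clip's last frame.
baseline_psnr = psnr_error(gen_frames=first_frame, gt_frames=last_frame)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    print('copy-first-frame PSNR: {:.2f} dB'.format(sess.run(baseline_psnr)))

The same psnr_error op is what train.py reports as train_psnr_error once a trained generator supplies the predicted frames instead of this copy baseline.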
