| author | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
|---|---|---|
| committer | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| commit | fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch) | |
| tree | af7f6e759b5dec4fc2964daed09e903958b919ed | |
first commit
89 files changed, 8316 insertions, 0 deletions
diff --git a/Codes/constant.py b/Codes/constant.py new file mode 100644 index 0000000..eafeab9 --- /dev/null +++ b/Codes/constant.py @@ -0,0 +1,153 @@ +import os +import argparse +import configparser + + +def get_dir(directory): + """ + get the directory, if no such directory, then make it. + + @param directory: The new directory. + """ + + if not os.path.exists(directory): + os.makedirs(directory) + + return directory + + +def parser_args(): + parser = argparse.ArgumentParser(description='Options to run the network.') + parser.add_argument('-g', '--gpu', type=str, default='0', + help='the device id of gpu.') + parser.add_argument('-i', '--iters', type=int, default=1, + help='set the number of iterations, default is 1') + parser.add_argument('-b', '--batch', type=int, default=4, + help='set the batch size, default is 4.') + parser.add_argument('--num_his', type=int, default=4, + help='set the time steps, default is 4.') + + parser.add_argument('-d', '--dataset', type=str, + help='the name of dataset.') + parser.add_argument('--train_folder', type=str, default='', + help='set the training folder path.') + parser.add_argument('--test_folder', type=str, default='', + help='set the testing folder path.') + + parser.add_argument('--config', type=str, default='training_hyper_params/hyper_params.ini', + help='the path of training_hyper_params, default is training_hyper_params/hyper_params.ini') + + parser.add_argument('--snapshot_dir', type=str, default='', + help='if it is folder, then it is the directory to save models, ' + 'if it is a specific model.ckpt-xxx, then the system will load it for testing.') + parser.add_argument('--summary_dir', type=str, default='', help='the directory to save summaries.') + parser.add_argument('--psnr_dir', type=str, default='', help='the directory to save psnrs results in testing.') + + parser.add_argument('--evaluate', type=str, default='compute_auc', + help='the evaluation metric, default is compute_auc') + + return parser.parse_args() + + +class Const(object): + class ConstError(TypeError): + pass + + class ConstCaseError(ConstError): + pass + + def __setattr__(self, name, value): + if name in self.__dict__: + raise self.ConstError("Can't change const.{}".format(name)) + if not name.isupper(): + raise self.ConstCaseError('const name {} is not all uppercase'.format(name)) + + self.__dict__[name] = value + + def __str__(self): + _str = '<================ Constants information ================>\n' + for name, value in self.__dict__.items(): + print(name, value) + _str += '\t{}\t{}\n'.format(name, value) + + return _str + + +args = parser_args() +const = Const() + +# inputs constants +const.DATASET = args.dataset +const.TRAIN_FOLDER = args.train_folder +const.TEST_FOLDER = args.test_folder + +const.GPU = args.gpu + +const.BATCH_SIZE = args.batch +const.NUM_HIS = args.num_his +const.ITERATIONS = args.iters + +const.EVALUATE = args.evaluate + +# network constants +const.HEIGHT = 256 +const.WIDTH = 256 +const.FLOWNET_CHECKPOINT = 'flownet2/checkpoints/FlowNetSD/flownet-SD.ckpt-0' +const.FLOW_HEIGHT = 384 +const.FLOW_WIDTH = 512 + +# set training hyper-parameters of different datasets +config = configparser.ConfigParser() +assert config.read(args.config) + +# for lp loss. 
e.g, 1 or 2 for l1 and l2 loss, respectively) +const.L_NUM = config.getint(const.DATASET, 'L_NUM') +# the power to which each gradient term is raised in GDL loss +const.ALPHA_NUM = config.getint(const.DATASET, 'ALPHA_NUM') +# the percentage of the adversarial loss to use in the combined loss +const.LAM_ADV = config.getfloat(const.DATASET, 'LAM_ADV') +# the percentage of the lp loss to use in the combined loss +const.LAM_LP = config.getfloat(const.DATASET, 'LAM_LP') +# the percentage of the GDL loss to use in the combined loss +const.LAM_GDL = config.getfloat(const.DATASET, 'LAM_GDL') +# the percentage of the different frame loss +const.LAM_FLOW = config.getfloat(const.DATASET, 'LAM_FLOW') + +# Learning rate of generator +const.LRATE_G = eval(config.get(const.DATASET, 'LRATE_G')) +const.LRATE_G_BOUNDARIES = eval(config.get(const.DATASET, 'LRATE_G_BOUNDARIES')) + +# Learning rate of discriminator +const.LRATE_D = eval(config.get(const.DATASET, 'LRATE_D')) +const.LRATE_D_BOUNDARIES = eval(config.get(const.DATASET, 'LRATE_D_BOUNDARIES')) + + +const.SAVE_DIR = '{dataset}_l_{L_NUM}_alpha_{ALPHA_NUM}_lp_{LAM_LP}_' \ + 'adv_{LAM_ADV}_gdl_{LAM_GDL}_flow_{LAM_FLOW}'.format(dataset=const.DATASET, + L_NUM=const.L_NUM, + ALPHA_NUM=const.ALPHA_NUM, + LAM_LP=const.LAM_LP, LAM_ADV=const.LAM_ADV, + LAM_GDL=const.LAM_GDL, LAM_FLOW=const.LAM_FLOW) + +if args.snapshot_dir: + # if the snapshot_dir is model.ckpt-xxx, which means it is the single model for testing. + if os.path.exists(args.snapshot_dir + '.meta') or os.path.exists(args.snapshot_dir + '.data-00000-of-00001') or \ + os.path.exists(args.snapshot_dir + '.index'): + const.SNAPSHOT_DIR = args.snapshot_dir + print(const.SNAPSHOT_DIR) + else: + const.SNAPSHOT_DIR = get_dir(os.path.join('models', const.SAVE_DIR + '_' + args.snapshot_dir)) +else: + const.SNAPSHOT_DIR = get_dir(os.path.join('models', const.SAVE_DIR)) + +if args.summary_dir: + const.SUMMARY_DIR = get_dir(os.path.join('summary', const.SAVE_DIR + '_' + args.summary_dir)) +else: + const.SUMMARY_DIR = get_dir(os.path.join('summary', const.SAVE_DIR)) + +if args.psnr_dir: + const.PSNR_DIR = get_dir(os.path.join('psnrs', const.SAVE_DIR + '_' + args.psnr_dir)) +else: + const.PSNR_DIR = get_dir(os.path.join('psnrs', const.SAVE_DIR)) + + diff --git a/Codes/evaluate.py b/Codes/evaluate.py new file mode 100644 index 0000000..2bce871 --- /dev/null +++ b/Codes/evaluate.py @@ -0,0 +1,576 @@ +import numpy as np +import scipy.io as scio +import os +import argparse +import pickle +from sklearn import metrics +import json +import socket + + +# data folder contain all datasets, such as ped1, ped2, avenue, shanghaitech, etc +# DATA_DIR = '../Data' +hostname = socket.gethostname() +if hostname == 'dl-T8520-G10': # 119 + DATA_DIR = '/home/liuwen/ssd/datasets' +elif hostname == 'admin' or hostname == 'compute101' or hostname == 'compute113' or hostname == 'compute106' \ + or hostname == 'compute107' or hostname == 'compute114': # node02 + DATA_DIR = '/home/luowx/liuwen/datasets' +elif hostname == 'gpu13' or 'gpu14': + DATA_DIR = '/public/home/gaoshenghua/liuwen/datasets' +else: + # raise NotImplementedError('Not found this machine {}!'.format(hostname)) + DATA_DIR = '../Data' + + +# normalize scores in each sub video +NORMALIZE = True + +# number of history frames, since in prediction based method, the first 4 frames can not be predicted, so that +# the first 4frames are undecidable, we just ignore the first 4 frames +DECIDABLE_IDX = 4 + + +def parser_args(): + parser = 
argparse.ArgumentParser(description='evaluating the model, computing the roc/auc.') + + parser.add_argument('-f', '--file', type=str, help='the path of loss file.') + parser.add_argument('-t', '--type', type=str, default='compute_auc', + help='the type of evaluation, choosing type is: plot_roc, compute_auc, ' + 'test_func\n, the default type is compute_auc') + return parser.parse_args() + + +class RecordResult(object): + def __init__(self, fpr=None, tpr=None, auc=-np.inf, dataset=None, loss_file=None): + self.fpr = fpr + self.tpr = tpr + self.auc = auc + self.dataset = dataset + self.loss_file = loss_file + + def __lt__(self, other): + return self.auc < other.auc + + def __gt__(self, other): + return self.auc > other.auc + + def __str__(self): + return 'dataset = {}, loss file = {}, auc = {}'.format(self.dataset, self.loss_file, self.auc) + + +class GroundTruthLoader(object): + AVENUE = 'avenue' + PED1 = 'ped1' + PED1_PIXEL_SUBSET = 'ped1_pixel_subset' + PED2 = 'ped2' + ENTRANCE = 'enter' + EXIT = 'exit' + SHANGHAITECH = 'shanghaitech' + SHANGHAITECH_LABEL_PATH = os.path.join(DATA_DIR, 'shanghaitech/testing/test_frame_mask') + TOY_DATA = 'toydata' + TOY_DATA_LABEL_PATH = os.path.join(DATA_DIR, TOY_DATA, 'toydata.json') + + NAME_MAT_MAPPING = { + AVENUE: os.path.join(DATA_DIR, 'avenue/avenue.mat'), + PED1: os.path.join(DATA_DIR, 'ped1/ped1.mat'), + PED2: os.path.join(DATA_DIR, 'ped2/ped2.mat'), + ENTRANCE: os.path.join(DATA_DIR, 'enter/enter.mat'), + EXIT: os.path.join(DATA_DIR, 'exit/exit.mat') + } + + NAME_FRAMES_MAPPING = { + AVENUE: os.path.join(DATA_DIR, 'avenue/testing/frames'), + PED1: os.path.join(DATA_DIR, 'ped1/testing/frames'), + PED2: os.path.join(DATA_DIR, 'ped2/testing/frames'), + ENTRANCE: os.path.join(DATA_DIR, 'enter/testing/frames'), + EXIT: os.path.join(DATA_DIR, 'exit/testing/frames') + } + + def __init__(self, mapping_json=None): + """ + Initial a ground truth loader, which loads the ground truth with given dataset name. + + :param mapping_json: the mapping from dataset name to the path of ground truth. + """ + + if mapping_json is not None: + with open(mapping_json, 'rb') as json_file: + self.mapping = json.load(json_file) + else: + self.mapping = GroundTruthLoader.NAME_MAT_MAPPING + + def __call__(self, dataset): + """ get the ground truth by provided the name of dataset. + + :type dataset: str + :param dataset: the name of dataset. + :return: np.ndarray, shape(#video) + np.array[0] contains all the start frame and end frame of abnormal events of video 0, + and its shape is (#frapsnr, ) + """ + + if dataset == GroundTruthLoader.SHANGHAITECH: + gt = self.__load_shanghaitech_gt() + elif dataset == GroundTruthLoader.TOY_DATA: + gt = self.__load_toydata_gt() + else: + gt = self.__load_ucsd_avenue_subway_gt(dataset) + return gt + + def __load_ucsd_avenue_subway_gt(self, dataset): + assert dataset in self.mapping, 'there is no dataset named {} \n Please check {}' \ + .format(dataset, GroundTruthLoader.NAME_MAT_MAPPING.keys()) + + mat_file = self.mapping[dataset] + abnormal_events = scio.loadmat(mat_file, squeeze_me=True)['gt'] + + if abnormal_events.ndim == 2: + abnormal_events = abnormal_events.reshape(-1, abnormal_events.shape[0], abnormal_events.shape[1]) + + num_video = abnormal_events.shape[0] + dataset_video_folder = GroundTruthLoader.NAME_FRAMES_MAPPING[dataset] + video_list = os.listdir(dataset_video_folder) + video_list.sort() + + assert num_video == len(video_list), 'ground true does not match the number of testing videos. 
{} != {}' \ + .format(num_video, len(video_list)) + + # get the total frames of sub video + def get_video_length(sub_video_number): + # video_name = video_name_template.format(sub_video_number) + video_name = os.path.join(dataset_video_folder, video_list[sub_video_number]) + assert os.path.isdir(video_name), '{} is not directory!'.format(video_name) + + length = len(os.listdir(video_name)) + + return length + + # need to test [].append, or np.array().append(), which one is faster + gt = [] + for i in range(num_video): + length = get_video_length(i) + + sub_video_gt = np.zeros((length,), dtype=np.int8) + sub_abnormal_events = abnormal_events[i] + if sub_abnormal_events.ndim == 1: + sub_abnormal_events = sub_abnormal_events.reshape((sub_abnormal_events.shape[0], -1)) + + _, num_abnormal = sub_abnormal_events.shape + + for j in range(num_abnormal): + # (start - 1, end - 1) + start = sub_abnormal_events[0, j] - 1 + end = sub_abnormal_events[1, j] + + sub_video_gt[start: end] = 1 + + gt.append(sub_video_gt) + + return gt + + @staticmethod + def __load_shanghaitech_gt(): + video_path_list = os.listdir(GroundTruthLoader.SHANGHAITECH_LABEL_PATH) + video_path_list.sort() + + gt = [] + for video in video_path_list: + # print(os.path.join(GroundTruthLoader.SHANGHAITECH_LABEL_PATH, video)) + gt.append(np.load(os.path.join(GroundTruthLoader.SHANGHAITECH_LABEL_PATH, video))) + + return gt + + @staticmethod + def __load_toydata_gt(): + with open(GroundTruthLoader.TOY_DATA_LABEL_PATH, 'r') as gt_file: + gt_dict = json.load(gt_file) + + gt = [] + for video, video_info in gt_dict.items(): + length = video_info['length'] + video_gt = np.zeros((length,), dtype=np.uint8) + sub_gt = np.array(np.matrix(video_info['gt'])) + + for anomaly in sub_gt: + start = anomaly[0] + end = anomaly[1] + 1 + video_gt[start: end] = 1 + gt.append(video_gt) + return gt + + @staticmethod + def get_pixel_masks_file_list(dataset): + # pixel mask folder + pixel_mask_folder = os.path.join(DATA_DIR, dataset, 'pixel_masks') + pixel_mask_file_list = os.listdir(pixel_mask_folder) + pixel_mask_file_list.sort() + + # get all testing videos + dataset_video_folder = GroundTruthLoader.NAME_FRAMES_MAPPING[dataset] + video_list = os.listdir(dataset_video_folder) + video_list.sort() + + # get all testing video names with pixel masks + pixel_video_ids = [] + ids = 0 + for pixel_mask_name in pixel_mask_file_list: + while ids < len(video_list): + if video_list[ids] + '.npy' == pixel_mask_name: + pixel_video_ids.append(ids) + ids += 1 + break + else: + ids += 1 + + assert len(pixel_video_ids) == len(pixel_mask_file_list) + + for i in range(len(pixel_mask_file_list)): + pixel_mask_file_list[i] = os.path.join(pixel_mask_folder, pixel_mask_file_list[i]) + + return pixel_mask_file_list, pixel_video_ids + + +def load_psnr_gt(loss_file): + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... 
+ # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + + dataset = results['dataset'] + psnr_records = results['psnr'] + + num_videos = len(psnr_records) + + # load ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + assert num_videos == len(gt), 'the number of saved videos does not match the ground truth, {} != {}' \ + .format(num_videos, len(gt)) + + return dataset, psnr_records, gt + + +def load_psnr_gt_flow(loss_file): + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... + # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + + dataset = results['dataset'] + psnrs = results['psnr'] + flows = results['flow'] + + num_videos = len(psnrs) + + # load ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + assert num_videos == len(gt), 'the number of saved videos does not match the ground truth, {} != {}' \ + .format(num_videos, len(gt)) + + return dataset, psnrs, flows, gt + + +def load_psnr(loss_file): + """ + load image psnr or optical flow psnr. + :param loss_file: loss file path + :return: + """ + with open(loss_file, 'rb') as reader: + # results { + # 'dataset': the name of dataset + # 'psnr': the psnr of each testing videos, + # } + + # psnr_records['psnr'] is np.array, shape(#videos) + # psnr_records[0] is np.array ------> 01.avi + # psnr_records[1] is np.array ------> 02.avi + # ...... + # psnr_records[n] is np.array ------> xx.avi + + results = pickle.load(reader) + psnrs = results['psnr'] + return psnrs + + +def get_scores_labels(loss_file): + # the name of dataset, loss, and ground truth + dataset, psnr_records, gt = load_psnr_gt(loss_file=loss_file) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + if NORMALIZE: + distance -= distance.min() # distances = (distance - min) / (max - min) + distance /= distance.max() + # distance = 1 - distance + + scores = np.concatenate((scores[:], distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels[:], gt[i][DECIDABLE_IDX:]), axis=0) + return dataset, scores, labels + + +def precision_recall_auc(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult() + for sub_loss_file in loss_file_list: + dataset, scores, labels = get_scores_labels(sub_loss_file) + precision, recall, thresholds = metrics.precision_recall_curve(labels, scores, pos_label=0) + auc = metrics.auc(recall, precision) + + results = RecordResult(recall, precision, auc, dataset, sub_loss_file) + + if optimal_results < results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def cal_eer(fpr, tpr): + # makes fpr + tpr = 1 + eer = fpr[np.nanargmin(np.absolute((fpr + tpr - 1)))] + return eer + + +def compute_eer(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list 
= os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult(auc=np.inf) + for sub_loss_file in loss_file_list: + dataset, scores, labels = get_scores_labels(sub_loss_file) + fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=0) + eer = cal_eer(fpr, tpr) + + results = RecordResult(fpr, tpr, eer, dataset, sub_loss_file) + + if optimal_results > results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def compute_auc(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + optimal_results = RecordResult() + for sub_loss_file in loss_file_list: + # the name of dataset, loss, and ground truth + dataset, psnr_records, gt = load_psnr_gt(loss_file=sub_loss_file) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + if NORMALIZE: + distance -= distance.min() # distances = (distance - min) / (max - min) + distance /= distance.max() + # distance = 1 - distance + + scores = np.concatenate((scores, distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels, gt[i][DECIDABLE_IDX:]), axis=0) + + fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=0) + auc = metrics.auc(fpr, tpr) + + results = RecordResult(fpr, tpr, auc, dataset, sub_loss_file) + + if optimal_results < results: + optimal_results = results + + if os.path.isdir(loss_file): + print(results) + print('##### optimal result and model = {}'.format(optimal_results)) + return optimal_results + + +def average_psnr(loss_file): + if not os.path.isdir(loss_file): + loss_file_list = [loss_file] + else: + loss_file_list = os.listdir(loss_file) + loss_file_list = [os.path.join(loss_file, sub_loss_file) for sub_loss_file in loss_file_list] + + max_avg_psnr = -np.inf + max_file = '' + for file in loss_file_list: + psnr_records = load_psnr(file) + + psnr_records = np.concatenate(psnr_records, axis=0) + avg_psnr = np.mean(psnr_records) + if max_avg_psnr < avg_psnr: + max_avg_psnr = avg_psnr + max_file = file + print('{}, average psnr = {}'.format(file, avg_psnr)) + + print('max average psnr file = {}, psnr = {}'.format(max_file, max_avg_psnr)) + + +def calculate_psnr(loss_file): + optical_result = compute_auc(loss_file) + print('##### optimal result and model = {}'.format(optical_result)) + + mean_psnr = [] + for file in os.listdir(loss_file): + file = os.path.join(loss_file, file) + dataset, psnr_records, gt = load_psnr_gt(file) + + psnr_records = np.concatenate(psnr_records, axis=0) + gt = np.concatenate(gt, axis=0) + + mean_normal_psnr = np.mean(psnr_records[gt == 0]) + mean_abnormal_psnr = np.mean(psnr_records[gt == 1]) + mean = np.mean(psnr_records) + print('mean normal psrn = {}, mean abnormal psrn = {}, mean = {}'.format( + mean_normal_psnr, + mean_abnormal_psnr, + mean) + ) + mean_psnr.append(mean) + print('max mean psnr = {}'.format(np.max(mean_psnr))) + + +def calculate_score(loss_file): + if not os.path.isdir(loss_file): + loss_file_path = loss_file + else: + optical_result = compute_auc(loss_file) + loss_file_path = optical_result.loss_file + 
print('##### optimal result and model = {}'.format(optical_result)) + dataset, psnr_records, gt = load_psnr_gt(loss_file=loss_file_path) + + # the number of videos + num_videos = len(psnr_records) + + scores = np.array([], dtype=np.float32) + labels = np.array([], dtype=np.int8) + # video normalization + for i in range(num_videos): + distance = psnr_records[i] + + distance = (distance - distance.min()) / (distance.max() - distance.min()) + + scores = np.concatenate((scores, distance[DECIDABLE_IDX:]), axis=0) + labels = np.concatenate((labels, gt[i][DECIDABLE_IDX:]), axis=0) + + mean_normal_scores = np.mean(scores[labels == 0]) + mean_abnormal_scores = np.mean(scores[labels == 1]) + print('mean normal scores = {}, mean abnormal scores = {}, ' + 'delta = {}'.format(mean_normal_scores, mean_abnormal_scores, mean_normal_scores - mean_abnormal_scores)) + + +def test_func(*args): + # simulate testing on CUHK AVENUE dataset + dataset = GroundTruthLoader.AVENUE + + # load the ground truth + gt_loader = GroundTruthLoader() + gt = gt_loader(dataset=dataset) + + num_videos = len(gt) + + simulated_results = { + 'dataset': dataset, + 'psnr': [] + } + + simulated_psnr = [] + for i in range(num_videos): + sub_video_length = gt[i].shape[0] + simulated_psnr.append(np.random.random(size=sub_video_length)) + + simulated_results['psnr'] = simulated_psnr + + # writing to file, 'generated_loss.bin' + with open('generated_loss.bin', 'wb') as writer: + pickle.dump(simulated_results, writer, pickle.HIGHEST_PROTOCOL) + + print(file_path.name) + result = compute_auc(file_path.name) + + print('optimal = {}'.format(result)) + + +eval_type_function = { + 'compute_auc': compute_auc, + 'compute_eer': compute_eer, + 'precision_recall_auc': precision_recall_auc, + 'calculate_psnr': calculate_psnr, + 'calculate_score': calculate_score, + 'average_psnr': average_psnr, + 'average_psnr_sample': average_psnr +} + + +def evaluate(eval_type, save_file): + assert eval_type in eval_type_function, 'there is no type of evaluation {}, please check {}' \ + .format(eval_type, eval_type_function.keys()) + eval_func = eval_type_function[eval_type] + optimal_results = eval_func(save_file) + return optimal_results + + +if __name__ == '__main__': + args = parser_args() + + eval_type = args.type + file_path = args.file + + print('Evaluate type = {}'.format(eval_type)) + print('File path = {}'.format(file_path)) + + if eval_type == 'test_func': + test_func() + else: + evaluate(eval_type, file_path)
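As a reading aid for `Codes/evaluate.py` above, here is a minimal, self-contained sketch (not code from this commit; the PSNR curves and labels are made-up placeholders) of the scoring scheme used by `compute_auc`: each test video's PSNR curve is min-max normalized, the first `DECIDABLE_IDX` frames are dropped, and a frame-level ROC/AUC is computed with `pos_label=0`, since a higher PSNR indicates a more normal frame.

```python
# Sketch only: mirrors the logic of compute_auc in Codes/evaluate.py with fake data.
import numpy as np
from sklearn import metrics

DECIDABLE_IDX = 4  # the first 4 frames cannot be predicted, so they are ignored

# placeholder PSNR curves and frame-level ground truth for two test videos
psnr_records = [np.random.random(200), np.random.random(150)]
gt = [np.zeros(200, dtype=np.int8), np.zeros(150, dtype=np.int8)]
gt[0][50:80] = 1  # pretend frames 50-79 of the first video are abnormal

scores = np.array([], dtype=np.float32)
labels = np.array([], dtype=np.int8)
for distance, sub_gt in zip(psnr_records, gt):
    # per-video min-max normalization, as done when NORMALIZE is True
    distance = (distance - distance.min()) / (distance.max() - distance.min())
    scores = np.concatenate((scores, distance[DECIDABLE_IDX:]))
    labels = np.concatenate((labels, sub_gt[DECIDABLE_IDX:]))

# higher PSNR = more normal, hence pos_label=0 as in evaluate.py
fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=0)
print('frame-level AUC = {:.4f}'.format(metrics.auc(fpr, tpr)))
```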
\ No newline at end of file diff --git a/Codes/flownet2/.gitignore b/Codes/flownet2/.gitignore new file mode 100644 index 0000000..31abf4e --- /dev/null +++ b/Codes/flownet2/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +*$py.class +*.o +*.so +*.so.dSYM +checkpoints/ +!checkpoints/download.sh +!checkpoints/README.md diff --git a/Codes/flownet2/LICENSE b/Codes/flownet2/LICENSE new file mode 100644 index 0000000..d2cc224 --- /dev/null +++ b/Codes/flownet2/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Sam Pepose + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Codes/flownet2/Makefile b/Codes/flownet2/Makefile new file mode 100644 index 0000000..073c011 --- /dev/null +++ b/Codes/flownet2/Makefile @@ -0,0 +1,82 @@ +# Makefile + +TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` + +ifndef CUDA_HOME + CUDA_HOME := /usr/local/cuda +endif + +CC = gcc -O2 -pthread +CXX = g++ +GPUCC = nvcc +CFLAGS = -std=c++11 -I$(TF_INC) -I"$(CUDA_HOME)/include" -DGOOGLE_CUDA=1 +GPUCFLAGS = -c +LFLAGS = -pthread -shared -fPIC +GPULFLAGS = -x cu -Xcompiler -fPIC +CGPUFLAGS = -L$(CUDA_HOME)/lib -L$(CUDA_HOME)/lib64 -lcudart + +OUT_DIR = src/ops/build +PREPROCESSING_SRC = "src/ops/preprocessing/preprocessing.cc" "src/ops/preprocessing/kernels/flow_augmentation.cc" "src/ops/preprocessing/kernels/augmentation_base.cc" "src/ops/preprocessing/kernels/data_augmentation.cc" +GPU_SRC_DATA_AUG = src/ops/preprocessing/kernels/data_augmentation.cu.cc +GPU_SRC_FLOW = src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc +GPU_PROD_DATA_AUG = $(OUT_DIR)/data_augmentation.o +GPU_PROD_FLOW = $(OUT_DIR)/flow_augmentation_gpu.o +PREPROCESSING_PROD = $(OUT_DIR)/preprocessing.so + +DOWNSAMPLE_SRC = "src/ops/downsample/downsample_kernel.cc" "src/ops/downsample/downsample_op.cc" +GPU_SRC_DOWNSAMPLE = src/ops/downsample/downsample_kernel_gpu.cu.cc +GPU_PROD_DOWNSAMPLE = $(OUT_DIR)/downsample_kernel_gpu.o +DOWNSAMPLE_PROD = $(OUT_DIR)/downsample.so + +CORRELATION_SRC = "src/ops/correlation/correlation_kernel.cc" "src/ops/correlation/correlation_grad_kernel.cc" "src/ops/correlation/correlation_op.cc" +GPU_SRC_CORRELATION = src/ops/correlation/correlation_kernel.cu.cc +GPU_SRC_CORRELATION_GRAD = src/ops/correlation/correlation_grad_kernel.cu.cc +GPU_SRC_PAD = src/ops/correlation/pad.cu.cc +GPU_PROD_CORRELATION = $(OUT_DIR)/correlation_kernel_gpu.o +GPU_PROD_CORRELATION_GRAD = $(OUT_DIR)/correlation_grad_kernel_gpu.o +GPU_PROD_PAD = $(OUT_DIR)/correlation_pad_gpu.o +CORRELATION_PROD = 
$(OUT_DIR)/correlation.so + +FLOWWARP_SRC = "src/ops/flow_warp/flow_warp_op.cc" "src/ops/flow_warp/flow_warp.cc" "src/ops/flow_warp/flow_warp_grad.cc" +GPU_SRC_FLOWWARP = "src/ops/flow_warp/flow_warp.cu.cc" +GPU_SRC_FLOWWARP_GRAD = "src/ops/flow_warp/flow_warp_grad.cu.cc" +GPU_PROD_FLOWWARP = "$(OUT_DIR)/flow_warp_gpu.o" +GPU_PROD_FLOWWARP_GRAD = "$(OUT_DIR)/flow_warp_grad_gpu.o" +FLOWWARP_PROD = "$(OUT_DIR)/flow_warp.so" + +ifeq ($(OS),Windows_NT) + detected_OS := Windows +else + detected_OS := $(shell sh -c 'uname -s 2>/dev/null || echo not') +endif +ifeq ($(detected_OS),Darwin) # Mac OS X + CGPUFLAGS += -undefined dynamic_lookup +endif +ifeq ($(detected_OS),Linux) + CFLAGS += -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__ -D_GLIBCXX_USE_CXX11_ABI=0 +endif + +all: preprocessing downsample correlation flowwarp + +preprocessing: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_DATA_AUG) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_DATA_AUG) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOW) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOW) + $(CXX) -g $(CFLAGS) $(PREPROCESSING_SRC) $(GPU_PROD_DATA_AUG) $(GPU_PROD_FLOW) $(LFLAGS) $(CGPUFLAGS) -o $(PREPROCESSING_PROD) + +downsample: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_DOWNSAMPLE) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_DOWNSAMPLE) + $(CXX) -g $(CFLAGS) $(DOWNSAMPLE_SRC) $(GPU_PROD_DOWNSAMPLE) $(LFLAGS) $(CGPUFLAGS) -o $(DOWNSAMPLE_PROD) + +correlation: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_CORRELATION) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_CORRELATION) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_CORRELATION_GRAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_CORRELATION_GRAD) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_PAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_PAD) + $(CXX) -g $(CFLAGS) $(CORRELATION_SRC) $(GPU_PROD_CORRELATION) $(GPU_PROD_CORRELATION_GRAD) $(GPU_PROD_PAD) $(LFLAGS) $(CGPUFLAGS) -o $(CORRELATION_PROD) + +flowwarp: + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOWWARP) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOWWARP) + $(GPUCC) -g $(CFLAGS) $(GPUCFLAGS) $(GPU_SRC_FLOWWARP_GRAD) $(GPULFLAGS) $(GPUDEF) -o $(GPU_PROD_FLOWWARP_GRAD) + $(CXX) -g $(CFLAGS) $(FLOWWARP_SRC) $(GPU_PROD_FLOWWARP) $(GPU_PROD_FLOWWARP_GRAD) $(LFLAGS) $(CGPUFLAGS) -o $(FLOWWARP_PROD) + +clean: + rm -f $(PREPROCESSING_PROD) $(GPU_PROD_FLOW) $(GPU_PROD_DATA_AUG) $(DOWNSAMPLE_PROD) $(GPU_PROD_DOWNSAMPLE) diff --git a/Codes/flownet2/README.md b/Codes/flownet2/README.md new file mode 100644 index 0000000..8647723 --- /dev/null +++ b/Codes/flownet2/README.md @@ -0,0 +1,66 @@ +## FlowNet2 (TensorFlow) + +This repo contains FlowNet2[1] for TensorFlow. It includes FlowNetC, S, CS, CSS, CSS-ft-sd, SD, and 2. + +### Installation +``` +pip install enum +pip install pypng +pip install matplotlib +pip install image +pip install scipy +pip install numpy +pip install tensorflow +``` + +Linux: +`sudo apt-get install python-tk` + +You must have CUDA installed: +`make all` + +### Download weights +To download the weights for all models (4.4GB), run the `download.sh` script in the `checkpoints` directory. All test scripts rely on these checkpoints to work properly. 
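The anomaly-detection code in `Codes/constant.py` points at one of these weights via `FLOWNET_CHECKPOINT = 'flownet2/checkpoints/FlowNetSD/flownet-SD.ckpt-0'`. A small stdlib-only sanity check is sketched below (not part of this repo; the checkpoint file suffixes are assumed to follow the usual TensorFlow `.index`/`.meta`/`.data-*` layout, the same suffixes `constant.py` itself probes for snapshot files):

```python
# Sketch only (assumption): verify the FlowNetSD weights referenced by Codes/constant.py exist.
import os

ckpt = 'flownet2/checkpoints/FlowNetSD/flownet-SD.ckpt-0'
# the same file suffixes Codes/constant.py checks when given a model.ckpt-xxx path
suffixes = ('.index', '.meta', '.data-00000-of-00001')
missing = [ckpt + s for s in suffixes if not os.path.exists(ckpt + s)]

if missing:
    print('Run checkpoints/download.sh first; missing: {}'.format(missing))
else:
    print('FlowNetSD checkpoint found: {}'.format(ckpt))
```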
+ + +### Flow Generation (1 image pair) + +``` +python -m src.flownet2.test --input_a data/samples/0img0.ppm --input_b data/samples/0img1.ppm --out ./ +``` + +Available models: +* `flownet2` +* `flownet_s` +* `flownet_c` +* `flownet_cs` +* `flownet_css` (can edit test.py to use css-ft-sd weights) +* `flownet_sd` + +If installation is successful, you should predict the following flow from samples/0img0.ppm: + + +### Training +If you would like to train any of the networks from scratch (replace `flownet2` with the appropriate model): +``` +python -m src.flownet2.train +``` +For stacked networks, previous network weights will be loaded and fixed. For example, if training CS, the C weights are loaded and fixed and the S weights are randomly initialized. + + +### Fine-tuning +TODO + +### Benchmarks +Benchmarks are for a forward pass with each model of two 512x384 images. All benchmarks were tested with a K80 GPU and Intel Xeon CPU E5-2682 v4 @ 2.30GHz. Code was executed with TensorFlow-1.2.1 and python 2.7.12 on Ubuntu 16.04. Resulting times were averaged over 10 runs. The first run is always slower as it sets up the Tensorflow Session. + +| | S | C | CS | CSS | SD | 2 +| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | +| First Run | 681.039ms | 898.792ms | 998.584ms | 1063.357ms | 933.806ms | 1882.003ms | +| Subsequent Runs | 38.067ms | 78.789ms | 123.300ms | 161.186ms | 62.061ms | 276.641ms | + + +### Sources +[1] E. Ilg, N. Mayer, T. Saikia, M. Keuper, A. Dosovitskiy, T. Brox +FlowNet 2.0: Evolution of Optical Flow Estimation with Deep Networks, +IEEE Conference in Computer Vision and Pattern Recognition (CVPR), 2017. diff --git a/Codes/flownet2/__init__.py b/Codes/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/__init__.py diff --git a/Codes/flownet2/corr.py b/Codes/flownet2/corr.py new file mode 100644 index 0000000..3301d8c --- /dev/null +++ b/Codes/flownet2/corr.py @@ -0,0 +1,45 @@ +import tensorflow as tf +import numpy as np +import math + +BATCH_SIZE = 8 +HEIGHT = 30 +WIDTH = 60 +CHANNELS = 3 + +NEIGHBORHOOD_SIZE = 41 +MAX_DISPLACEMENT = int(math.ceil(NEIGHBORHOOD_SIZE / 2.0)) +STRIDE_2 = 2 + +assert(STRIDE_2 <= NEIGHBORHOOD_SIZE) + +# Define two feature maps +fmA = tf.ones((BATCH_SIZE, HEIGHT, WIDTH, CHANNELS), dtype=tf.int32) +fmB = tf.convert_to_tensor(np.random.randint(5, size=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS)), dtype=tf.int32) + +depth = int(math.floor((2.0 * MAX_DISPLACEMENT + 1) / STRIDE_2) ** 2) + +print('Output should be size:', (BATCH_SIZE, HEIGHT, WIDTH, depth)) +print('Striding at values: ', [e for e in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2)]) + +def main(): + out = [] + for i in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2): # height + for j in range(-MAX_DISPLACEMENT + 1, MAX_DISPLACEMENT, STRIDE_2): # width + padded_a = tf.pad(fmA, [[0,0], [0, abs(i)], [0, abs(j)], [0, 0]]) + padded_b = tf.pad(fmB, [[0, 0], [abs(i), 0], [abs(j), 0], [0, 0]]) + m = padded_a * padded_b + + height_start_idx = 0 if i <= 0 else i + height_end_idx = height_start_idx + HEIGHT + width_start_idx = 0 if j <= 0 else j + width_end_idx = width_start_idx + WIDTH + cut = m[:, height_start_idx:height_end_idx, width_start_idx:width_end_idx, :] + + final = tf.reduce_sum(cut, 3) + out.append(final) + corr = tf.stack(out, 3) + print('Output size: ', corr.shape) + + +main() diff --git a/Codes/flownet2/src/__init__.py b/Codes/flownet2/src/__init__.py new file 
mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/__init__.py diff --git a/Codes/flownet2/src/correlation.py b/Codes/flownet2/src/correlation.py new file mode 100644 index 0000000..60a5c37 --- /dev/null +++ b/Codes/flownet2/src/correlation.py @@ -0,0 +1,35 @@ +import tensorflow as tf + +_correlation_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/correlation.so")) + + +def correlation(input_a, input_b, kernel_size, max_displacement, stride_1, stride_2, padding): + return _correlation_ops.correlation(input_a, + input_b, + kernel_size, + max_displacement, + stride_1, + stride_2, + padding) + + +@tf.RegisterGradient("Correlation") +def _correlation_grad(corr_op, gradients): + kernel_size = corr_op.get_attr("kernel_size") + max_displacement = corr_op.get_attr("max_displacement") + stride_1 = corr_op.get_attr("stride_1") + stride_2 = corr_op.get_attr("stride_2") + pad = corr_op.get_attr("pad") + + corr_grads = _correlation_ops.correlation_grad(gradients, + corr_op.inputs[0], + corr_op.inputs[1], + kernel_size, + max_displacement, + stride_1, + stride_2, + pad) + + # Return the gradients with respect to input_a and input_b + return corr_grads.backprops_a, corr_grads.backprops_b diff --git a/Codes/flownet2/src/dataloader.py b/Codes/flownet2/src/dataloader.py new file mode 100644 index 0000000..22a6ddb --- /dev/null +++ b/Codes/flownet2/src/dataloader.py @@ -0,0 +1,329 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +import copy +slim = tf.contrib.slim + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/preprocessing.so")) + + +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +class Image(slim.tfexample_decoder.ItemHandler): + """An ItemHandler that decodes a parsed Tensor as an image.""" + + def __init__(self, + image_key=None, + format_key=None, + shape=None, + channels=3, + dtype=tf.uint8, + repeated=False): + """Initializes the image. + Args: + image_key: the name of the TF-Example feature in which the encoded image + is stored. + shape: the output shape of the image as 1-D `Tensor` + [height, width, channels]. If provided, the image is reshaped + accordingly. If left as None, no reshaping is done. A shape should + be supplied only if all the stored images have the same shape. + channels: the number of channels in the image. + dtype: images will be decoded at this bit depth. Different formats + support different bit depths. + See tf.image.decode_image, + tf.decode_raw, + repeated: if False, decodes a single image. If True, decodes a + variable number of image strings from a 1D tensor of strings. + """ + if not image_key: + image_key = 'image/encoded' + + super(Image, self).__init__([image_key]) + self._image_key = image_key + self._shape = shape + self._channels = channels + self._dtype = dtype + self._repeated = repeated + + def tensors_to_item(self, keys_to_tensors): + """See base class.""" + image_buffer = keys_to_tensors[self._image_key] + + if self._repeated: + return functional_ops.map_fn(lambda x: self._decode(x), + image_buffer, dtype=self._dtype) + else: + return self._decode(image_buffer) + + def _decode(self, image_buffer): + """Decodes the image buffer. + Args: + image_buffer: The tensor representing the encoded image tensor. + Returns: + A tensor that represents decoded image of self._shape, or + (?, ?, self._channels) if self._shape is not specified. 
+ """ + def decode_raw(): + """Decodes a raw image.""" + return tf.decode_raw(image_buffer, out_type=self._dtype) + + image = decode_raw() + # image.set_shape([None, None, self._channels]) + if self._shape is not None: + image = tf.reshape(image, self._shape) + + return image + + +def __get_dataset(dataset_config, split_name): + """ + dataset_config: A dataset_config defined in datasets.py + split_name: 'train'/'validate' + """ + with tf.name_scope('__get_dataset'): + if split_name not in dataset_config['SIZES']: + raise ValueError('split name %s not recognized' % split_name) + + IMAGE_HEIGHT, IMAGE_WIDTH = dataset_config['IMAGE_HEIGHT'], dataset_config['IMAGE_WIDTH'] + reader = tf.TFRecordReader + keys_to_features = { + 'image_a': tf.FixedLenFeature((), tf.string), + 'image_b': tf.FixedLenFeature((), tf.string), + 'flow': tf.FixedLenFeature((), tf.string), + } + items_to_handlers = { + 'image_a': Image( + image_key='image_a', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'image_b': Image( + image_key='image_b', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'flow': Image( + image_key='flow', + dtype=tf.float32, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 2], + channels=2), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + return slim.dataset.Dataset( + data_sources=dataset_config['PATHS'][split_name], + reader=reader, + decoder=decoder, + num_samples=dataset_config['SIZES'][split_name], + items_to_descriptions=dataset_config['ITEMS_TO_DESCRIPTIONS']) + + +def config_to_arrays(dataset_config): + output = { + 'name': [], + 'rand_type': [], + 'exp': [], + 'mean': [], + 'spread': [], + 'prob': [], + 'coeff_schedule': [], + } + config = copy.deepcopy(dataset_config) + + if 'coeff_schedule_param' in config: + del config['coeff_schedule_param'] + + # Get all attributes + for (name, value) in config.iteritems(): + if name == 'coeff_schedule_param': + output['coeff_schedule'] = [value['half_life'], + value['initial_coeff'], + value['final_coeff']] + else: + output['name'].append(name) + output['rand_type'].append(value['rand_type']) + output['exp'].append(value['exp']) + output['mean'].append(value['mean']) + output['spread'].append(value['spread']) + output['prob'].append(value['prob']) + + return output + + +# https://github.com/tgebru/transform/blob/master/src/caffe/layers/data_augmentation_layer.cpp#L34 +def _generate_coeff(param, discount_coeff=tf.constant(1.0), default_value=tf.constant(0.0)): + if not all(name in param for name in ['rand_type', 'exp', 'mean', 'spread', 'prob']): + raise RuntimeError('Expected rand_type, exp, mean, spread, prob in `param`') + + rand_type = param['rand_type'] + exp = float(param['exp']) + mean = tf.convert_to_tensor(param['mean'], dtype=tf.float32) + spread = float(param['spread']) # AKA standard deviation + prob = float(param['prob']) + + # Multiply spread by our discount_coeff so it changes over time + spread = spread * discount_coeff + + if rand_type == 'uniform': + value = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'gaussian': + value = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'bernoulli': + if prob > 0.0: + value = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + value = 0.0 + elif rand_type == 'uniform_bernoulli': + tmp1 = 0.0 + tmp2 
= 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + elif rand_type == 'gaussian_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + else: + raise ValueError('Unknown distribution type %s.' % rand_type) + return value + + +def load_batch(dataset_config, split_name, global_step): + num_threads = 32 + reader_kwargs = {'options': tf.python_io.TFRecordOptions( + tf.python_io.TFRecordCompressionType.ZLIB)} + + with tf.name_scope('load_batch'): + dataset = __get_dataset(dataset_config, split_name) + data_provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=num_threads, + common_queue_capacity=2048, + common_queue_min=1024, + reader_kwargs=reader_kwargs) + image_a, image_b, flow = data_provider.get(['image_a', 'image_b', 'flow']) + image_a, image_b, flow = map(tf.to_float, [image_a, image_b, flow]) + + if dataset_config['PREPROCESS']['scale']: + image_a = image_a / 255.0 + image_b = image_b / 255.0 + + crop = [dataset_config['PREPROCESS']['crop_height'], + dataset_config['PREPROCESS']['crop_width']] + config_a = config_to_arrays(dataset_config['PREPROCESS']['image_a']) + config_b = config_to_arrays(dataset_config['PREPROCESS']['image_b']) + + image_as, image_bs, flows = map(lambda x: tf.expand_dims(x, 0), [image_a, image_b, flow]) + + # Perform data augmentation on GPU + with tf.device('/cpu:0'): + image_as, image_bs, transforms_from_a, transforms_from_b = \ + _preprocessing_ops.data_augmentation(image_as, + image_bs, + global_step, + crop, + config_a['name'], + config_a['rand_type'], + config_a['exp'], + config_a['mean'], + config_a['spread'], + config_a['prob'], + config_a['coeff_schedule'], + config_b['name'], + config_b['rand_type'], + config_b['exp'], + config_b['mean'], + config_b['spread'], + config_b['prob'], + config_b['coeff_schedule']) + + noise_coeff_a = None + noise_coeff_b = None + + # Generate and apply noise coeff for A if defined in A params + if 'noise' in dataset_config['PREPROCESS']['image_a']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_a']: + initial_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + + noise_coeff_a = _generate_coeff( + dataset_config['PREPROCESS']['image_a']['noise'], discount_coeff) + noise_a = tf.random_normal(shape=tf.shape(image_as), + mean=0.0, stddev=noise_coeff_a, + dtype=tf.float32) + image_as = tf.clip_by_value(image_as + noise_a, 0.0, 1.0) + + # Generate noise coeff for B if defined in B params + if 'noise' in dataset_config['PREPROCESS']['image_b']: + discount_coeff = tf.constant(1.0) + if 
'coeff_schedule_param' in dataset_config['PREPROCESS']['image_b']: + initial_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + noise_coeff_b = _generate_coeff( + dataset_config['PREPROCESS']['image_b']['noise'], discount_coeff) + + # Combine coeff from a with coeff from b + if noise_coeff_a is not None: + if noise_coeff_b is not None: + noise_coeff_b = noise_coeff_a * noise_coeff_b + else: + noise_coeff_b = noise_coeff_a + + # Add noise to B if needed + if noise_coeff_b is not None: + noise_b = tf.random_normal(shape=tf.shape(image_bs), + mean=0.0, stddev=noise_coeff_b, + dtype=tf.float32) + image_bs = tf.clip_by_value(image_bs + noise_b, 0.0, 1.0) + + # Perform flow augmentation using spatial parameters from data augmentation + flows = _preprocessing_ops.flow_augmentation( + flows, transforms_from_a, transforms_from_b, crop) + + return tf.train.batch([image_as, image_bs, flows], + enqueue_many=True, + batch_size=dataset_config['BATCH_SIZE'], + capacity=dataset_config['BATCH_SIZE'] * 4, + num_threads=num_threads, + allow_smaller_final_batch=False) diff --git a/Codes/flownet2/src/dataset_configs.py b/Codes/flownet2/src/dataset_configs.py new file mode 100644 index 0000000..fbda5d0 --- /dev/null +++ b/Codes/flownet2/src/dataset_configs.py @@ -0,0 +1,153 @@ +""" +Add dataset configurations here. Each dataset must have the following structure: + +NAME = { + IMAGE_HEIGHT: int, + IMAGE_WIDTH: int, + ITEMS_TO_DESCRIPTIONS: { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + SIZES: { + 'train': int, + 'validate': int, (optional) + ... + }, + BATCH_SIZE: int, + PATHS: { + 'train': '', + 'validate': '', (optional) + ... 
+ } +} +""" + +""" +note that one step = one batch of data processed, ~not~ an entire epoch +'coeff_schedule_param': { + 'half_life': 50000, after this many steps, the value will be i + (f - i)/2 + 'initial_coeff': 0.5, initial value + 'final_coeff': 1, final value +}, +""" + +FLYING_CHAIRS_DATASET_CONFIG = { + 'IMAGE_HEIGHT': 384, + 'IMAGE_WIDTH': 512, + 'ITEMS_TO_DESCRIPTIONS': { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + 'SIZES': { + 'train': 22232, + 'validate': 640, + 'sample': 8, + }, + 'BATCH_SIZE': 8, + 'PATHS': { + 'train': './data/tfrecords/fc_train.tfrecords', + 'validate': './data/tfrecords/fc_val.tfrecords', + 'sample': './data/tfrecords/fc_sample.tfrecords', + }, + 'PREPROCESS': { + 'scale': False, + 'crop_height': 320, + 'crop_width': 448, + 'image_a': { + 'translate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0.2, + 'spread': 0.4, + 'prob': 1.0, + }, + 'squeeze': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.3, + 'prob': 1.0, + }, + 'noise': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0.03, + 'spread': 0.03, + 'prob': 1.0, + }, + }, + # All preprocessing to image A will be applied to image B in addition to the following. + 'image_b': { + 'translate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'gamma': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'brightness': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'contrast': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'color': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'coeff_schedule_param': { + 'half_life': 50000, + 'initial_coeff': 0.5, + 'final_coeff': 1, + }, + } + }, +} diff --git a/Codes/flownet2/src/downsample.py b/Codes/flownet2/src/downsample.py new file mode 100644 index 0000000..5e6fc95 --- /dev/null +++ b/Codes/flownet2/src/downsample.py @@ -0,0 +1,8 @@ +import tensorflow as tf + +_downsample = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/downsample.so")) + + +def downsample(tensor, size): + return _downsample.downsample(tensor, size) diff --git a/Codes/flownet2/src/flow_warp.py b/Codes/flownet2/src/flow_warp.py new file mode 100644 index 0000000..fe5fd4d --- /dev/null +++ b/Codes/flownet2/src/flow_warp.py @@ -0,0 +1,15 @@ +import tensorflow as tf + +_flow_warp_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/flow_warp.so")) + + +def flow_warp(image, flow): + return _flow_warp_ops.flow_warp(image, flow) + + +@tf.RegisterGradient("FlowWarp") +def _flow_warp_grad(flow_warp_op, gradients): + return _flow_warp_ops.flow_warp_grad(flow_warp_op.inputs[0], + flow_warp_op.inputs[1], + gradients) diff --git a/Codes/flownet2/src/flowlib.py 
b/Codes/flownet2/src/flowlib.py new file mode 100644 index 0000000..36c56d4 --- /dev/null +++ b/Codes/flownet2/src/flowlib.py @@ -0,0 +1,554 @@ +#!/usr/bin/python +""" +# ============================== +# flowlib.py +# library for optical flow processing +# Author: Ruoteng Li +# Date: 6th Aug 2016 +# ============================== +""" +import png +import numpy as np +import matplotlib.colors as cl +import matplotlib.pyplot as plt +from PIL import Image +import tensorflow as tf + + +UNKNOWN_FLOW_THRESH = 1e7 +SMALLFLOW = 0.0 +LARGEFLOW = 1e8 + +""" +============= +Flow Section +============= +""" + + +def show_flow(filename): + """ + visualize optical flow map using matplotlib + :param filename: optical flow file + :return: None + """ + flow = read_flow(filename) + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + + +def visualize_flow(flow, mode='Y'): + """ + this function visualize the input flow + :param flow: input flow in array + :param mode: choose which color mode to visualize the flow (Y: Ccbcr, RGB: RGB color) + :return: None + """ + if mode == 'Y': + # Ccbcr color wheel + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + elif mode == 'RGB': + (h, w) = flow.shape[0:2] + du = flow[:, :, 0] + dv = flow[:, :, 1] + valid = flow[:, :, 2] + max_flow = max(np.max(du), np.max(dv)) + img = np.zeros((h, w, 3), dtype=np.float64) + # angle layer + img[:, :, 0] = np.arctan2(dv, du) / (2 * np.pi) + # magnitude layer, normalized to 1 + img[:, :, 1] = np.sqrt(du * du + dv * dv) * 8 / max_flow + # phase layer + img[:, :, 2] = 8 - img[:, :, 1] + # clip to [0,1] + small_idx = img[:, :, 0:3] < 0 + large_idx = img[:, :, 0:3] > 1 + img[small_idx] = 0 + img[large_idx] = 1 + # convert to rgb + img = cl.hsv_to_rgb(img) + # remove invalid point + img[:, :, 0] = img[:, :, 0] * valid + img[:, :, 1] = img[:, :, 1] * valid + img[:, :, 2] = img[:, :, 2] * valid + # show + plt.imshow(img) + plt.show() + + return None + + +def read_flow(filename): + """ + read optical flow from Middlebury .flo file + :param filename: name of the flow file + :return: optical flow data in matrix + """ + f = open(filename, 'rb') + magic = np.fromfile(f, np.float32, count=1) + data2d = None + + if 202021.25 != magic: + print('Magic number incorrect. 
Invalid .flo file') + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + print("Reading %d x %d flo file" % (h, w)) + data2d = np.fromfile(f, np.float32, count=2 * w * h) + # reshape data into 3D array (columns, rows, channels) + data2d = np.resize(data2d, (h[0], w[0], 2)) + f.close() + return data2d + + +def read_flow_png(flow_file): + """ + Read optical flow from KITTI .png file + :param flow_file: name of the flow file + :return: optical flow data in matrix + """ + flow_object = png.Reader(filename=flow_file) + flow_direct = flow_object.asDirect() + flow_data = list(flow_direct[2]) + (w, h) = flow_direct[3]['size'] + flow = np.zeros((h, w, 3), dtype=np.float64) + for i in range(len(flow_data)): + flow[i, :, 0] = flow_data[i][0::3] + flow[i, :, 1] = flow_data[i][1::3] + flow[i, :, 2] = flow_data[i][2::3] + + invalid_idx = (flow[:, :, 2] == 0) + flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0 + flow[invalid_idx, 0] = 0 + flow[invalid_idx, 1] = 0 + return flow + + +def write_flow(flow, filename): + """ + write optical flow in Middlebury .flo format + :param flow: optical flow map + :param filename: optical flow file path to be saved + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = flow.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + flow.tofile(f) + f.close() + + +def segment_flow(flow): + h = flow.shape[0] + w = flow.shape[1] + u = flow[:, :, 0] + v = flow[:, :, 1] + + idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW)) + idx2 = (abs(u) == SMALLFLOW) + class0 = (v == 0) & (u == 0) + u[idx2] = 0.00001 + tan_value = v / u + + class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0) + class2 = (tan_value >= 1) & (u >= 0) & (v >= 0) + class3 = (tan_value < -1) & (u <= 0) & (v >= 0) + class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0) + class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0) + class7 = (tan_value < -1) & (u >= 0) & (v <= 0) + class6 = (tan_value >= 1) & (u <= 0) & (v <= 0) + class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0) + + seg = np.zeros((h, w)) + + seg[class1] = 1 + seg[class2] = 2 + seg[class3] = 3 + seg[class4] = 4 + seg[class5] = 5 + seg[class6] = 6 + seg[class7] = 7 + seg[class8] = 8 + seg[class0] = 0 + seg[idx] = 0 + + return seg + + +def flow_error(tu, tv, u, v): + """ + Calculate average end point error + :param tu: ground-truth horizontal flow map + :param tv: ground-truth vertical flow map + :param u: estimated horizontal flow map + :param v: estimated vertical flow map + :return: End point error of the estimated flow + """ + smallflow = 0.0 + ''' + stu = tu[bord+1:end-bord,bord+1:end-bord] + stv = tv[bord+1:end-bord,bord+1:end-bord] + su = u[bord+1:end-bord,bord+1:end-bord] + sv = v[bord+1:end-bord,bord+1:end-bord] + ''' + stu = tu[:] + stv = tv[:] + su = u[:] + sv = v[:] + + idxUnknow = (abs(stu) > UNKNOWN_FLOW_THRESH) | (abs(stv) > UNKNOWN_FLOW_THRESH) + stu[idxUnknow] = 0 + stv[idxUnknow] = 0 + su[idxUnknow] = 0 + sv[idxUnknow] = 0 + + ind2 = [(np.absolute(stu) > smallflow) | (np.absolute(stv) > smallflow)] + index_su = su[ind2] + index_sv = sv[ind2] + an = 1.0 / np.sqrt(index_su ** 2 + index_sv ** 2 + 1) + un = index_su * an + vn = index_sv * an + + index_stu = stu[ind2] + index_stv = stv[ind2] + tn = 1.0 / np.sqrt(index_stu ** 2 + index_stv ** 2 + 1) + tun = index_stu * tn + tvn = index_stv * tn + + ''' + 
angle = un * tun + vn * tvn + (an * tn) + index = [angle == 1.0] + angle[index] = 0.999 + ang = np.arccos(angle) + mang = np.mean(ang) + mang = mang * 180 / np.pi + ''' + + epe = np.sqrt((stu - su) ** 2 + (stv - sv) ** 2) + epe = epe[ind2] + mepe = np.mean(epe) + return mepe + + +def flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, 0] + v = flow[:, :, 1] + + maxu = -999. + maxv = -999. + minu = 999. + minv = 999. + + idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) + u[idxUnknow] = 0 + v[idxUnknow] = 0 + + maxu = max(maxu, np.max(u)) + minu = min(minu, np.min(u)) + + maxv = max(maxv, np.max(v)) + minv = min(minv, np.min(v)) + + rad = np.sqrt(u ** 2 + v ** 2) + maxrad = max(-1, np.max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. %.3f" % (maxrad, minu,maxu, minv, maxv)) + + u = u/(maxrad + np.finfo(float).eps) + v = v/(maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + img[idx] = 0 + + return np.uint8(img) + + +def tf_flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, :, 0] + v = flow[:, :, :, 1] + + maxu = tf.constant(-999.) + maxv = tf.constant(-999.) + minu = tf.constant(999.) + minv = tf.constant(999.) + + zeros = tf.zeros_like(u, dtype=tf.float32) + u = tf.where(tf.greater(u, UNKNOWN_FLOW_THRESH), zeros, u) + v = tf.where(tf.greater(v, UNKNOWN_FLOW_THRESH), zeros, v) + + rad = tf.sqrt(u ** 2 + v ** 2) + maxrad = tf.reduce_max(-1, tf.reduce_max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. 
%.3f" % (maxrad, minu, maxu, minv, maxv)) + + u = u / (maxrad + np.finfo(float).eps) + v = v / (maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + # idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + # img[idx] = 0 + + return np.uint8(img) + + +def evaluate_flow_file(gt, pred): + """ + evaluate the estimated optical flow end point error according to ground truth provided + :param gt: ground truth file path + :param pred: estimated optical flow file path + :return: end point error, float32 + """ + # Read flow files and calculate the errors + gt_flow = read_flow(gt) # ground truth flow + eva_flow = read_flow(pred) # predicted flow + # Calculate errors + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], eva_flow[:, :, 0], eva_flow[:, :, 1]) + return average_pe + + +def evaluate_flow(gt_flow, pred_flow): + """ + gt: ground-truth flow + pred: estimated flow + """ + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], pred_flow[:, :, 0], pred_flow[:, :, 1]) + return average_pe + + +""" +============== +Disparity Section +============== +""" + + +def read_disp_png(file_name): + """ + Read optical flow from KITTI .png file + :param file_name: name of the flow file + :return: optical flow data in matrix + """ + image_object = png.Reader(filename=file_name) + image_direct = image_object.asDirect() + image_data = list(image_direct[2]) + (w, h) = image_direct[3]['size'] + channel = len(image_data[0]) / w + flow = np.zeros((h, w, channel), dtype=np.uint16) + for i in range(len(image_data)): + for j in range(channel): + flow[i, :, j] = image_data[i][j::channel] + return flow[:, :, 0] / 256 + + +def disp_to_flowfile(disp, filename): + """ + Read KITTI disparity file in png format + :param disp: disparity matrix + :param filename: the flow file name to save + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = disp.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + empty_map = np.zeros((height, width), dtype=np.float32) + data = np.dstack((disp, empty_map)) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + data.tofile(f) + f.close() + + +""" +============== +Image Section +============== +""" + + +def read_image(filename): + """ + Read normal image of any format + :param filename: name of the image file + :return: image data in matrix uint8 type + """ + img = Image.open(filename) + im = np.array(img) + return im + + +def warp_image(im, flow): + """ + Use optical flow to warp image to the next + :param im: image to warp + :param flow: optical flow + :return: warped image + """ + from scipy import interpolate + image_height = im.shape[0] + image_width = im.shape[1] + flow_height = flow.shape[0] + flow_width = flow.shape[1] + n = image_height * image_width + (iy, ix) = np.mgrid[0:image_height, 0:image_width] + (fy, fx) = np.mgrid[0:flow_height, 0:flow_width] + fx += flow[:,:,0] + fy += flow[:,:,1] + mask = np.logical_or(fx <0 , fx > flow_width) + mask = np.logical_or(mask, fy < 0) + mask = np.logical_or(mask, fy > flow_height) + fx = np.minimum(np.maximum(fx, 0), flow_width) + fy = np.minimum(np.maximum(fy, 0), flow_height) + points = np.concatenate((ix.reshape(n,1), iy.reshape(n,1)), axis=1) + xi = np.concatenate((fx.reshape(n, 1), fy.reshape(n,1)), axis=1) + warp = np.zeros((image_height, image_width, im.shape[2])) + for i in range(im.shape[2]): + channel = im[:, :, i] + plt.imshow(channel, cmap='gray') + values = channel.reshape(n, 1) + new_channel = 
interpolate.griddata(points, values, xi, method='cubic') + new_channel = np.reshape(new_channel, [flow_height, flow_width]) + new_channel[mask] = 1 + warp[:, :, i] = new_channel.astype(np.uint8) + + return warp.astype(np.uint8) + + +""" +============== +Others +============== +""" + + +def scale_image(image, new_range): + """ + Linearly scale the image into desired range + :param image: input image + :param new_range: the new range to be aligned + :return: image normalized in new range + """ + min_val = np.min(image).astype(np.float32) + max_val = np.max(image).astype(np.float32) + min_val_new = np.array(min(new_range), dtype=np.float32) + max_val_new = np.array(max(new_range), dtype=np.float32) + scaled_image = (image - min_val) / (max_val - min_val) * (max_val_new - min_val_new) + min_val_new + return scaled_image.astype(np.uint8) + + +def compute_color(u, v): + """ + compute optical flow color map + :param u: optical flow horizontal map + :param v: optical flow vertical map + :return: optical flow in color code + """ + [h, w] = u.shape + img = np.zeros([h, w, 3]) + nanIdx = np.isnan(u) | np.isnan(v) + u[nanIdx] = 0 + v[nanIdx] = 0 + + colorwheel = make_color_wheel() + # ncols = np.size(colorwheel, 0) + ncols = colorwheel.shape[0] + + rad = np.sqrt(u**2+v**2) + + a = np.arctan2(-v, -u) / np.pi + + fk = (a+1) / 2 * (ncols - 1) + 1 + + k0 = np.floor(fk).astype(int) + + k1 = k0 + 1 + k1[k1 == ncols+1] = 1 + f = fk - k0 + + for i in range(0, np.size(colorwheel, 1)): + tmp = colorwheel[:, i] + col0 = tmp[k0-1] / 255 + col1 = tmp[k1-1] / 255 + col = (1-f) * col0 + f * col1 + + idx = rad <= 1 + col[idx] = 1-rad[idx]*(1-col[idx]) + notidx = np.logical_not(idx) + + col[notidx] *= 0.75 + img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) + + return img + + +def make_color_wheel(): + """ + Generate color wheel according Middlebury color code + :return: Color wheel + """ + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + + colorwheel = np.zeros([ncols, 3]) + + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) + col += RY + + # YG + colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) + colorwheel[col:col+YG, 1] = 255 + col += YG + + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) + col += GC + + # CB + colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) + colorwheel[col:col+CB, 2] = 255 + col += CB + + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) + col += + BM + + # MR + colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) + colorwheel[col:col+MR, 0] = 255 + + return colorwheel diff --git a/Codes/flownet2/src/flownet2/__init__.py b/Codes/flownet2/src/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet2/__init__.py diff --git a/Codes/flownet2/src/flownet2/flownet2.py b/Codes/flownet2/src/flownet2/flownet2.py new file mode 100644 index 0000000..d44ed10 --- /dev/null +++ b/Codes/flownet2/src/flownet2/flownet2.py @@ -0,0 +1,118 @@ +from ..net import Net, Mode +from ..flownet_css.flownet_css import FlowNetCSS +from ..flownet_sd.flownet_sd import FlowNetSD +from ..flow_warp import flow_warp +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import 
downsample +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNet2(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_css = FlowNetCSS(mode, debug) + self.net_sd = FlowNetSD(mode, debug) + super(FlowNet2, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNet2'): + # Forward pass through FlowNetCSS and FlowNetSD with weights frozen + net_css_predictions = self.net_css.model(inputs, training_schedule, trainable=True) + net_sd_predictions = self.net_sd.model(inputs, training_schedule, trainable=True) + + def ChannelNorm(tensor): + sq = tf.square(tensor) + r_sum = tf.reduce_sum(sq, keep_dims=True, axis=3) + return tf.sqrt(r_sum) + + sd_flow_norm = ChannelNorm(net_sd_predictions['flow']) + css_flow_norm = ChannelNorm(net_css_predictions['flow']) + + flow_warp_sd = flow_warp(inputs['input_b'], net_sd_predictions['flow']) + img_diff_sd = inputs['input_a'] - flow_warp_sd + img_diff_sd_norm = ChannelNorm(img_diff_sd) + + flow_warp_css = flow_warp(inputs['input_b'], net_css_predictions['flow']) + img_diff_css = inputs['input_a'] - flow_warp_css + img_diff_css_norm = ChannelNorm(img_diff_css) + + input_to_fusion = tf.concat([inputs['input_a'], + net_sd_predictions['flow'], + net_css_predictions['flow'], + sd_flow_norm, + css_flow_norm, + img_diff_sd_norm, + img_diff_css_norm], axis=3) + + # Fusion Network + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + fuse_conv0 = slim.conv2d(pad(input_to_fusion), 64, 3, scope='fuse_conv0') + fuse_conv1 = slim.conv2d(pad(fuse_conv0), 64, 3, stride=2, scope='fuse_conv1') + fuse_conv1_1 = slim.conv2d(pad(fuse_conv1), 128, 3, scope='fuse_conv1_1') + fuse_conv2 = slim.conv2d(pad(fuse_conv1_1), 128, 3, + stride=2, scope='fuse_conv2') + fuse_conv2_1 = slim.conv2d(pad(fuse_conv2), 128, 3, scope='fuse_conv2_1') + + predict_flow2 = slim.conv2d(pad(fuse_conv2_1), 2, 3, + scope='predict_flow2', + activation_fn=None) + fuse_deconv1 = antipad(slim.conv2d_transpose(fuse_conv2_1, 32, 4, + stride=2, + scope='fuse_deconv1')) + fuse_upsample_flow2to1 = antipad(slim.conv2d_transpose(predict_flow2, 2, 4, + stride=2, + scope='fuse_upsample_flow2to1', + activation_fn=None)) + concat1 = tf.concat([fuse_conv1_1, fuse_deconv1, + fuse_upsample_flow2to1], axis=3) + fuse_interconv1 = slim.conv2d(pad(concat1), 32, 3, + activation_fn=None, scope='fuse_interconv1') + + predict_flow1 = slim.conv2d(pad(fuse_interconv1), 2, 3, + scope='predict_flow1', + activation_fn=None) + fuse_deconv0 = antipad(slim.conv2d_transpose(concat1, 16, 4, + stride=2, + scope='fuse_deconv0')) + fuse_upsample_flow1to0 = antipad(slim.conv2d_transpose(predict_flow1, 2, 4, + stride=2, + scope='fuse_upsample_flow1to0', + activation_fn=None)) + concat0 = tf.concat([fuse_conv0, fuse_deconv0, fuse_upsample_flow1to0], axis=3) + fuse_interconv0 = slim.conv2d(pad(concat0), 16, 3, + activation_fn=None, scope='fuse_interconv0') + + predict_flow0 = slim.conv2d(pad(fuse_interconv0), 2, + 3, activation_fn=None, 
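+                                                # no activation: the 2 output channels are the raw fused (u, v) flow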
scope='predict_flow0') + + flow = tf.image.resize_bilinear( + predict_flow0, tf.stack([height, width]), align_corners=True) + print(predict_flow0) + print(flow) + return { + 'predict_flow0': predict_flow0, + 'flow': flow, + } + + def loss(self, flow, predictions): + # L2 loss between predict_flow0, true flow (weighted w/ 0.005) + predict_flow0 = predictions['predict_flow0'] + size = [predict_flow0.shape[1], predict_flow0.shape[2]] + downsampled_flow0 = downsample(flow, size) + loss = average_endpoint_error(downsampled_flow0, predict_flow0) + tf.losses.add_loss(loss) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet2/test.py b/Codes/flownet2/src/flownet2/test.py new file mode 100644 index 0000000..3177614 --- /dev/null +++ b/Codes/flownet2/src/flownet2/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet2 import FlowNet2 + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNet2(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNet2/flownet-2.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet2/train.py b/Codes/flownet2/src/flownet2/train.py new file mode 100644 index 0000000..40c028d --- /dev/null +++ b/Codes/flownet2/src/flownet2/train.py @@ -0,0 +1,24 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet2 import FlowNet2 + +# Create a new network +net = FlowNet2() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_2', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CSS and SD parts of network + checkpoints={ + './checkpoints/FlowNetCSS-ft-sd/flownet-CSS-ft-sd.ckpt-0': ('FlowNet2/FlowNetCSS', 'FlowNet2'), + './checkpoints/FlowNetSD/flownet-SD.ckpt-0': ('FlowNet2/FlowNetSD', 'FlowNet2') + } +) diff --git a/Codes/flownet2/src/flownet_c/__init__.py b/Codes/flownet2/src/flownet_c/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/__init__.py diff --git a/Codes/flownet2/src/flownet_c/flownet_c.py b/Codes/flownet2/src/flownet_c/flownet_c.py new file mode 100644 index 0000000..d333ee2 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/flownet_c.py @@ -0,0 +1,167 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..correlation import correlation +from ..downsample import downsample +import math +import tensorflow as tf +slim = 
tf.contrib.slim + + +class FlowNetC(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetC, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetC'): + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_a_1 = slim.conv2d(pad(inputs['input_a'], 3), 64, 7, scope='conv1') + conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5, scope='conv2') + conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5, scope='conv3') + + conv_b_1 = slim.conv2d(pad(inputs['input_b'], 3), + 64, 7, scope='conv1', reuse=True) + conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5, scope='conv2', reuse=True) + conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5, scope='conv3', reuse=True) + + # Compute cross correlation with leaky relu activation + cc = correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20) + cc_relu = LeakyReLU(cc) + + # Combine cross correlation results with convolution of feature map A + netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir') + # Concatenate along the channels axis + net = tf.concat([netA_conv, cc_relu], axis=3) + + conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + 
activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_a_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_c/test.py b/Codes/flownet2/src/flownet_c/test.py new file mode 100644 index 0000000..692f22d --- /dev/null +++ b/Codes/flownet2/src/flownet_c/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_c import FlowNetC + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetC(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + 
required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_c/train.py b/Codes/flownet2/src/flownet_c/train.py new file mode 100644 index 0000000..9296ac7 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_c import FlowNetC + +# Create a new network +net = FlowNetC() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_c', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/flownet_cs/__init__.py b/Codes/flownet2/src/flownet_cs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/__init__.py diff --git a/Codes/flownet2/src/flownet_cs/flownet_cs.py b/Codes/flownet2/src/flownet_cs/flownet_cs.py new file mode 100644 index 0000000..aeaea47 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/flownet_cs.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_c.flownet_c import FlowNetC +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_c = FlowNetC(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCS'): + # Forward pass through FlowNetC with weights frozen + net_c_predictions = self.net_c.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_c_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_c_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git a/Codes/flownet2/src/flownet_cs/test.py b/Codes/flownet2/src/flownet_cs/test.py new file mode 100644 index 0000000..ae00ff4 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_cs import FlowNetCS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCS/flownet-CS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + 
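+        # out_path must be an existing directory; net.test() writes a uniquely
+        # named, color-coded flow image (flow-<uuid>.png) into it.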
out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_cs/train.py b/Codes/flownet2/src/flownet_cs/train.py new file mode 100644 index 0000000..9376132 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/train.py @@ -0,0 +1,21 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_cs import FlowNetCS + +# Create a new network +net = FlowNetCS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_cs', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for C part of network + checkpoints={'./checkpoints/FlowNetC/flownet-C.ckpt-0': ('FlowNetCS/FlowNetC', 'FlowNetCS')} +) diff --git a/Codes/flownet2/src/flownet_css/__init__.py b/Codes/flownet2/src/flownet_css/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/__init__.py diff --git a/Codes/flownet2/src/flownet_css/flownet_css.py b/Codes/flownet2/src/flownet_css/flownet_css.py new file mode 100644 index 0000000..93d9db2 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/flownet_css.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_cs.flownet_cs import FlowNetCS +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCSS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_cs = FlowNetCS(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCSS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCSS'): + # Forward pass through FlowNetCS with weights frozen + net_cs_predictions = self.net_cs.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_cs_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_cs_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git 
a/Codes/flownet2/src/flownet_css/test.py b/Codes/flownet2/src/flownet_css/test.py new file mode 100644 index 0000000..9d1249e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_css import FlowNetCSS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCSS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCSS/flownet-CSS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_css/train.py b/Codes/flownet2/src/flownet_css/train.py new file mode 100644 index 0000000..2964f3e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/train.py @@ -0,0 +1,22 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_css import FlowNetCSS + +# Create a new network +net = FlowNetCSS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_css', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CS part of network + checkpoints={ + './checkpoints/FlowNetCS/flownet-CS.ckpt-0': ('FlowNetCSS/FlowNetCS', 'FlowNetCSS')} +) diff --git a/Codes/flownet2/src/flownet_s/__init__.py b/Codes/flownet2/src/flownet_s/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/__init__.py diff --git a/Codes/flownet2/src/flownet_s/flownet_s.py b/Codes/flownet2/src/flownet_s/flownet_s.py new file mode 100644 index 0000000..f6704b1 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/flownet_s.py @@ -0,0 +1,161 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + stacked = False + with tf.variable_scope('FlowNetS'): + if 'warped' in inputs and 'flow' in inputs and 'brightness_error' in inputs: + stacked = True + concat_inputs = tf.concat([inputs['input_a'], + inputs['input_b'], + inputs['warped'], + inputs['flow'], + inputs['brightness_error']], axis=3) + else: + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight 
initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_1 = slim.conv2d(pad(concat_inputs, 3), 64, 7, scope='conv1') + conv_2 = slim.conv2d(pad(conv_1, 2), 128, 5, scope='conv2') + conv_3 = slim.conv2d(pad(conv_2, 2), 256, 5, scope='conv3') + + conv3_1 = slim.conv2d(pad(conv_3), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + 
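+        # The ground truth above is scaled by 0.05 so that it matches the network's
+        # internal flow scale (the model multiplies predict_flow2 by 20 before
+        # resizing). Each side output is penalised at its own resolution and the
+        # five per-scale losses are combined below with weights
+        # 0.32 / 0.08 / 0.02 / 0.01 / 0.005, from the coarsest (predict_flow6)
+        # to the finest (predict_flow2) scale.
+        # A minimal sketch of what ..utils.average_endpoint_error (not shown in
+        # this hunk) is assumed to compute per scale, namely the mean L2 distance
+        # between ground-truth and predicted flow vectors:
+        #
+        #     epe = tf.reduce_mean(
+        #         tf.sqrt(tf.reduce_sum(tf.square(gt_flow - pred_flow), axis=3)))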
predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_s/test.py b/Codes/flownet2/src/flownet_s/test.py new file mode 100644 index 0000000..ae1b2f3 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_s import FlowNetS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetS/flownet-S.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_s/train.py b/Codes/flownet2/src/flownet_s/train.py new file mode 100644 index 0000000..13a792a --- /dev/null +++ b/Codes/flownet2/src/flownet_s/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_s import FlowNetS + +# Create a new network +net = FlowNetS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_s_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff 
--git a/Codes/flownet2/src/flownet_sd/__init__.py b/Codes/flownet2/src/flownet_sd/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/__init__.py diff --git a/Codes/flownet2/src/flownet_sd/flownet_sd.py b/Codes/flownet2/src/flownet_sd/flownet_sd.py new file mode 100644 index 0000000..2f5c9e4 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/flownet_sd.py @@ -0,0 +1,160 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +# from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetSD(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetSD, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True, reuse=None): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetSD', reuse=reuse): + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + conv0 = slim.conv2d(pad(concat_inputs), 64, 3, scope='conv0') + conv1 = slim.conv2d(pad(conv0), 64, 3, stride=2, scope='conv1') + conv1_1 = slim.conv2d(pad(conv1), 128, 3, scope='conv1_1') + conv2 = slim.conv2d(pad(conv1_1), 128, 3, stride=2, scope='conv2') + conv2_1 = slim.conv2d(pad(conv2), 128, 3, scope='conv2_1') + conv3 = slim.conv2d(pad(conv2_1), 256, 3, stride=2, scope='conv3') + conv3_1 = slim.conv2d(pad(conv3), 256, 3, scope='conv3_1') + conv4 = slim.conv2d(pad(conv3_1), 512, 3, stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), 512, 3, scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), 512, 3, stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), 512, 3, scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + interconv5 = slim.conv2d(pad(concat5), 512, 3, + activation_fn=None, scope='interconv5') + + predict_flow5 = slim.conv2d(pad(interconv5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + interconv4 = slim.conv2d(pad(concat4), 256, 3, + activation_fn=None, scope='interconv4') + + predict_flow4 = slim.conv2d(pad(interconv4), 2, 3, + scope='predict_flow4', + 
activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + interconv3 = slim.conv2d(pad(concat3), 128, 3, + activation_fn=None, scope='interconv3') + + predict_flow3 = slim.conv2d(pad(interconv3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv2, deconv2, upsample_flow3to2], axis=3) + interconv2 = slim.conv2d(pad(concat2), 64, 3, + activation_fn=None, scope='interconv2') + + predict_flow2 = slim.conv2d(pad(interconv2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 0.05 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + # def loss(self, flow, predictions): + # flow = flow * 20.0 + # + # losses = [] + # INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + # + # # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + # predict_flow6 = predictions['predict_flow6'] + # size = [predict_flow6.shape[1], predict_flow6.shape[2]] + # downsampled_flow6 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + # + # # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + # predict_flow5 = predictions['predict_flow5'] + # size = [predict_flow5.shape[1], predict_flow5.shape[2]] + # downsampled_flow5 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + # + # # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + # predict_flow4 = predictions['predict_flow4'] + # size = [predict_flow4.shape[1], predict_flow4.shape[2]] + # downsampled_flow4 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + # + # # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + # predict_flow3 = predictions['predict_flow3'] + # size = [predict_flow3.shape[1], predict_flow3.shape[2]] + # downsampled_flow3 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + # + # # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + # predict_flow2 = predictions['predict_flow2'] + # size = [predict_flow2.shape[1], predict_flow2.shape[2]] + # downsampled_flow2 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + # + # loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + # + # # Return the 'total' loss: loss fns + regularization terms defined in the model + # return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_sd/test.py b/Codes/flownet2/src/flownet_sd/test.py new file mode 100644 index 0000000..b2ac285 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/test.py @@ -0,0 +1,51 @@ +import 
argparse +import os +from ..net import Mode +from .flownet_sd import FlowNetSD + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetSD(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetSD/flownet-SD.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_sd/train.py b/Codes/flownet2/src/flownet_sd/train.py new file mode 100644 index 0000000..86c64e5 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_sd import FlowNetSD + +# Create a new network +net = FlowNetSD() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_sd_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/net.py b/Codes/flownet2/src/net.py new file mode 100644 index 0000000..43b2193 --- /dev/null +++ b/Codes/flownet2/src/net.py @@ -0,0 +1,177 @@ +import abc +from enum import Enum +import os +import tensorflow as tf +from .flowlib import flow_to_image, write_flow +import numpy as np +# from scipy.misc import imread, imsave, imresize +import cv2 +import uuid +from .training_schedules import LONG_SCHEDULE +slim = tf.contrib.slim + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +class Mode(Enum): + TRAIN = 1 + TEST = 2 + + +class Net(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.global_step = slim.get_or_create_global_step() + self.mode = mode + self.debug = debug + + @abc.abstractmethod + def model(self, inputs, training_schedule, trainable=True): + """ + Defines the model and returns a tuple of Tensors needed for calculating the loss. + """ + return + + @abc.abstractmethod + def loss(self, **kwargs): + """ + Accepts prediction Tensors from the output of `model`. + Returns a single Tensor representing the total loss of the model. 
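+
+        In the concrete networks of this repository this is typically a weighted,
+        multi-scale average endpoint error registered through tf.losses
+        (see e.g. FlowNetS.loss). Minimal usage sketch, assuming matching image
+        inputs and a ground-truth flow tensor:
+
+            predictions = net.model(inputs, training_schedule)
+            total_loss = net.loss(flow, predictions)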
+ """ + return + """ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_6.png \ + --input_b /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_7.png \ + --out ./ + python -m src.flownet_sd.test --input_a 006.png --input_b 007.png --out ./ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/ped1/frames/testing/01/006.jpg \ + --input_b /home/liuwen/ssd/videogan/ped1/frames/testing/01/007.jpg \ + --out ./ + """ + def test(self, checkpoint, input_a_path, input_b_path, out_path, save_image=True, save_flo=False): + input_a = cv2.imread(input_a_path) + input_b = cv2.imread(input_b_path) + + input_a = cv2.resize(input_a, (512, 384)) + input_b = cv2.resize(input_b, (512, 384)) + print(input_a.shape, input_b.shape) + + # Convert from RGB -> BGR + # input_a = input_a[..., [2, 1, 0]] + # input_b = input_b[..., [2, 1, 0]] + + # Scale from [0, 255] -> [0.0, 1.0] if needed + if input_a.max() > 1.0: + input_a = input_a / 255.0 + if input_b.max() > 1.0: + input_b = input_b / 255.0 + + # TODO: This is a hack, we should get rid of this + training_schedule = LONG_SCHEDULE + + inputs = { + 'input_a': tf.expand_dims(tf.constant(input_a, dtype=tf.float32), 0), + 'input_b': tf.expand_dims(tf.constant(input_b, dtype=tf.float32), 0), + } + predictions = self.model(inputs, training_schedule) + pred_flow = predictions['flow'] + + saver = tf.train.Saver() + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + saver.restore(sess, checkpoint) + pred_flow = sess.run(pred_flow)[0, :, :, :] + + np.save('temporal_ped1', pred_flow) + + unique_name = 'flow-' + str(uuid.uuid4()) + if save_image: + flow_img = flow_to_image(pred_flow) + full_out_path = os.path.join(out_path, unique_name + '.png') + cv2.imwrite(full_out_path, flow_img) + + if save_flo: + full_out_path = os.path.join(out_path, unique_name + '.flo') + write_flow(pred_flow, full_out_path) + + def train(self, log_dir, training_schedule, input_a, input_b, flow, checkpoints=None): + tf.summary.image("image_a", input_a, max_outputs=2) + tf.summary.image("image_b", input_b, max_outputs=2) + + self.learning_rate = tf.train.piecewise_constant( + self.global_step, + [tf.cast(v, tf.int64) for v in training_schedule['step_values']], + training_schedule['learning_rates']) + + optimizer = tf.train.AdamOptimizer( + self.learning_rate, + training_schedule['momentum'], + training_schedule['momentum2']) + + inputs = { + 'input_a': input_a, + 'input_b': input_b, + } + predictions = self.model(inputs, training_schedule) + total_loss = self.loss(flow, predictions) + tf.summary.scalar('loss', total_loss) + + if checkpoints: + for (checkpoint_path, (scope, new_scope)) in checkpoints.iteritems(): + variables_to_restore = slim.get_variables(scope=scope) + renamed_variables = { + var.op.name.split(new_scope + '/')[1]: var + for var in variables_to_restore + } + restorer = tf.train.Saver(renamed_variables) + with tf.Session() as sess: + restorer.restore(sess, checkpoint_path) + + # Show the generated flow in TensorBoard + if 'flow' in predictions: + pred_flow_0 = predictions['flow'][0, :, :, :] + pred_flow_0 = tf.py_func(flow_to_image, [pred_flow_0], tf.uint8) + pred_flow_1 = predictions['flow'][1, :, :, :] + pred_flow_1 = tf.py_func(flow_to_image, [pred_flow_1], tf.uint8) + pred_flow_img = tf.stack([pred_flow_0, pred_flow_1], 0) + tf.summary.image('pred_flow', pred_flow_img, max_outputs=2) + + 
true_flow_0 = flow[0, :, :, :] + true_flow_0 = tf.py_func(flow_to_image, [true_flow_0], tf.uint8) + true_flow_1 = flow[1, :, :, :] + true_flow_1 = tf.py_func(flow_to_image, [true_flow_1], tf.uint8) + true_flow_img = tf.stack([true_flow_0, true_flow_1], 0) + tf.summary.image('true_flow', true_flow_img, max_outputs=2) + + train_op = slim.learning.create_train_op( + total_loss, + optimizer, + summarize_gradients=True) + + if self.debug: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf.train.start_queue_runners(sess) + slim.learning.train_step( + sess, + train_op, + self.global_step, + { + 'should_trace': tf.constant(1), + 'should_log': tf.constant(1), + 'logdir': log_dir + '/debug', + } + ) + else: + slim.learning.train( + train_op, + log_dir, + # session_config=tf.ConfigProto(allow_soft_placement=True), + global_step=self.global_step, + save_summaries_secs=60, + number_of_steps=training_schedule['max_iter'] + ) diff --git a/Codes/flownet2/src/ops/build/.gitkeep b/Codes/flownet2/src/ops/build/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/ops/build/.gitkeep diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc new file mode 100644 index 0000000..4e92f45 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc @@ -0,0 +1,160 @@ +#define EIGEN_USE_THREADS + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class CorrelationGradKernel : public OpKernel { + public: + explicit CorrelationGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and verify their dimensions + const Tensor& gradients_t = ctx->input(0); + const Tensor& input_a_t = ctx->input(1); + const Tensor& input_b_t = ctx->input(2); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input + const int batch_size = input_a_t.dim_size(0); + const int in_height = input_a_t.dim_size(1); + const int in_width = input_a_t.dim_size(2); + const int in_channels = input_a_t.dim_size(3); + const int in_count_per_sample = in_height * in_width * in_channels; + const int padded_height = in_height + 2 * pad; + const int padded_width = in_width + 2 * pad; + + // The size of unreachable border region on each side + const int kernel_radius = (kernel_size - 1) / 2; + const int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + const int out_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + const int out_width = ceil((float)(padded_width - border_size * 
2) / (float)stride_1); + + const int neighborhood_grid_radius = max_displacement / stride_2; + const int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + const int out_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the outputs + Tensor *output_a_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_a_t.shape(), &output_a_gradient_t)); + Tensor *output_b_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, input_b_t.shape(), &output_b_gradient_t)); + + // Get the tensors + auto gradients = gradients_t.tensor<float, 4>(); + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output_a_gradient = output_a_gradient_t->tensor<float, 4>(); + auto output_b_gradient = output_b_gradient_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_b.data()); + + CorrelationGradA(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_b.data(), + gradients.data(), + output_a_gradient.data()); + + CorrelationGradB(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_a.data(), + gradients.data(), + output_b_gradient.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("CorrelationGrad") + .Device(DEVICE_GPU), + CorrelationGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc new file mode 100644 index 0000000..19e3a40 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc @@ -0,0 +1,262 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define ROUND_OFF 50000 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void 
CorrelateDataBackward0(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_a_gradient, + const float *input_b, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + + // Get X,Y ranges and clamp + // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + // We add round_off before_s1 the int division and subtract round_off after + // it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + int ymin = (y - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + + // Same here: + int xmax = (x - max_displacement + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement + round_off_s1) / stride_1 - round_off; + + float sum = 0; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get input_b data: + int s2o = stride_2 * o; + int s2p = stride_2 * p; + int idx_input_b = ((item * padded_in_height + (y + s2p)) * padded_in_width + (x + s2o)) * + in_channels + k; + float input_b_tmp = input_b[idx_input_b]; // input_b[x+s2o,y+s2p,k] + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + // gradient[x,y,o,p] + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_b_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_a_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_a_gradient[input_a_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +__global__ void CorrelateDataBackward1(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_b_gradient, + const float *input_a, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + 
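/* The ROUND_OFF arithmetic used below implements ceil division for possibly
   negative numerators with C's truncating integer division:
     ceil(a / s) == (a + R*s + s - 1) / s - R   for s > 0 and a > -R*s
   e.g. a = -3, s = 2, R = 50000:
     (-3 + 100000 + 2 - 1) / 2 - 50000 = 49999 - 50000 = -1 == ceil(-3 / 2). */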
+ // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + float sum = 0; + + // Height (y) + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + // Width (x) + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + int s2o = stride_2 * o; + int s2p = stride_2 * p; + + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off + // after it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride_1 + + 1 - round_off; + int ymin = (y - 2 * kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride_1 + + 1 - round_off; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + // Same here: + int xmax = (x - max_displacement - s2o + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement - s2p + round_off_s1) / stride_1 - round_off; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + // Get input_a data: + int idx_input_a = ((item * padded_in_height + (y - s2p)) * padded_in_width + (x - s2o)) * + in_channels + k; + float input_a_tmp = input_a[idx_input_a]; + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_a_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_b_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_b_gradient[input_b_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, // h * w * ch + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward0 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_a_gradient, input_b, gradient); + } +} + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const 
int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward1 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_b_gradient, input_a, gradient); + } +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc new file mode 100644 index 0000000..f8a5193 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc @@ -0,0 +1,137 @@ +#define EIGEN_USE_THREADS + +#include <utility> + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<typename Device> +class CorrelationKernel : public OpKernel { + public: + explicit CorrelationKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input (already padded) + int batch_size = input_a_t.dim_size(0); + int input_height = input_a_t.dim_size(1); + int input_width = input_a_t.dim_size(2); + int input_channels = input_a_t.dim_size(3); + int padded_height = input_height + 2 * pad; + int padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int output_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int output_width = ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + OP_REQUIRES(ctx, output_height >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input height.")); + OP_REQUIRES(ctx, output_width >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input width.")); + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = 
neighborhood_grid_radius * 2 + 1; + int output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ batch_size, output_height, output_width, output_channels }), + &output_t)); + + // Get the tensors + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_b.data()); + + // Perform cross correlation + Correlation(ctx->eigen_device<Device>(), + padded_input_a.data(), + padded_input_b.data(), + batch_size, + output_height, + output_width, + output_channels, + output_height * output_width * output_channels, + padded_height, + padded_width, + input_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + kernel_size, + stride_1, + stride_2, + output.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("Correlation") + .Device(DEVICE_GPU), + CorrelationKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc new file mode 100644 index 0000000..c63e489 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc @@ -0,0 +1,153 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void CorrelateData(int batch_size, + int out_width, + int out_height, + int out_channels, + int out_count, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + int in_width_padded, + int in_height_padded, + int in_channels, + const float *input_a, + const float *input_b, + float *output) { + extern __shared__ char patch_data_char[]; + + float *patch_data = (float *)patch_data_char; + + // First (upper left) position of kernel upper-left corner in current center + // position of neighborhood in image 1 + int x1 = blockIdx.x * stride_1 + max_displacement; + int y1 = 
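/* Launch layout for the forward kernel: the grid is
   (out_width, out_height, batch_size), so each block owns one output pixel, and
   the single 32-thread warp per block strides over the input channels. The
   kernel_size^2 * in_channels patch from image A is staged once in dynamic
   shared memory and reused for every one of the neighborhood_grid_width^2
   displacement channels; thread 0 reduces the per-thread partial sums and
   writes the normalised dot product for each output channel. */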
blockIdx.y * stride_1 + max_displacement; + int item = blockIdx.z; + int ch_off = threadIdx.x; + + // Load 3D patch into shared shared memory + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idx1 = ((item * in_height_padded + y1 + j) * in_width_padded + x1 + i) * + in_channels + ch; + int idxPatchData = ji_off + ch; + patch_data[idxPatchData] = input_a[idx1]; + } + } + } + + __syncthreads(); + + __shared__ float sum[WARPS_PER_BLOCK * THREADS_PER_WARP]; + + // Compute correlation + for (int out_channel = 0; out_channel < out_channels; out_channel++) { + sum[ch_off] = 0; + + int s2o = (out_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int s2p = (out_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int x2 = x1 + s2o; + int y2 = y1 + s2p; + + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idxPatchData = ji_off + ch; + int idx2 = ((item * in_height_padded + y2 + j) * in_width_padded + x2 + i) * + in_channels + ch; + + sum[ch_off] += patch_data[idxPatchData] * input_b[idx2]; + } + } + } + + __syncthreads(); + + if (ch_off == 0) { + float total_sum = 0; + + for (int idx = 0; idx < WARPS_PER_BLOCK * THREADS_PER_WARP; idx++) { + total_sum += sum[idx]; + } + const int sumelems = kernel_size * kernel_size * in_channels; + const int index = (blockIdx.y * out_width + blockIdx.x) * out_channels + out_channel; + + /* from Caffe: const int index = ((out_channel * out_height + + blockIdx.y) * out_width) + blockIdx.x; */ + output[index + item * out_count] = total_sum / (float)sumelems; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + // n = 0 + // caffe: ((k * H + h) * W + w) + n * K * H * W + // tf: (h * W + w) * K + k + n * H * W * K + } + } +} + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output) { + dim3 totalBlocksCorr(out_width, out_height, batch_size); + dim3 threadsPerBlock(THREADS_PER_WARP *WARPS_PER_BLOCK); + const int shared_memory_per_block = (kernel_size * kernel_size) * in_channels; + + CorrelateData << < totalBlocksCorr, threadsPerBlock, shared_memory_per_block * sizeof(float), + device.stream() >> > ( + batch_size, out_width, out_height, out_channels, out_count, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + kernel_size, stride_1, stride_2, in_width_padded, in_height_padded, in_channels, + input_a, input_b, output); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.h b/Codes/flownet2/src/ops/correlation/correlation_kernel.h new file mode 100644 index 0000000..a1dfb62 --- /dev/null +++ 
b/Codes/flownet2/src/ops/correlation/correlation_kernel.h @@ -0,0 +1,77 @@ +#ifndef FLOWNET_CORRELATION_H_ +#define FLOWNET_CORRELATION_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output); + + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient); + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient); +} // end namespace tensorflow + +#endif // FLOWNET_CORRELATION_H_ diff --git a/Codes/flownet2/src/ops/correlation/correlation_op.cc b/Codes/flownet2/src/ops/correlation/correlation_op.cc new file mode 100644 index 0000000..4f420f0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_op.cc @@ -0,0 +1,83 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +Status SetOutput(InferenceContext *c) { + ShapeHandle input_a, input_b, input; + + // Get shapes of both inputs and verify they are rank 4 + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_b)); + + // Verify inputs are same dimensions + TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input)); + + // Get the attributes + int kernel_size, max_displacement, stride_1, stride_2, pad; + TF_RETURN_IF_ERROR(c->GetAttr("kernel_size", &kernel_size)); + TF_RETURN_IF_ERROR(c->GetAttr("max_displacement", &max_displacement)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_1", &stride_1)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_2", &stride_2)); + TF_RETURN_IF_ERROR(c->GetAttr("pad", &pad)); + + // Get dimensions of input (already padded) + int64 batch = c->Value(c->Dim(input, 0)); + int64 input_height = c->Value(c->Dim(input, 1)); + int64 input_width = c->Value(c->Dim(input, 2)); + int64 padded_height = 
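/* This shape function repeats the geometry computed inside the kernel:
     output = [batch,
               ceil((H + 2*pad - 2*(max_displacement + kernel_radius)) / stride_1),
               ceil((W + 2*pad - 2*(max_displacement + kernel_radius)) / stride_1),
               (2 * (max_displacement / stride_2) + 1)^2]
   For the TODO below, one hedged sketch of the missing check (not in the
   original) would mirror the OP_REQUIRES in correlation_kernel.cc:
     if (output_height < 1 || output_width < 1)
       return errors::InvalidArgument(
           "Neighborhood and kernel don't fit in input dimensions."); */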
input_height + 2 * pad; + int64 padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int64 output_height = (int64)ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int64 output_width = (int64)ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + // TODO: Verify output size >= 1 + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + int64 output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Set output shape + c->set_output(0, c->MakeShape({ batch, output_height, output_width, output_channels })); + return Status::OK(); +} + +REGISTER_OP("Correlation") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("output: float32") +.SetShapeFn(SetOutput); + +REGISTER_OP("CorrelationGrad") +.Input("gradients: float32") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("backprops_a: float32") +.Output("backprops_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Output gradients should be the same dimensions as the inputs + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &out)); + c->set_output(0, out); + c->set_output(1, out); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/pad.cu.cc b/Codes/flownet2/src/ops/correlation/pad.cu.cc new file mode 100644 index 0000000..0b6c93d --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.cu.cc @@ -0,0 +1,76 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "pad.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void PadData( + const float *in, + int in_widthheight, + int in_width, + int in_height, + int out_width, + int out_height, + int channels, + int padding, + float *out) { + int xy = blockIdx.x * blockDim.x + threadIdx.x; + + int x = xy % in_width; + int y = xy / in_width; + int ch = blockIdx.y; + int n = blockIdx.z; + + if (xy >= in_widthheight) { + out[((n * out_height + y) * out_width + x) * channels + ch] = 0.0; + return; + } + + float value = in[((n * in_height + y) * in_width + x) * channels + ch]; + + __syncthreads(); + + int xpad = x + padding; + int ypad = y + padding; + + out[((n * out_height + ypad) * out_width + xpad) * channels + ch] = value; +} + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output) { + int in_widthheight = input_width * input_height; + int threads_per_block = 16; + dim3 totalBlocks((in_widthheight - 1) / threads_per_block + 1, input_channels, batch_size); + + cudaMemset(output, 0, batch_size * output_height * output_width * input_channels * sizeof(float)); + + int padding = (output_height - input_height) / 2; + + // LAUNCH KERNEL + PadData << < totalBlocks, threads_per_block, 0, device.stream() >> > ( + input, + in_widthheight, + input_width, + input_height, + output_width, + 
output_height, + input_channels, + padding, + output); +} +} +#endif // if GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/pad.h b/Codes/flownet2/src/ops/correlation/pad.h new file mode 100644 index 0000000..afb4df0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.h @@ -0,0 +1,20 @@ +#ifndef FLOWNET_PAD_H_ +#define FLOWNET_PAD_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output); +} // end namespace tensorflow + +#endif // ifndef FLOWNET_PAD_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc new file mode 100644 index 0000000..eefe247 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc @@ -0,0 +1,47 @@ +#define EIGEN_USE_THREADS + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +class DownsampleKernel : public OpKernel { + public: + explicit DownsampleKernel(OpKernelConstruction* ctx) : OpKernel(ctx) { + // Get the size [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("size", &size_)); + OP_REQUIRES(ctx, size_.size() == 2, errors::InvalidArgument("size must be 2 dimensions")); + } + + void Compute(OpKernelContext* ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_t = ctx->input(0); + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + + // Allocate the memory for the output + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({input_t.dim_size(0), size_[0], size_[1], input_t.dim_size(3)}), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + Downsample(ctx->eigen_gpu_device(), input, output); + } + + private: + std::vector<int32> size_; +}; + +REGISTER_KERNEL_BUILDER(Name("Downsample") + .Device(DEVICE_GPU), + DownsampleKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.h b/Codes/flownet2/src/ops/downsample/downsample_kernel.h new file mode 100644 index 0000000..bcc4e3f --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.h @@ -0,0 +1,18 @@ +#ifndef FLOWNET_DOWNSAMPLE_H_ +#define FLOWNET_DOWNSAMPLE_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output); + +} // end namespace tensorflow + +#endif // FLOWNET_DOWNSAMPLE_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc new file mode 100644 index 0000000..b7629a0 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc @@ -0,0 +1,108 
@@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define CUDART_NAN_F __int_as_float(0x7fffffff) + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ void DownsampleKernel( + const int32 nthreads, + const float* input_ptr, + float* output_ptr, + const int in_width, + const int in_height, + const int out_width, + const int out_height, + const int channels, + const float width_scale, + const float height_scale, + const int wradius, + const int hradius) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int c = index % channels; + const int destx = (index / channels) % out_width; + const int desty = (index / channels / out_width) % out_height; + const int n = (index / channels / out_width) / out_height; + + const float srcx = ((float)destx / (float)(out_width - 1)) * (float)(in_width - 1); + const float srcy = ((float)desty / (float)(out_height - 1)) * (float)(in_height - 1); + + const int isrcx = round(srcx); + const int isrcy = round(srcy); + + float accum_value = 0; + float accum_weight = 0; + float accum_nan = 0; + + for (int dy = -hradius; dy <= hradius; dy++) { + int yoff = isrcy + dy; + // + for (int dx = -wradius; dx <= wradius; dx++) { + int xoff = isrcx + dx; + + if (xoff >= 0 && yoff >= 0 && xoff < in_width && yoff < in_height) { + int idx = ((n * in_height + yoff) * in_width + xoff) * channels + c; + float sample = input_ptr[idx]; + float weight = fmaxf(0.0f, 1.0f - (fabsf((float)xoff - srcx) / width_scale)) + * fmaxf(0.0f, 1.0f - (fabsf((float)yoff - srcy) / height_scale)); + if (sample != sample) { // isnan + accum_nan += weight; + sample = 0; + weight = 0; + } + accum_value += sample * weight; + accum_weight += weight; + } + } + } + + if (accum_nan / accum_weight > 0.5) { + output_ptr[index] = CUDART_NAN_F; + } else { + output_ptr[index] = accum_value / accum_weight; + } + } +} + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int out_channels = output.dimension(3); + const int total_count = batch_size * out_height * out_width * out_channels; + + const int in_height = input.dimension(1); + const int in_width = input.dimension(2); + + const float width_scale = (float)(in_width - 1) / (float)(out_width - 1); + const float height_scale = (float)(in_height - 1) / (float)(out_height - 1); + + const int wradius = ceil(width_scale); + const int hradius = ceil(height_scale); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, device); + DownsampleKernel<<<config.block_count, config.thread_per_block, 0, + device.stream()>>>(total_count, input.data(), output.data(), + in_width, in_height, out_width, out_height, out_channels, + width_scale, height_scale, wradius, hradius); + return device.ok(); +} + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/downsample/downsample_op.cc b/Codes/flownet2/src/ops/downsample/downsample_op.cc new file mode 100644 index 0000000..6980dc7 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_op.cc @@ -0,0 +1,30 @@ 
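// The shape function in this file pins the output to
// [batch, size[0], size[1], depth]. Note that the GetAttr("size", ...) status
// is dropped; a more defensive variant (a sketch, not in the original) would be
//   TF_RETURN_IF_ERROR(c->GetAttr("size", &size_));
// so a missing or malformed attribute surfaces as an error instead of
// indexing an empty vector.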
+#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> size_; + c->GetAttr("size", &size_); + DimensionHandle height = c->MakeDim(size_[0]); + DimensionHandle width = c->MakeDim(size_[1]); + c->set_output(0, c->MakeShape({batch, height, width, depth})); + return Status::OK(); +} + +REGISTER_OP("Downsample") + .Input("input: float32") + .Attr("size: list(int) >= 2") + .Output("output: float32") + .SetShapeFn(SetOutputToSizedImage); + +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc new file mode 100644 index 0000000..b5d9602 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc @@ -0,0 +1,48 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpKernel : public OpKernel { + public: + explicit FlowWarpKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& input_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + input_t.dim_size(0) == flow_t.dim_size(0) && input_t.dim_size( + 1) == flow_t.dim_size(1) && input_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_t.shape(), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + FlowWarp(ctx->eigen_gpu_device(), input, flow, output); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarp") + .Device(DEVICE_GPU), + FlowWarpKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc new file mode 100644 index 0000000..2007151 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc @@ -0,0 +1,130 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_warp.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define RA_TILE 32 +#define RA_ROWS 8 + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpKernel( + const float 
*image, + const float *flow, + float *warped, + const int batch_size, + const int channels, + const int cblocks, + const int width, + const int wblocks, + const int height, + const int width_height) { + int y = blockIdx.y; + int n = blockIdx.z; + + __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X]; + __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1]; + + int x; + int c; + + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((threadIdx.y == 0) && (x < width)) { + const int idx = ((n * height + y) * width + x) * 2; + x2_buf[threadIdx.x] = float(x) + flow[idx]; + y2_buf[threadIdx.x] = float(y) + flow[idx + 1]; + } + + __syncthreads(); + + float x2 = x2_buf[threadIdx.y]; + float y2 = y2_buf[threadIdx.y]; + + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels; + int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels; + int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels; + int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels; + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + float coeffTL = (1 - alpha) * (1 - beta); + float coeffTR = alpha * (1 - beta); + float coeffBL = (1 - alpha) * beta; + float coeffBR = alpha * beta; + + for (int cb = 0; cb < cblocks; cb++) { + __syncthreads(); + + buffer[threadIdx.y][threadIdx.x] = 0.0; + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.x; + + if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) { + buffer[threadIdx.y][threadIdx.x] = // buffer [x][c] + coeffTL * image[off_TL + c] + + coeffTR * image[off_TR + c] + + coeffBL * image[off_BL + c] + + coeffBR * image[off_BR + c]; + } + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.y; + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((c < channels) && (x < width)) { + warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y]; + } + } +} + +void FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + const int width_height = width * height; + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, FW_TILE_C); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + input.data(), + flow.data(), + output.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.h b/Codes/flownet2/src/ops/flow_warp/flow_warp.h new file mode 100644 index 0000000..2780316 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.h @@ -0,0 +1,28 @@ +#ifndef FLOWNET_FLOWWARP_H_ +#define FLOWNET_FLOWWARP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +#define FW_THREADS 32 +#define FW_TILE_X FW_THREADS +#define FW_TILE_C FW_THREADS + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void 
FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output); + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad); +} // end namespace tensorflow + +#endif // FLOWNET_FLOWWARP_H_ diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc new file mode 100644 index 0000000..9f3e7ea --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc @@ -0,0 +1,57 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpGradKernel : public OpKernel { + public: + explicit FlowWarpGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& image_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + const Tensor& grad_t = ctx->input(2); + + OP_REQUIRES(ctx, image_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + image_t.dim_size(0) == flow_t.dim_size(0) && image_t.dim_size( + 1) == flow_t.dim_size(1) && image_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *image_grad_t; + Tensor *flow_grad_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, image_t.shape(), &image_grad_t)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, flow_t.shape(), &flow_grad_t)); + + auto image = image_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto gradient = grad_t.tensor<float, 4>(); + auto image_grad = image_grad_t->tensor<float, 4>(); + auto flow_grad = flow_grad_t->tensor<float, 4>(); + + FlowWarpGrad(ctx->eigen_gpu_device(), + image, + flow, + gradient, + image_grad, + flow_grad); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarpGrad") + .Device(DEVICE_GPU), + FlowWarpGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc new file mode 100644 index 0000000..25248c8 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc @@ -0,0 +1,126 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "flow_warp.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpGradKernel( + const float *image, + float *image_grad, + const float *flow, + float *flow_grad, + const float *gradient, + int batch_size, + int channels, + int cblocks, + int width, + int wblocks, + int height, + int widthheight) { + int x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if (x >= width) return; + + int y = blockIdx.y; + int n = blockIdx.z; + + const int flow_idx = ((n * height + y) * width + x) * 2; + float x2 = float(x) + flow[flow_idx]; + float y2 = float(y) + 
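/* Gradient layout for the warp: the image gradient is a scatter of the incoming
   gradient onto the four bilinear neighbours via atomicAdd, while the flow
   gradient is the incoming gradient dotted (over channels) with the analytic
   derivative of the bilinear sample,
     d(warped)/dx2 = (1-b) * (I_TR - I_TL) + b * (I_BR - I_BL)
     d(warped)/dy2 = (1-a) * (I_BL - I_TL) + a * (I_BR - I_TR)
   with a = x2 - xL and b = y2 - yT as in the forward kernel. */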
flow[flow_idx + 1]; + + if ((x2 >= 0.f) && (y2 >= 0.f) && (x2 < width) && (y2 < height)) { + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + + for (int c = 0; c < channels; c++) { + float warped_diff_value = gradient[((n * height + y) * width + x) * channels + c]; + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_R) * channels + c], + warped_diff_value * alpha * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * beta); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_R) * channels + c], + warped_diff_value * alpha * beta); + } + + float gamma = iy2_B - y2; + float bot_diff = 0; + + for (int c = 0; c < channels; c++) { + int ch_off = (n * channels + c) * height; + float temp = 0; + temp += gamma * + (image[((n * height + iy2_T) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_B) * width + ix2_L) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2] = bot_diff; + + gamma = ix2_R - x2; + bot_diff = 0; + + for (int c = 0; c < channels; c++) { + float temp = 0; + temp += gamma * + (image[((n * height + iy2_B) * width + ix2_L) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_R) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2 + 1] = bot_diff; + } +} + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad) { + const int batch_size = image.dimension(0); + const int height = image.dimension(1); + const int width = image.dimension(2); + const int channels = image.dimension(3); + const int width_height = width * height; + + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, 1); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(image_grad.data(), 0, batch_size * height * width * channels * sizeof(float)); + cudaMemset(flow_grad.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpGradKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + image.data(), + image_grad.data(), + flow.data(), + flow_grad.data(), + gradient.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc new file mode 100644 index 0000000..aef9c74 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc @@ -0,0 +1,23 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include 
"tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +REGISTER_OP("FlowWarp") +.Input("image: float32") +.Input("flow: float32") +.Output("output: float32") +.SetShapeFn(::tensorflow::shape_inference::UnchangedShape); + +REGISTER_OP("FlowWarpGrad") +.Input("image: float32") +.Input("flow: float32") +.Input("gradient: float32") +.Output("image_grad: float32") +.Output("flow_grad: float32") +.SetShapeFn([](shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc new file mode 100644 index 0000000..b93dfa6 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc @@ -0,0 +1,420 @@ +#include "augmentation_base.h" + +#include <math.h> +#include <random> + +namespace tensorflow { +/** TransMat Functions **/ +void AugmentationLayerBase::TransMat::fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height) { + leftMultiply(1, 0, -0.5 * out_width, + 0, 1, -0.5 * out_height); + + if (coeff->angle) { + leftMultiply(cos(coeff->angle()), -sin(coeff->angle()), 0, + sin(coeff->angle()), cos(coeff->angle()), 0); + } + + if (coeff->dx || coeff->dy) { + leftMultiply(1, 0, coeff->dx() * out_width, + 0, 1, coeff->dy() * out_height); + } + + if (coeff->zoom_x || coeff->zoom_y) { + leftMultiply(1.0 / coeff->zoom_x(), 0, 0, + 0, 1.0 / coeff->zoom_y(), 0); + } + + leftMultiply(1, 0, 0.5 * src_width, + 0, 1, 0.5 * src_height); +} + +void AugmentationLayerBase::TransMat::fromTensor(const float *tensor_data) { + t0 = tensor_data[0]; + t1 = tensor_data[1]; + t2 = tensor_data[2]; + t3 = tensor_data[3]; + t4 = tensor_data[4]; + t5 = tensor_data[5]; +} + +AugmentationLayerBase::TransMat AugmentationLayerBase::TransMat::inverse() { + float a = this->t0, b = this->t1, c = this->t2; + float d = this->t3, e = this->t4, f = this->t5; + + float denom = a * e - b * d; + + TransMat result; + + result.t0 = e / denom; + result.t1 = b / -denom; + result.t2 = (c * e - b * f) / -denom; + result.t3 = d / -denom; + result.t4 = a / denom; + result.t5 = (c * d - a * f) / denom; + + return result; +} + +void AugmentationLayerBase::TransMat::leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5) { + float t0 = this->t0, t1 = this->t1, t2 = this->t2; + float t3 = this->t3, t4 = this->t4, t5 = this->t5; + + this->t0 = t0 * u0 + t3 * u1; + this->t1 = t1 * u0 + t4 * u1; + this->t2 = t2 * u0 + t5 * u1 + u2; + this->t3 = t0 * u3 + t3 * u4; + this->t4 = t1 * u3 + t4 * u4; + this->t5 = t2 * u3 + t5 * u4 + u5; +} + +void AugmentationLayerBase::TransMat::toIdentity() { + t0 = 1; t1 = 0; t2 = 0; + t3 = 0; t4 = 1; t5 = 0; +} + +/** AugmentationCoeff Functions **/ +void AugmentationCoeff::clear() { + // Spatial variables + dx.clear(); + dy.clear(); + angle.clear(); + zoom_x.clear(); + zoom_y.clear(); + + // Chromatic variables + gamma.clear(); + brightness.clear(); + contrast.clear(); + color1.clear(); + color2.clear(); + color3.clear(); +} + +void AugmentationCoeff::combine_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = dx() * coeff.dx(); + } + + if (coeff.dy) { + dy = dy() * coeff.dy(); + } + + if (coeff.angle) { + angle = angle() * coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = zoom_x() * 
coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = zoom_y() * coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = brightness() * coeff.brightness(); + } + + if (coeff.contrast) { + contrast = contrast() * coeff.contrast(); + } + + if (coeff.color1) { + color1 = color1() * coeff.color1(); + } + + if (coeff.color2) { + color2 = color2() * coeff.color2(); + } + + if (coeff.color3) { + color3 = color3() * coeff.color3(); + } +} + +void AugmentationCoeff::replace_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = coeff.dx(); + } + + if (coeff.dy) { + dy = coeff.dy(); + } + + if (coeff.angle) { + angle = coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = coeff.brightness(); + } + + if (coeff.contrast) { + contrast = coeff.contrast(); + } + + if (coeff.color1) { + color1 = coeff.color1(); + } + + if (coeff.color2) { + color2 = coeff.color2(); + } + + if (coeff.color3) { + color3 = coeff.color3(); + } +} + +/** AugmentationLayerBase Functions **/ +float AugmentationLayerBase::rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value) { + std::random_device rd; // Will be used to obtain a seed for the random number + // engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + + float spread = param.spread * discount_coeff; + + if (param.rand_type == "uniform_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (param.spread > 0.0) { + std::uniform_real_distribution<> uniform(param.mean - spread, + param.mean + spread); + tmp1 = uniform(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else if (param.rand_type == "gaussian_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (spread > 0.0) { + std::normal_distribution<> normal(param.mean, spread); + tmp1 = normal(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else { + throw "Unknown random type: " + param.rand_type; + } +} + +void AugmentationLayerBase::generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.gamma) { + coeff.gamma = rng_generate(aug.gamma(), discount_coeff, coeff.gamma.get_default()); + } + + if (aug.brightness) { + coeff.brightness = + rng_generate(aug.brightness(), discount_coeff, coeff.brightness.get_default()); + } + + if (aug.contrast) { + coeff.contrast = rng_generate(aug.contrast(), discount_coeff, coeff.contrast.get_default()); + } + + if (aug.color) { + coeff.color1 = rng_generate(aug.color(), discount_coeff, coeff.color1.get_default()); + coeff.color2 = rng_generate(aug.color(), discount_coeff, coeff.color2.get_default()); + coeff.color3 = rng_generate(aug.color(), discount_coeff, coeff.color3.get_default()); + } +} + +void AugmentationLayerBase::generate_spatial_coeffs(float discount_coeff, + const 
AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.translate) { + coeff.dx = rng_generate(aug.translate(), discount_coeff, coeff.dx.get_default()); + coeff.dy = rng_generate(aug.translate(), discount_coeff, coeff.dy.get_default()); + } + + if (aug.rotate) { + coeff.angle = rng_generate(aug.rotate(), discount_coeff, coeff.angle.get_default()); + } + + if (aug.zoom) { + coeff.zoom_x = rng_generate(aug.zoom(), discount_coeff, coeff.zoom_x.get_default()); + coeff.zoom_y = coeff.zoom_x(); + } + + if (aug.squeeze) { + float squeeze_coeff = rng_generate(aug.squeeze(), discount_coeff, 1.0); + coeff.zoom_x = coeff.zoom_x() * squeeze_coeff; + coeff.zoom_y = coeff.zoom_y() * squeeze_coeff; + } +} + +void AugmentationLayerBase::generate_valid_spatial_coeffs( + float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height) { + int x, y; + float x1, y1, x2, y2; + int counter = 0; + int good_params = 0; + AugmentationCoeff incoming_coeff(coeff); + + while (good_params < 4 && counter < 50) { + coeff.clear(); + AugmentationLayerBase::generate_spatial_coeffs(discount_coeff, aug, coeff); + coeff.combine_with(incoming_coeff); + + // Check if all 4 corners of the transformed image fit into the original + // image + good_params = 0; + + for (x = 0; x < out_width; x += out_width - 1) { + for (y = 0; y < out_height; y += out_height - 1) { + // move the origin + x1 = x - 0.5 * out_width; + y1 = y - 0.5 * out_height; + + // rotate + x2 = cos(coeff.angle()) * x1 - sin(coeff.angle()) * y1; + y2 = sin(coeff.angle()) * x1 + sin(coeff.angle()) * y1; + + // translate + x2 = x2 + coeff.dx() * out_width; + y2 = y2 + coeff.dy() * out_height; + + // zoom + x2 = x2 / coeff.zoom_x(); + y2 = y2 / coeff.zoom_y(); + + // move the origin back + x2 = x2 + 0.5 * src_width; + y2 = y2 + 0.5 * src_height; + + if (!((floor(x2) < 0) || (floor(x2) > src_width - 2.0) || + (floor(y2) < 0) || (floor(y2) > src_height - 2.0))) { + good_params++; + } + } + } + counter++; + } + + if (counter >= 50) { + printf("Warning: No suitable spatial transformation after %d attempts.\n", counter); + coeff.clear(); + coeff.replace_with(incoming_coeff); + } +} + +void AugmentationLayerBase::copy_chromatic_coeffs_to_tensor( + const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out) +{ + float *out_ptr = out.data(); + int counter = 0; + + for (AugmentationCoeff coeff : coeff_arr) { + out_ptr[counter + 0] = coeff.gamma(); + out_ptr[counter + 1] = coeff.brightness(); + out_ptr[counter + 2] = coeff.contrast(); + out_ptr[counter + 3] = coeff.color1(); + out_ptr[counter + 4] = coeff.color2(); + out_ptr[counter + 5] = coeff.color3(); + counter += 6; + } +} + +void AugmentationLayerBase::copy_spatial_coeffs_to_tensor( + const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert) +{ + float *out_ptr = out.data(); + int counter = 0; + TransMat t; + + for (AugmentationCoeff coeff : coeff_arr) { + t.toIdentity(); + t.fromCoeff(&coeff, out_width, out_height, src_width, src_height); + + if (invert) { + t = t.inverse(); + } + + out_ptr[counter + 0] = t.t0; + out_ptr[counter + 1] = t.t1; + out_ptr[counter + 2] = t.t2; + out_ptr[counter + 3] = t.t3; + out_ptr[counter + 4] = t.t4; + out_ptr[counter + 5] = t.t5; + counter += 6; + } +} +} diff --git 
a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h new file mode 100644 index 0000000..d2aba2c --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h @@ -0,0 +1,228 @@ +#ifndef AUGMENTATION_LAYER_BASE_H_ +#define AUGMENTATION_LAYER_BASE_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +#include <iostream> +#include <string> +#include <vector> + +namespace tensorflow { +template<typename T> +class OptionalType { + public: + OptionalType(const T default_value) : default_value(default_value), has_value(false) {} + + operator bool() const { + return has_value; + } + + OptionalType& operator=(T val) { + has_value = true; + value = val; + return *this; + } + + const T operator()() const { + return has_value ? value : default_value; + } + + void clear() { + has_value = false; + } + + const T get_default() { + return default_value; + } + + private: + T value; + bool has_value; + const T default_value; +}; + +class AugmentationCoeff { + public: + // Spatial Types + OptionalType<float>dx; + OptionalType<float>dy; + OptionalType<float>angle; + OptionalType<float>zoom_x; + OptionalType<float>zoom_y; + + // Chromatic Types + OptionalType<float>gamma; + OptionalType<float>brightness; + OptionalType<float>contrast; + OptionalType<float>color1; + OptionalType<float>color2; + OptionalType<float>color3; + + AugmentationCoeff() : dx(0.0), dy(0.0), angle(0.0), zoom_x(1.0), zoom_y(1.0), gamma(1.0), + brightness(0.0), contrast(1.0), color1(1.0), color2(1.0), color3(1.0) {} + + AugmentationCoeff(const AugmentationCoeff& coeff) : AugmentationCoeff() { + replace_with(coeff); + } + + void clear(); + + void combine_with(const AugmentationCoeff& coeff); + + void replace_with(const AugmentationCoeff& coeff); +}; + +typedef struct AugmentationParam { + std::string rand_type; + bool should_exp; + float mean; + float spread; + float prob; +} AugmentationParam; + +class AugmentationParams { + public: + int crop_height; + int crop_width; + + // Spatial options + OptionalType<struct AugmentationParam>translate; + OptionalType<struct AugmentationParam>rotate; + OptionalType<struct AugmentationParam>zoom; + OptionalType<struct AugmentationParam>squeeze; + + // Chromatic options + OptionalType<struct AugmentationParam>gamma; + OptionalType<struct AugmentationParam>brightness; + OptionalType<struct AugmentationParam>contrast; + OptionalType<struct AugmentationParam>color; + + inline AugmentationParams(int crop_height, + int crop_width, + std::vector<std::string>params_name, + std::vector<std::string>params_rand_type, + std::vector<bool> params_exp, + std::vector<float> params_mean, + std::vector<float> params_spread, + std::vector<float> params_prob) : + crop_height(crop_height), + crop_width(crop_width), + translate(AugmentationParam()), + rotate(AugmentationParam()), + zoom(AugmentationParam()), + squeeze(AugmentationParam()), + gamma(AugmentationParam()), + brightness(AugmentationParam()), + contrast(AugmentationParam()), + color(AugmentationParam()) { + for (int i = 0; i < params_name.size(); i++) { + const std::string name = params_name[i]; + const std::string rand_type = params_rand_type[i]; + const bool should_exp = params_exp[i]; + const float mean = params_mean[i]; + const float spread = params_spread[i]; + const float prob = params_prob[i]; + + struct AugmentationParam param = { rand_type, should_exp, mean, spread, prob }; + + if (name == "translate") { + this->translate = param; + } else if 
(name == "rotate") { + this->rotate = param; + } else if (name == "zoom") { + this->zoom = param; + } else if (name == "squeeze") { + this->squeeze = param; + } else if (name == "noise") { + // NoOp: We handle noise on the Python side + } else if (name == "gamma") { + this->gamma = param; + } else if (name == "brightness") { + this->brightness = param; + } else if (name == "contrast") { + this->contrast = param; + } else if (name == "color") { + this->color = param; + } else { + std::cout << "Ignoring unknown augmentation parameter: " << name << std::endl; + } + } + } + + bool should_do_spatial_transform() { + return this->translate || this->rotate || this->zoom || this->squeeze; + } + + bool should_do_chromatic_transform() { + return this->gamma || this->brightness || this->contrast || this->color; + } +}; + +class AugmentationLayerBase { + public: + class TransMat { + /** + * Translation matrix class for spatial augmentation + * | 0 1 2 | + * | 3 4 5 | + */ + + public: + float t0, t1, t2; + float t3, t4, t5; + + + void fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height); + + void fromTensor(const float *tensor_data); + + TransMat inverse(); + + void leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5); + + void toIdentity(); + }; + + // TODO: Class ChromaticCoeffs + + static float rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value); + + static void clear_spatial_coeffs(AugmentationCoeff& coeff); + static void generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_valid_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height); + + static void copy_chromatic_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out); + static void copy_spatial_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert = false); +}; +} // namespace tensorflow + +#endif // AUGMENTATION_LAYER_BASE_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc new file mode 100644 index 0000000..77b8c83 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc @@ -0,0 +1,461 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <iostream> +#include <random> +#include <vector> + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline float clamp(float f, float a, float b) { + return 
fmaxf(a, fminf(f, b)); +} + +template<> +void Augment(OpKernelContext *context, + const CPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int64 channel_count = batch_size * out_height * out_width; + const int kCostPerChannel = 10; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + + Shard(worker_threads.num_threads, + worker_threads.workers, + channel_count, + kCostPerChannel, + [batch_size, channels, src_width, + src_height, src_count, out_width, out_height, src_data, + out_data, transMats, chromatic_coeffs]( + int64 start_channel, int64 end_channel) { + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + for (int index = start_channel; index < end_channel; index++) { + int x = index % out_width; + int y = (index / out_width) % out_height; + int n = index / out_width / out_height; + + const float *transMat = transMats + n * 6; + + float gamma, brightness, contrast; + + if (chromatic_coeffs) { + gamma = chromatic_coeffs[n * 6 + 0]; + brightness = chromatic_coeffs[n * 6 + 1]; + contrast = chromatic_coeffs[n * 6 + 2]; + } + + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + int srcTLIdxOffset = ((n * src_height + (int)tly) * src_width + (int)tlx) * channels; + + // ((n * src_height + tly) * src_width + (tlx + 1)) * channels + int srcTRIdxOffset = srcTLIdxOffset + channels; + + // ((n * src_height + (tly + 1)) * src_width + tlx) * channels + int srcBLIdxOffset = srcTLIdxOffset + channels * src_width; + + // ((n * src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + int srcBRIdxOffset = srcTLIdxOffset + channels + channels * src_width; + + // Variables for chromatic transform + int data_index[3]; + float rgb[3]; + float mean_in = 0; + float mean_out = 0; + + for (int c = 0; c < channels; c++) { + // Bilinear interpolation + int srcTLIdx = srcTLIdxOffset + c; + int srcTRIdx = std::min(srcTRIdxOffset + c, src_count); + int srcBLIdx = std::min(srcBLIdxOffset + c, src_count); + int srcBRIdx = std::min(srcBRIdxOffset + c, src_count); + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + if (chromatic_coeffs) { + // Gather data for chromatic transform + data_index[c] = index * channels + c; + rgb[c] = dest; + mean_in += rgb[c]; + + // Note: coeff[3] == color1, coeff[4] == color2, ... 
+ rgb[c] *= chromatic_coeffs[n * 6 + (3 + c)]; + + mean_out += rgb[c]; + } else { + out_data[index * channels + c] = dest; + } + } + + float brightness_coeff = mean_in / (mean_out + 0.01f); + + if (chromatic_coeffs) { + // Chromatic transformation + for (int c = 0; c < channels; c++) { + // compensate brightness + rgb[c] = clamp(rgb[c] * brightness_coeff, 0.0f, 1.0f); + + // gamma change + rgb[c] = pow(rgb[c], gamma); + + // brightness change + rgb[c] = rgb[c] + brightness; + + // contrast change + rgb[c] = 0.5f + (rgb[c] - 0.5f) * contrast; + + out_data[data_index[c]] = clamp(rgb[c], 0.0f, 1.0f); + } + } + } + }); +} + +template<typename Device> +class DataAugmentation : public OpKernel { + public: + explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + + // TODO: Verify params are all the same length + + // Get the tensors for params_a and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_a_rand_type", ¶ms_a_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", ¶ms_a_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_coeff_schedule", ¶ms_a_coeff_schedule_)); + + // Get the tensors for params_b and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_b_rand_type", ¶ms_b_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", ¶ms_b_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_coeff_schedule", ¶ms_b_coeff_schedule_)); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + // Get the global step value + const Tensor& global_step_t = ctx->input(2); + auto global_step_eigen = global_step_t.tensor<int64, 0>(); + const int64 global_step = global_step_eigen.data()[0]; + + // Dimension constants + const int batch_size = input_a_t.dim_size(0); + const int src_height = input_a_t.dim_size(1); + const int src_width = input_a_t.dim_size(2); + const int channels = input_a_t.dim_size(3); + const int src_count = batch_size * src_height * src_width * channels; + const int out_height = crop_[0]; + const int out_width = crop_[1]; + const int out_count = batch_size * out_height * out_width * channels; + + // All tensors for this op + Tensor chromatic_coeffs_a_t; + Tensor chromatic_coeffs_b_t; + + // Allocate the memory for the output images + Tensor *output_a_t; + Tensor *output_b_t; + + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(1, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_b_t)); + + // Allocate the memory for the output spatial transforms + Tensor *spat_transform_a_t; + Tensor *spat_transform_b_t; + + 
OP_REQUIRES_OK(ctx, + ctx->allocate_output(2, TensorShape({ batch_size, 6 }), + &spat_transform_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(3, TensorShape({ batch_size, 6 }), + &spat_transform_b_t)); + + // Compute discount for coefficients if using a schedule + float discount_coeff_a = 1.0; + float discount_coeff_b = 1.0; + + if (params_a_coeff_schedule_.size() == 3) { + float half_life = params_a_coeff_schedule_[0]; + float initial_coeff = params_a_coeff_schedule_[1]; + float final_coeff = params_a_coeff_schedule_[2]; + discount_coeff_a = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + + if (params_b_coeff_schedule_.size() == 3) { + if (params_a_coeff_schedule_.size() == 3) { + discount_coeff_b = discount_coeff_a; + } else { + float half_life = params_b_coeff_schedule_[0]; + float initial_coeff = params_b_coeff_schedule_[1]; + float final_coeff = params_b_coeff_schedule_[2]; + discount_coeff_b = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + } + + /*** BEGIN AUGMENTATION TO IMAGE A ***/ + auto input_a = input_a_t.tensor<float, 4>(); + auto output_a = output_a_t->tensor<float, 4>(); + + // Load augmentation parameters for image A + AugmentationParams aug_a = AugmentationParams(out_height, out_width, + params_a_name_, + params_a_rand_type_, + params_a_exp_, + params_a_mean_, + params_a_spread_, + params_a_prob_); + + std::vector<AugmentationCoeff> coeffs_a; + + + bool gen_spatial_transform = aug_a.should_do_spatial_transform(); + bool gen_chromatic_transform = aug_a.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff; + + if (gen_spatial_transform) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_a, aug_a, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_a, aug_a, coeff); + } + + coeffs_a.push_back(coeff); + } + + // Copy spatial coefficients A to the output Tensor on the CPU + // (output for FlowAugmentation) + auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, + out_width, out_height, + src_width, src_height, + spat_transform_a); + + float *chromatic_coeffs_a_data = NULL; + + if (gen_chromatic_transform) { + // Allocate a temporary tensor to hold the chromatic coefficients + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_a_t)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_a = chromatic_coeffs_a_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_a, chromatic_coeffs_a); + chromatic_coeffs_a_data = chromatic_coeffs_a.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_a.data(), + output_a.data(), + spat_transform_a.data(), + chromatic_coeffs_a_data); + + /*** END AUGMENTATION TO IMAGE A ***/ + + /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + AugmentationParams aug_b = AugmentationParams(out_height, out_width, + params_b_name_, + params_b_rand_type_, + params_b_exp_, + params_b_mean_, + params_b_spread_, + params_b_prob_); + + 
std::vector<AugmentationCoeff> coeffs_b; + + bool gen_spatial_transform_b = aug_b.should_do_spatial_transform(); + bool gen_chromatic_transform_b = aug_b.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff(coeffs_a[n]); + + // If we did a spatial transform on image A, we need to do the same one + // (+ possibly more) on image B + if (gen_spatial_transform_b) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_b, aug_b, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform_b) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_b, aug_b, coeff); + } + + coeffs_b.push_back(coeff); + } + + /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + + /*** BEGIN AUGMENTATION TO IMAGE B ***/ + auto input_b = input_b_t.tensor<float, 4>(); + auto output_b = output_b_t->tensor<float, 4>(); + + // Copy spatial coefficients B to the output Tensor on the CPU + auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b); + + float *chromatic_coeffs_b_data = NULL; + + if (gen_chromatic_transform || gen_chromatic_transform_b) { + // Allocate a temporary tensor to hold the chromatic coefficients + tensorflow::AllocatorAttributes pinned_allocator; + pinned_allocator.set_on_host(true); + pinned_allocator.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_b_t, pinned_allocator)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_b = chromatic_coeffs_b_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_b, chromatic_coeffs_b); + chromatic_coeffs_b_data = chromatic_coeffs_b.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_b.data(), + output_b.data(), + spat_transform_b.data(), + chromatic_coeffs_b_data); + + // FlowAugmentation needs the inverse + // TODO: To avoid rewriting, can we invert when we read on the + // FlowAugmentation side? 
+ AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b, + true); + + /*** END AUGMENTATION TO IMAGE B ***/ + } + + private: + std::vector<int32>crop_; + + // Params A + std::vector<string>params_a_name_; + std::vector<string>params_a_rand_type_; + std::vector<bool>params_a_exp_; + std::vector<float>params_a_mean_; + std::vector<float>params_a_spread_; + std::vector<float>params_a_prob_; + std::vector<float>params_a_coeff_schedule_; + + // Params B + std::vector<string>params_b_name_; + std::vector<string>params_b_rand_type_; + std::vector<bool>params_b_exp_; + std::vector<float>params_b_mean_; + std::vector<float>params_b_spread_; + std::vector<float>params_b_prob_; + std::vector<float>params_b_coeff_schedule_; +}; + + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_CPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<CPUDevice>) + +#if GOOGLE_CUDA + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_GPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc new file mode 100644 index 0000000..7a2101d --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc @@ -0,0 +1,348 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +inline __device__ __host__ float clamp(float f, float a, float b) { + return fmaxf(a, fminf(f, b)); +} + +__global__ void SpatialAugmentation( + const int32 nthreads, + const int src_width, + const int src_height, + const int channels, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + int c = index % channels; + int x = (index / channels) % out_width; + int y = (index / channels / out_width) % out_height; + int n = index / channels / out_width / out_height; + + const float *transMat = transMats + n * 6; + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + // Bilinear interpolation + int srcTLIdx = ((n * src_height + tly) * src_width + tlx) * channels + c; + int srcTRIdx = min((int)(((n * src_height + tly) * src_width + (tlx + 1)) * channels + c), + src_count); + int srcBLIdx = min((int)(((n * src_height + (tly + 1)) * src_width + tlx) * channels + c), + src_count); + int srcBRIdx = min((int)(((n 
* src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + c), + src_count); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + out_data[index] = dest; + } +} + +typedef Eigen::GpuDevice GPUDevice; + +template<> +void Augment(OpKernelContext *context, + const GPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int out_count = batch_size * out_height * out_width * channels; + CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d); + + printf("Chromatic transform not yet implemented on GPU, ignoring."); + + SpatialAugmentation << < config.block_count, config.thread_per_block, 0, d.stream() >> > ( + config.virtual_thread_count, src_width, src_height, channels, src_count, + out_width, out_height, + src_data, out_data, transMats); +} + +// +// template<typename Device> +// class DataAugmentation : public OpKernel { +// public: +// explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { +// // Get the crop [height, width] tensor and verify its dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); +// OP_REQUIRES(ctx, crop_.size() == 2, +// errors::InvalidArgument("crop must be 2 dimensions")); +// +// // TODO: Verify params are all the same length +// +// // Get the tensors for params_a and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_a_rand_type", +// ¶ms_a_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", +// ¶ms_a_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); +// +// // Get the tensors for params_b and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_b_rand_type", +// ¶ms_b_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", +// ¶ms_b_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); +// } +// +// void Compute(OpKernelContext *ctx) override { +// const GPUDevice& device = ctx->eigen_gpu_device(); +// +// // Get the input images +// const Tensor& input_a_t = ctx->input(0); +// const Tensor& input_b_t = ctx->input(1); +// +// // Dimension constants +// const int batch_size = input_a_t.dim_size(0); +// const int src_height = input_a_t.dim_size(1); +// const int src_width = input_a_t.dim_size(2); +// const int channels = input_a_t.dim_size(3); +// const int src_count = batch_size * src_height * src_width * channels; +// const int out_height = crop_[0]; +// const int out_width = crop_[1]; +// const int out_count = batch_size * out_height * out_width * channels; +// +// // Allocate the memory for the output images +// Tensor *output_a_t; +// Tensor *output_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(0, TensorShape({ batch_size, +// crop_[0], crop_[1], +// 
channels }), +// &output_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(1, TensorShape({ batch_size, +// crop_[0], crop_[1], +// channels }), +// &output_b_t)); +// +// // Allocate the memory for the output spatial transforms +// Tensor *spat_transform_a_t; +// Tensor *spat_transform_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(2, TensorShape({ batch_size, 6 }), +// &spat_transform_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(3, TensorShape({ batch_size, 6 }), +// &spat_transform_b_t)); +// +// // Allocate temporary pinned memory for the spatial transforms to be +// used +// // on the GPU +// tensorflow::AllocatorAttributes pinned_allocator; +// pinned_allocator.set_on_host(true); +// pinned_allocator.set_gpu_compatible(true); +// +// Tensor spat_transform_a_pinned_t; +// Tensor spat_transform_b_pinned_t; +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_a_pinned_t, +// pinned_allocator)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_b_pinned_t, +// pinned_allocator)); +// auto spat_transform_a_pinned = spat_transform_a_pinned_t.tensor<float, +// 2>(); +// auto spat_transform_b_pinned = spat_transform_b_pinned_t.tensor<float, +// 2>(); +// +// /*** BEGIN AUGMENTATION TO IMAGE A ***/ +// auto input_a = input_a_t.tensor<float, 4>(); +// auto output_a = output_a_t->tensor<float, 4>(); +// +// // Load augmentation parameters for image A +// AugmentationParams aug_a = AugmentationParams(out_height, out_width, +// params_a_name_, +// params_a_rand_type_, +// params_a_exp_, +// params_a_mean_, +// params_a_spread_, +// params_a_prob_); +// +// std::vector<AugmentationCoeff> coeffs_a; +// +// bool gen_spatial_transform = aug_a.should_do_spatial_transform(); +// +// for (int n = 0; n < batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_a, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_a.push_back(coeff); +// } +// +// // Copy spatial coefficients A to the output Tensor on the CPU (output +// for +// // FlowAugmentation) +// auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_a); +// +// // ...as well as a Tensor going to the GPU +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_a_pinned); +// +// CudaLaunchConfig config = GetCudaLaunchConfig(out_count, device); +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_a.data(), output_a.data(), spat_transform_a_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE A ***/ +// +// /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// AugmentationParams aug_b = AugmentationParams(out_height, out_width, +// params_b_name_, +// params_b_rand_type_, +// params_b_exp_, +// params_b_mean_, +// params_b_spread_, +// params_b_prob_); +// +// std::vector<AugmentationCoeff> coeffs_b; +// +// gen_spatial_transform = aug_b.should_do_spatial_transform(); +// +// for (int n = 0; n < 
batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_b, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_b.push_back(coeff); +// } +// +// /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// +// /*** BEGIN AUGMENTATION TO IMAGE B ***/ +// auto input_b = input_b_t.tensor<float, 4>(); +// auto output_b = output_b_t->tensor<float, 4>(); +// +// // Copy spatial coefficients B to the output Tensor on the CPU +// auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_b, +// true); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_b_pinned); +// +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_b.data(), output_b.data(), spat_transform_b_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE B ***/ +// } +// +// private: +// std::vector<int32>crop_; +// +// // Params A +// std::vector<string>params_a_name_; +// std::vector<string>params_a_rand_type_; +// std::vector<bool>params_a_exp_; +// std::vector<float>params_a_mean_; +// std::vector<float>params_a_spread_; +// std::vector<float>params_a_prob_; +// +// // Params B +// std::vector<string>params_b_name_; +// std::vector<string>params_b_rand_type_; +// std::vector<bool>params_b_exp_; +// std::vector<float>params_b_mean_; +// std::vector<float>params_b_spread_; +// std::vector<float>params_b_prob_; +// }; +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h new file mode 100644 index 0000000..545b8a0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h @@ -0,0 +1,22 @@ +#ifndef FLOWNET_DATA_AUGMENTATION_H_ +#define FLOWNET_DATA_AUGMENTATION_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<class Device> +void Augment(OpKernelContext *context, + const Device & d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs); +} // namespace tensorflow +#endif // FLOWNET_DATA_AUGMENTATION_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc new file mode 100644 index 0000000..b5cc11f --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc @@ -0,0 +1,129 @@ +#define EIGEN_USE_THREADS + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline int clamp(int f, int a, int b) { + return std::max(a, std::min(f, b)); +} + +template<> +void FillFlowAugmentation(const 
CPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int src_height = flows.dimension(1); + const int src_width = flows.dimension(2); + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + float *output_ptr = output.data(); + const float *flow_ptr = flows.data(); + + for (int n = 0; n < batch_size; n++) { + const float *transMatA = transforms_from_a.data() + n * 6; + const float *transMatB = transforms_from_b.data() + n * 6; + + for (int y = 0; y < out_height; y++) { + int outputIdxOffset = (n * out_height + y) * out_width; + + for (int x = 0; x < out_width; x++) { + // Apply transformation matrix applied to first image + const float xpos1 = x * transMatA[0] + y * transMatA[1] + transMatA[2]; + const float ypos1 = x * transMatA[3] + y * transMatA[4] + transMatA[5]; + + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to second image + const float xpos3 = xpos2 * transMatB[0] + ypos2 * transMatB[1] + transMatB[2]; + const float ypos3 = xpos2 * transMatB[3] + ypos2 * transMatB[4] + transMatB[5]; + + output_ptr[(outputIdxOffset + x) * 2 + 0] = xpos3 - (float)x; + output_ptr[(outputIdxOffset + x) * 2 + 1] = ypos3 - (float)y; + } + } + } +} + +template<typename Device> +class FlowAugmentation : public OpKernel { + public: + explicit FlowAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& flows_t = ctx->input(0); + const Tensor& transforms_from_a_t = ctx->input(1); + const Tensor& transforms_from_b_t = ctx->input(2); + + OP_REQUIRES(ctx, flows_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_a_t.shape()) && + transforms_from_a_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_a_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_a should be num_images x 6")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_b_t.shape()) && + transforms_from_b_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_b_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_b should be num_images x 6")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ flows_t.dim_size(0), crop_[0], crop_[1], + flows_t.dim_size(3) }), &output_t)); + + // Perform flow augmentation + auto flows = flows_t.tensor<float, 4>(); + auto transforms_from_a = transforms_from_a_t.tensor<float, 2>(); + auto transforms_from_b = transforms_from_b_t.tensor<float, 2>(); + auto 
output = output_t->tensor<float, 4>(); + + FillFlowAugmentation(ctx->eigen_device<Device>(), + output, + flows, + transforms_from_a, + transforms_from_b); + } + + private: + std::vector<int32>crop_; +}; + +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_CPU), + FlowAugmentation<CPUDevice>) + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_GPU), + FlowAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h new file mode 100644 index 0000000..7795991 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h @@ -0,0 +1,19 @@ +#ifndef FLOWNET_FLOW_AUG_H_ +#define FLOWNET_FLOW_AUG_H_ + +// See docs in ../ops/image_ops.cc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template<class Device> +void FillFlowAugmentation(const Device& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b); +} // end namespace tensorflow + +#endif // FLOWNET_FLOW_AUG_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc new file mode 100644 index 0000000..7e10864 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc @@ -0,0 +1,95 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +inline __device__ __host__ int clamp(int f, int a, int b) { + return max(a, min(f, b)); +} + +__global__ void FillFlowAugmentationKernel( + const int32 nthreads, + const float *flow_ptr, + const float *transforms_from_a, + const float *inv_transforms_from_b, + const int src_total_count, const int src_height, const int src_width, + const int batch_size, const int out_height, + const int out_width, float *output_ptr) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const float x = (float)(index % out_width); + const float y = (float)((index / out_width) % out_height); + const int n = (index / out_width / out_height); + + const int transformIdx = n * 6; + + // Apply transformation matrix applied to second image + const float xpos1 = x * transforms_from_a[transformIdx + 0] + + y * transforms_from_a[transformIdx + 1] + + transforms_from_a[transformIdx + 2]; + const float ypos1 = x * transforms_from_a[transformIdx + 3] + + y * transforms_from_a[transformIdx + 4] + + transforms_from_a[transformIdx + 5]; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * + 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float 
ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to first image + const float xpos3 = xpos2 * inv_transforms_from_b[transformIdx + 0] + + ypos2 * inv_transforms_from_b[transformIdx + 1] + + inv_transforms_from_b[transformIdx + 2]; + const float ypos3 = xpos2 * inv_transforms_from_b[transformIdx + 3] + + ypos2 * inv_transforms_from_b[transformIdx + 4] + + inv_transforms_from_b[transformIdx + 5]; + + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 0] = xpos3 - + x; + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 1] = ypos3 - + y; + } +} + +template<> +void FillFlowAugmentation(const GPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<const float, 2>::ConstTensor transforms_from_a, + typename TTypes<const float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int depth = 2; + const int total_count = batch_size * out_height * out_width * depth; + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count / 2, device); + + FillFlowAugmentationKernel << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + total_count / 2, flows.data(), transforms_from_a.data(), + transforms_from_b.data(), + src_total_count, flows.dimension(1), flows.dimension(2), batch_size, + out_height, out_width, output.data()); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/preprocessing.cc b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc new file mode 100644 index 0000000..086a0d0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext *c) { + ShapeHandle input; + + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> crop_; + c->GetAttr("crop", &crop_); + DimensionHandle height = c->MakeDim(crop_[0]); + DimensionHandle width = c->MakeDim(crop_[1]); + c->set_output(0, c->MakeShape({ batch, height, width, depth })); + return Status::OK(); +} + +REGISTER_OP("DataAugmentation") +.Input("image_a: float32") +.Input("image_b: float32") +.Input("global_step: int64") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float)") +.Attr("params_a_spread: list(float)") +.Attr("params_a_prob: list(float)") +.Attr("params_a_coeff_schedule: list(float)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float)") +.Attr("params_b_spread: list(float)") +.Attr("params_b_prob: list(float)") +.Attr("params_b_coeff_schedule: list(float)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") 
+.Output("transforms_from_a: float32") +.Output("transforms_from_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Verify input A and input B both have 4 dimensions + ShapeHandle input_shape_a, input_shape_b; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_shape_b)); + + // TODO: Verify params vectors all have the same length + + // TODO: Move this out of here and into Compute + // Verify input A and input B are the same shape + DimensionHandle batch_size, unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 0), + c->Value(c->Dim(input_shape_b, 0)), + &batch_size)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 1), + c->Value(c->Dim(input_shape_b, 1)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 2), + c->Value(c->Dim(input_shape_b, 2)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 3), + c->Value(c->Dim(input_shape_b, 3)), &unused)); + + // Get cropping dimensions + std::vector<int32>crop_; + TF_RETURN_IF_ERROR(c->GetAttr("crop", &crop_)); + + // Reshape input shape to cropped shape + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 1, c->MakeDim(crop_[0]), + &input_shape_a)); + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 2, c->MakeDim(crop_[1]), + &input_shape_a)); + + // Set output images shapes + c->set_output(0, input_shape_a); + c->set_output(1, input_shape_a); + + // Set output spatial transforms shapes + c->set_output(2, c->MakeShape({ batch_size, 6 })); + c->set_output(3, c->MakeShape({ batch_size, 6 })); + + return Status::OK(); + }); + +REGISTER_OP("FlowAugmentation") +.Input("flows: float32") +.Input("transforms_from_a: float32") +.Input("transforms_from_b: float32") +.Attr("crop: list(int) >= 2") +.Output("transformed_flows: float32") +.SetShapeFn(SetOutputToSizedImage); +} // namespace tensorflow diff --git a/Codes/flownet2/src/training_schedules.py b/Codes/flownet2/src/training_schedules.py new file mode 100644 index 0000000..4db5aab --- /dev/null +++ b/Codes/flownet2/src/training_schedules.py @@ -0,0 +1,12 @@ +LONG_SCHEDULE = { + 'step_values': [400000, 600000, 800000, 1000000], + 'learning_rates': [0.0001, 0.00005, 0.000025, 0.0000125, 0.00000625], + 'momentum': 0.9, + 'momentum2': 0.999, + 'weight_decay': 0.0004, + 'max_iter': 1200000, +} + +FINETUNE_SCHEDULE = { + # TODO: Finetune schedule +} diff --git a/Codes/flownet2/src/utils.py b/Codes/flownet2/src/utils.py new file mode 100644 index 0000000..f6abe18 --- /dev/null +++ b/Codes/flownet2/src/utils.py @@ -0,0 +1,46 @@ +import tensorflow as tf + + +# Thanks, https://github.com/tensorflow/tensorflow/issues/4079 +def LeakyReLU(x, leak=0.1, name="lrelu"): + with tf.variable_scope(name): + f1 = 0.5 * (1.0 + leak) + f2 = 0.5 * (1.0 - leak) + return f1 * x + f2 * abs(x) + + +def average_endpoint_error(labels, predictions): + """ + Given labels and predictions of size (N, H, W, 2), calculates average endpoint error: + sqrt[sum_across_channels{(X - Y)^2}] + """ + num_samples = predictions.shape.as_list()[0] + with tf.name_scope(None, "average_endpoint_error", (predictions, labels)) as scope: + predictions = tf.to_float(predictions) + labels = tf.to_float(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + squared_difference = tf.square(tf.subtract(predictions, labels)) + # sum across channels: sum[(X - Y)^2] -> N, H, W, 1 + loss = tf.reduce_sum(squared_difference, 3, keep_dims=True) + loss = tf.sqrt(loss) + return tf.reduce_sum(loss) / num_samples + + 
+def pad(tensor, num=1): + """ + Pads the given tensor along the height and width dimensions with `num` 0s on each side + """ + return tf.pad(tensor, [[0, 0], [num, num], [num, num], [0, 0]], "CONSTANT") + + +def antipad(tensor, num=1): + """ + Performs a crop. "padding" for a deconvolutional layer (conv2d tranpose) removes + padding from the output rather than adding it to the input. + """ + batch, h, w, c = tensor.get_shape().as_list() + # print(batch, h, w, c) + # print(type(batch), type(h), type(w), type(c)) + # return tf.slice(tensor, begin=[0, num, num, 0], size=[batch, h - 2 * num, w - 2 * num, c]) + return tensor[:, num: num + h - 2 * num, num: num + w - 2 * num, :] diff --git a/Codes/flownet2/test.py b/Codes/flownet2/test.py new file mode 100644 index 0000000..2fcb380 --- /dev/null +++ b/Codes/flownet2/test.py @@ -0,0 +1,163 @@ +import os +import tensorflow as tf +import numpy as np +from scipy.misc import imread +import matplotlib +from src.flowlib import read_flow, flow_to_image +matplotlib.use('TKAgg') +import matplotlib.pyplot as plt + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./src/ops/build/preprocessing.so")) + + +def display(img, c): + plt.subplot(int('22' + str(c + 1))) + plt.imshow(img[0, :, :, :]) + + +def main(): + """ +.Input("image_a: float32") +.Input("image_b: float32") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float32)") +.Attr("params_a_spread: list(float32)") +.Attr("params_a_prob: list(float32)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float32)") +.Attr("params_b_spread: list(float32)") +.Attr("params_b_prob: list(float32)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") +.Output("spatial_transform_a: float32") +.Output("inv_spatial_transform_b: float32") + """ + + crop = [364, 492] + params_a_name = ['translate_x', 'translate_y'] + params_a_rand_type = ['uniform_bernoulli', 'uniform_bernoulli'] + params_a_exp = [False, False] + params_a_mean = [0.0, 0.0] + params_a_spread = [0.4, 0.4] + params_a_prob = [1.0, 1.0] + params_b_name = [] + params_b_rand_type = [] + params_b_exp = [] + params_b_mean = [] + params_b_spread = [] + params_b_prob = [] + + with tf.Session() as sess: + with tf.device('/gpu:0'): + image_a = imread('./img0.ppm') / 255.0 + image_b = imread('./img1.ppm') / 255.0 + flow = read_flow('./flow.flo') + + image_a_tf = tf.expand_dims(tf.to_float(tf.constant(image_a, dtype=tf.float64)), 0) + image_b_tf = tf.expand_dims(tf.to_float(tf.constant(image_b, dtype=tf.float64)), 0) + + preprocess = _preprocessing_ops.data_augmentation(image_a_tf, + image_b_tf, + crop, + params_a_name, + params_a_rand_type, + params_a_exp, + params_a_mean, + params_a_spread, + params_a_prob, + params_b_name, + params_b_rand_type, + params_b_exp, + params_b_mean, + params_b_spread, + params_b_prob) + + out = sess.run(preprocess) + trans = out.spatial_transform_a + inv_trans = out.inv_spatial_transform_b + + print(trans.shape) + print(inv_trans.shape) + + flow_tf = tf.expand_dims(tf.to_float(tf.constant(flow)), 0) + aug_flow_tf = _preprocessing_ops.flow_augmentation(flow_tf, trans, inv_trans, crop) + + aug_flow = sess.run(aug_flow_tf)[0, :, :, :] + + # Plot img0, img0aug + plt.subplot(321) + plt.imshow(image_a) + plt.subplot(322) + plt.imshow(out.aug_image_a[0, :, :, :]) + + # Plot 
 img1, img1aug + plt.subplot(323) + plt.imshow(image_b) + plt.subplot(324) + plt.imshow(out.aug_image_b[0, :, :, :]) + + # Plot flow, flowaug + plt.subplot(325) + plt.imshow(flow_to_image(flow)) + plt.subplot(326) + plt.imshow(flow_to_image(aug_flow)) + + plt.show() + + # image_b_aug = sess.run(image_b_tf) + # + # display(np.expand_dims(image_a, 0), 0) + # display(np.expand_dims(image_b, 0), 1) + # display(image_a_aug, 2) + # display(image_b_aug, 3) + # plt.show() + + # o = _preprocessing_ops.flow_augmentation(flow, trans, inv_t, [4, 8]) + # print n[:, :, :] + # print n[0, 0, 1], n[0, 0, 0] + # print n[1, 0, 1], n[1, 0, 0] + # print n[2, 0, 1], n[2, 0, 0] + # print '---' + # print sess.run(o) + + """# Goes along width first!! + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + H=5, W=10, K=2 + n=0, h=1, w=5, k=0 + + (2 * 10) + c + + 30 49 n[0, 1, 5, 0]""" + + +print(os.getpid()) +input("Press Enter to continue...") +main() + +# Last index is channel!! + +# K + +# value 13 should be at [0, 2, 7, 1] aka batch=0, height=1, width=0, channel=0. it is at index=20. +# +# items = { +# 'N': [0, 0], +# 'H': [5, 2], +# 'W': [10, 7], +# 'K': [2, 1], +# } +# +# for (i1, v1) in items.iteritems(): +# for (i2, v2) in items.iteritems(): +# for (i3, v3) in items.iteritems(): +# for (i4, v4) in items.iteritems(): +# if ((v1[1] * v2[0] + v2[1]) * v3[0] + v3[1]) * v4[0] + v4[1] == 55: +# print 'found it: ', i1, i2, i3, i4 diff --git a/Codes/inference.py b/Codes/inference.py new file mode 100644 index 0000000..0263339 --- /dev/null +++ b/Codes/inference.py @@ -0,0 +1,149 @@ +import tensorflow as tf +import os +import time +import numpy as np +import pickle + + +from models import generator +from utils import DataLoader, load, save, psnr_error +from constant import const +import evaluate + + +slim = tf.contrib.slim + +os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = const.GPU + +dataset_name = const.DATASET +test_folder = const.TEST_FOLDER + +num_his = const.NUM_HIS +height, width = 256, 256 + +snapshot_dir = const.SNAPSHOT_DIR +psnr_dir = const.PSNR_DIR +evaluate_name = const.EVALUATE + +print(const) + + +# define dataset +with tf.name_scope('dataset'): + test_video_clips_tensor = tf.placeholder(shape=[1, height, width, 3 * (num_his + 1)], + dtype=tf.float32) + test_inputs = test_video_clips_tensor[..., 0:num_his*3] + test_gt = test_video_clips_tensor[..., -3:] + print('test inputs = {}'.format(test_inputs)) + print('test prediction gt = {}'.format(test_gt)) + +# define the testing generator function; +# at test time only the generator network is used, with no discriminator or FlowNet. 
+with tf.variable_scope('generator', reuse=None): + print('testing = {}'.format(tf.get_variable_scope().name)) + test_outputs = generator(test_inputs, layers=4, output_channel=3) + test_psnr_error = psnr_error(gen_frames=test_outputs, gt_frames=test_gt) + + +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +with tf.Session(config=config) as sess: + # dataset + data_loader = DataLoader(test_folder, height, width) + + # initialize weights + sess.run(tf.global_variables_initializer()) + print('Init global successfully!') + + # tf saver + saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=None) + + restore_var = [v for v in tf.global_variables()] + loader = tf.train.Saver(var_list=restore_var) + + def inference_func(ckpt, dataset_name, evaluate_name): + load(loader, sess, ckpt) + + psnr_records = [] + videos_info = data_loader.videos + num_videos = len(videos_info.keys()) + total = 0 + timestamp = time.time() + + for video_name, video in videos_info.items(): + length = video['length'] + total += length + psnrs = np.empty(shape=(length,), dtype=np.float32) + + for i in range(num_his, length): + video_clip = data_loader.get_video_clips(video_name, i - num_his, i + 1) + psnr = sess.run(test_psnr_error, + feed_dict={test_video_clips_tensor: video_clip[np.newaxis, ...]}) + psnrs[i] = psnr + + print('video = {} / {}, i = {} / {}, psnr = {:.6f}'.format( + video_name, num_videos, i, length, psnr)) + + psnrs[0:num_his] = psnrs[num_his] + psnr_records.append(psnrs) + + result_dict = {'dataset': dataset_name, 'psnr': psnr_records, 'flow': [], 'names': [], 'diff_mask': []} + + used_time = time.time() - timestamp + print('total time = {}, fps = {}'.format(used_time, total / used_time)) + + # TODO specify what's the actual name of ckpt. + pickle_path = os.path.join(psnr_dir, os.path.split(ckpt)[-1]) + with open(pickle_path, 'wb') as writer: + pickle.dump(result_dict, writer, pickle.HIGHEST_PROTOCOL) + + results = evaluate.evaluate(evaluate_name, pickle_path) + print(results) + + + if os.path.isdir(snapshot_dir): + def check_ckpt_valid(ckpt_name): + is_valid = False + ckpt = '' + if ckpt_name.startswith('model.ckpt-'): + ckpt_name_splits = ckpt_name.split('.') + ckpt = str(ckpt_name_splits[0]) + '.' 
+ str(ckpt_name_splits[1]) + ckpt_path = os.path.join(snapshot_dir, ckpt) + if os.path.exists(ckpt_path + '.index') and os.path.exists(ckpt_path + '.meta') and \ + os.path.exists(ckpt_path + '.data-00000-of-00001'): + is_valid = True + + return is_valid, ckpt + + def scan_psnr_folder(): + tested_ckpt_in_psnr_sets = set() + for test_psnr in os.listdir(psnr_dir): + tested_ckpt_in_psnr_sets.add(test_psnr) + return tested_ckpt_in_psnr_sets + + def scan_model_folder(): + saved_models = set() + for ckpt_name in os.listdir(snapshot_dir): + is_valid, ckpt = check_ckpt_valid(ckpt_name) + if is_valid: + saved_models.add(ckpt) + return saved_models + + tested_ckpt_sets = scan_psnr_folder() + while True: + all_model_ckpts = scan_model_folder() + new_model_ckpts = all_model_ckpts - tested_ckpt_sets + + for ckpt_name in new_model_ckpts: + # inference + ckpt = os.path.join(snapshot_dir, ckpt_name) + inference_func(ckpt, dataset_name, evaluate_name) + + tested_ckpt_sets.add(ckpt_name) + + print('waiting for models...') + evaluate.evaluate('compute_auc', psnr_dir) + time.sleep(60) + else: + inference_func(snapshot_dir, dataset_name, evaluate_name) diff --git a/Codes/loss_functions.py b/Codes/loss_functions.py new file mode 100644 index 0000000..ca97966 --- /dev/null +++ b/Codes/loss_functions.py @@ -0,0 +1,54 @@ +import tensorflow as tf +import numpy as np + + +def flow_loss(gen_flows, gt_flows): + print(gen_flows['flow']) + return tf.reduce_mean(tf.abs(gen_flows['flow'] - gt_flows['flow'])) + + +def intensity_loss(gen_frames, gt_frames, l_num): + """ + Calculates the mean lp loss between the predicted and ground truth frames. + + @param gen_frames: The predicted frames at each scale. + @param gt_frames: The ground truth frames at each scale. + @param l_num: 1 or 2 for l1 and l2 loss, respectively. + + @return: The lp loss. + """ + return tf.reduce_mean(tf.abs((gen_frames - gt_frames) ** l_num)) + + +def gradient_loss(gen_frames, gt_frames, alpha): + """ + Calculates the mean gradient difference (GDL) loss between the predicted and ground truth frames. + + @param gen_frames: The predicted frames at each scale. + @param gt_frames: The ground truth frames at each scale. + @param alpha: The power to which each gradient term is raised. + + @return: The GDL loss. + """ + # calculate the loss for each scale + # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively. 
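    # (Editorial note, not part of the original file: with C input channels, filter_x
    # below ends up with shape [1, 2, C, C] and computes I[:, y, x + 1, c] - I[:, y, x, c],
    # while filter_y has shape [2, 1, C, C] and computes I[:, y, x, c] - I[:, y + 1, x, c];
    # the absolute values taken afterwards make the sign convention irrelevant.)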
+ + channels = gen_frames.get_shape().as_list()[-1] + pos = tf.constant(np.identity(channels), dtype=tf.float32) # 3 x 3 + neg = -1 * pos + filter_x = tf.expand_dims(tf.stack([neg, pos]), 0) # [-1, 1] + filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)]) # [[1],[-1]] + strides = [1, 1, 1, 1] # stride of (1, 1) + padding = 'SAME' + + gen_dx = tf.abs(tf.nn.conv2d(gen_frames, filter_x, strides, padding=padding)) + gen_dy = tf.abs(tf.nn.conv2d(gen_frames, filter_y, strides, padding=padding)) + gt_dx = tf.abs(tf.nn.conv2d(gt_frames, filter_x, strides, padding=padding)) + gt_dy = tf.abs(tf.nn.conv2d(gt_frames, filter_y, strides, padding=padding)) + + grad_diff_x = tf.abs(gt_dx - gen_dx) + grad_diff_y = tf.abs(gt_dy - gen_dy) + + # condense into one tensor and avg + return tf.reduce_mean(grad_diff_x ** alpha + grad_diff_y ** alpha) + diff --git a/Codes/models.py b/Codes/models.py new file mode 100644 index 0000000..8c20134 --- /dev/null +++ b/Codes/models.py @@ -0,0 +1,44 @@ +import tensorflow as tf + +import unet +import pix2pix + +from flownet2.src.flowlib import flow_to_image +from flownet2.src.flownet_sd.flownet_sd import FlowNetSD # Ok +from flownet2.src.training_schedules import LONG_SCHEDULE +from flownet2.src.net import Mode + + +slim = tf.contrib.slim + + +def generator(inputs, layers, features_root=64, filter_size=3, pool_size=2, output_channel=3): + return unet.unet(inputs, layers, features_root, filter_size, pool_size, output_channel) + + +def discriminator(inputs, num_filers=(128, 256, 512, 512)): + logits, end_points = pix2pix.pix2pix_discriminator(inputs, num_filers) + return logits, end_points['predictions'] + + +def flownet(input_a, input_b, height, width, reuse=None): + net = FlowNetSD(mode=Mode.TEST) + # train preds flow + input_a = (input_a + 1.0) / 2.0 # flownet receives image with color space in [0, 1] + input_b = (input_b + 1.0) / 2.0 # flownet receives image with color space in [0, 1] + # input size is 384 x 512 + input_a = tf.image.resize_images(input_a, [height, width]) + input_b = tf.image.resize_images(input_b, [height, width]) + flows = net.model( + inputs={'input_a': input_a, 'input_b': input_b}, + training_schedule=LONG_SCHEDULE, + trainable=False, reuse=reuse + ) + return flows['flow'] + + +def initialize_flownet(sess, checkpoint): + flownet_vars = slim.get_variables_to_restore(include=['FlowNetSD']) + flownet_saver = tf.train.Saver(flownet_vars) + print('FlownetSD restore from {}!'.format(checkpoint)) + flownet_saver.restore(sess, checkpoint) diff --git a/Codes/models/download_pretrains.sh b/Codes/models/download_pretrains.sh new file mode 100644 index 0000000..08e58ec --- /dev/null +++ b/Codes/models/download_pretrains.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Downloading trained models on ped1, ped2 and avenue datasets ....." + +wget "https://ofhz9a.bn.files.1drv.com/y4mHfGdUxGoa7NnnI-eIlTqInymvmHyDOSGGw5zKM08jOGukHKdYdxmtZiEEh-rCAWK7oTDTstQ5bKazvjdyTtsIUW7zxcKnVgIsgZg6DpEb-Qdq83Zmnnw6nv7pX5HhiOkMxc42CLl65QK0A2Mv1Cmj-062Pyodm-Mt5r24Id3_glS0NT6BdvAp7-VbevkXygnmXQrcXRQU6d0y1cHlZJ2ig/pretrains.tar.gz" +tar -xvf pretrains.tar.gz +rm pretrains.tar.gz + +echo "Download pretrains successfully..." + + diff --git a/Codes/pix2pix.py b/Codes/pix2pix.py new file mode 100644 index 0000000..941c8fc --- /dev/null +++ b/Codes/pix2pix.py @@ -0,0 +1,274 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""Implementation of the Image-to-Image Translation model. +This network represents a port of the following work: + Image-to-Image Translation with Conditional Adversarial Networks + Phillip Isola, Jun-Yan Zhu, Tinghui Zhou and Alexei A. Efros + Arxiv, 2017 + https://phillipi.github.io/pix2pix/ +A reference implementation written in Lua can be found at: +https://github.com/phillipi/pix2pix/blob/master/models.lua +""" +import collections +import functools + +import tensorflow as tf + +layers = tf.contrib.layers + + +def pix2pix_arg_scope(): + """Returns a default argument scope for isola_net. + Returns: + An arg scope. + """ + # These parameters come from the online port, which don't necessarily match + # those in the paper. + # TODO(nsilberman): confirm these values with Philip. + instance_norm_params = { + 'center': True, + 'scale': True, + 'epsilon': 0.00001, + } + + with tf.contrib.framework.arg_scope( + [layers.conv2d, layers.conv2d_transpose], + normalizer_fn=layers.instance_norm, + normalizer_params=instance_norm_params, + weights_initializer=tf.random_normal_initializer(0, 0.02)) as sc: + return sc + + +def upsample(net, num_outputs, kernel_size, method='nn_upsample_conv'): + """Upsamples the given inputs. + Args: + net: A `Tensor` of size [batch_size, height, width, filters]. + num_outputs: The number of output filters. + kernel_size: A list of 2 scalars or a 1x2 `Tensor` indicating the scale, + relative to the inputs, of the output dimensions. For example, if kernel + size is [2, 3], then the output height and width will be twice and three + times the input size. + method: The upsampling method. + Returns: + An `Tensor` which was upsampled using the specified method. + Raises: + ValueError: if `method` is not recognized. + """ + net_shape = tf.shape(net) + height = net_shape[1] + width = net_shape[2] + + if method == 'nn_upsample_conv': + net = tf.image.resize_nearest_neighbor( + net, [kernel_size[0] * height, kernel_size[1] * width]) + net = layers.conv2d(net, num_outputs, [4, 4], activation_fn=None) + elif method == 'conv2d_transpose': + net = layers.conv2d_transpose( + net, num_outputs, [4, 4], stride=kernel_size, activation_fn=None) + else: + raise ValueError('Unknown method: [%s]', method) + + return net + + +class Block( + collections.namedtuple('Block', ['num_filters', 'decoder_keep_prob'])): + """Represents a single block of encoder and decoder processing. + The Image-to-Image translation paper works a bit differently than the original + U-Net model. In particular, each block represents a single operation in the + encoder which is concatenated with the corresponding decoder representation. + A dropout layer follows the concatenation and convolution of the concatenated + features. + """ + pass + + +def _default_generator_blocks(): + """Returns the default generator block definitions. + Returns: + A list of generator blocks. 
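+  Each Block(num_filters, decoder_keep_prob) specifies the filter count of one
+  encoder stage and the dropout keep probability applied at the mirrored decoder
+  stage (a keep probability of 0 disables dropout for that stage).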
+ """ + return [ + Block(64, 0.5), + Block(128, 0.5), + Block(256, 0.5), + Block(512, 0), + Block(512, 0), + Block(512, 0), + Block(512, 0), + ] + + +def pix2pix_generator(net, + num_outputs, + blocks=None, + upsample_method='nn_upsample_conv', + is_training=False): # pylint: disable=unused-argument + """Defines the network architecture. + Args: + net: A `Tensor` of size [batch, height, width, channels]. Note that the + generator currently requires square inputs (e.g. height=width). + num_outputs: The number of (per-pixel) outputs. + blocks: A list of generator blocks or `None` to use the default generator + definition. + upsample_method: The method of upsampling images, one of 'nn_upsample_conv' + or 'conv2d_transpose' + is_training: Whether or not we're in training or testing mode. + Returns: + A `Tensor` representing the model output and a dictionary of model end + points. + Raises: + ValueError: if the input heights do not match their widths. + """ + end_points = {} + + blocks = blocks or _default_generator_blocks() + + input_size = net.get_shape().as_list() + height, width = input_size[1], input_size[2] + if height != width: + raise ValueError('The input height must match the input width.') + + input_size[3] = num_outputs + + upsample_fn = functools.partial(upsample, method=upsample_method) + + encoder_activations = [] + + ########### + # Encoder # + ########### + with tf.variable_scope('encoder'): + with tf.contrib.framework.arg_scope( + [layers.conv2d], + kernel_size=[4, 4], + stride=2, + activation_fn=tf.nn.leaky_relu): + + for block_id, block in enumerate(blocks): + # No normalizer for the first encoder layers as per 'Image-to-Image', + # Section 5.1.1 + if block_id == 0: + # First layer doesn't use normalizer_fn + net = layers.conv2d(net, block.num_filters, normalizer_fn=None) + elif block_id < len(blocks) - 1: + net = layers.conv2d(net, block.num_filters) + else: + # Last layer doesn't use activation_fn nor normalizer_fn + net = layers.conv2d( + net, block.num_filters, activation_fn=None, normalizer_fn=None) + + encoder_activations.append(net) + end_points['encoder%d' % block_id] = net + + ########### + # Decoder # + ########### + reversed_blocks = list(blocks) + reversed_blocks.reverse() + + with tf.variable_scope('decoder'): + # Dropout is used at both train and test time as per 'Image-to-Image', + # Section 2.1 (last paragraph). + with tf.contrib.framework.arg_scope([layers.dropout], is_training=is_training): + + for block_id, block in enumerate(reversed_blocks): + if block_id > 0: + net = tf.concat([net, encoder_activations[-block_id - 1]], axis=3) + + # The Relu comes BEFORE the upsample op: + net = tf.nn.relu(net) + net = upsample_fn(net, block.num_filters, [2, 2]) + if block.decoder_keep_prob > 0: + net = layers.dropout(net, keep_prob=block.decoder_keep_prob) + end_points['decoder%d' % block_id] = net + + with tf.variable_scope('output'): + logits = layers.conv2d(net, num_outputs, [4, 4], activation_fn=None) + # print(logits) + # logits = tf.reshape(logits, input_size) + + end_points['logits'] = logits + end_points['predictions'] = tf.tanh(logits) + + return logits, end_points + + +def pix2pix_discriminator(net, num_filters, padding=2, is_training=False): + """Creates the Image2Image Translation Discriminator. + Args: + net: A `Tensor` of size [batch_size, height, width, channels] representing + the input. + num_filters: A list of the filters in the discriminator. The length of the + list determines the number of layers in the discriminator. 
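+      (The `discriminator` wrapper in Codes/models.py passes (128, 256, 512, 512)
+      here by default.)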
+ padding: Amount of reflection padding applied before each convolution. + is_training: Whether or not the model is training or testing. + Returns: + A logits `Tensor` of size [batch_size, N, N, 1] where N is the number of + 'patches' we're attempting to discriminate and a dictionary of model end + points. + """ + del is_training + end_points = {} + + num_layers = len(num_filters) + + def padded(net, scope): + if padding: + with tf.variable_scope(scope): + spatial_pad = tf.constant( + [[0, 0], [padding, padding], [padding, padding], [0, 0]], + dtype=tf.int32) + return tf.pad(net, spatial_pad, 'REFLECT') + else: + return net + + with tf.contrib.framework.arg_scope( + [layers.conv2d], + kernel_size=[4, 4], + stride=2, + padding='valid', + activation_fn=tf.nn.leaky_relu): + + # No normalization on the input layer. + net = layers.conv2d( + padded(net, 'conv0'), num_filters[0], normalizer_fn=None, scope='conv0') + + end_points['conv0'] = net + + for i in range(1, num_layers - 1): + net = layers.conv2d( + padded(net, 'conv%d' % i), num_filters[i], scope='conv%d' % i) + end_points['conv%d' % i] = net + + # Stride 1 on the last layer. + net = layers.conv2d( + padded(net, 'conv%d' % (num_layers - 1)), + num_filters[-1], + stride=1, + scope='conv%d' % (num_layers - 1)) + end_points['conv%d' % (num_layers - 1)] = net + + # 1-dim logits, stride 1, no activation, no normalization. + logits = layers.conv2d( + padded(net, 'conv%d' % num_layers), + 1, + stride=1, + activation_fn=None, + normalizer_fn=None, + scope='conv%d' % num_layers) + end_points['logits'] = logits + end_points['predictions'] = tf.sigmoid(logits) + return logits, end_points diff --git a/Codes/requirements.txt b/Codes/requirements.txt new file mode 100644 index 0000000..91d2206 --- /dev/null +++ b/Codes/requirements.txt @@ -0,0 +1,9 @@ +numpy==1.14.1 +scipy==1.0.0 +matplotlib==2.1.2 +tensorflow==1.4.1 +tensorflow_gpu==1.4.1 +Pillow==5.0.0 +pypng==0.0.18 +scikit_learn==0.19.1 +opencv-python==3.2.0.6 diff --git a/Codes/runner.sh b/Codes/runner.sh new file mode 100644 index 0000000..f0b545f --- /dev/null +++ b/Codes/runner.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +/home/liuwen/ssd/datasets/ped2/training/frames +/home/liuwen/ssd/datasets/ped2/testing/frames + +python train.py --dataset ped2 \ + --train_folder ../Data/ped2/training/frames \ + --test_folder ../Data/ped2/testing/frames \ + --gpu 0 \ + --iters 80000 + + +python inference.py --dataset ped2 \ + --test_folder /home/liuwen/ssd/datasets/ped2/testing/frames \ + --gpu 3 \ + --snapshot_dir models/pretrains/ped2 + + +python train.py --dataset avenue \ + --train_folder ../Data/avenue/training/frames \ + --test_folder ../Data/avenue/testing/frames \ + --gpu 2 \ + --iters 80000 + +python inference.py --dataset avenue \ + --test_folder ../Data/avenue/testing/frames \ + --gpu 3 + + +python train.py --dataset ped1 \ + --train_folder ../Data/ped1/training/frames \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 2 \ + --iters 80000 + +python inference.py --dataset ped1 \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 3 + +python train.py --dataset ped1 \ + --train_folder ../Data/ped1/training/frames \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 0 \ + --iters 80000 \ + --config training_hyper_params/hyper_params_lp_0.ini + +python inference.py --dataset ped1 \ + --test_folder ../Data/ped1/testing/frames \ + --gpu 1 \ + --config training_hyper_params/hyper_params_lp_0.ini + + +python inference.py --dataset ped2 \ + --test_folder /home/liuwen/ssd/datasets/ped2/testing/frames 
\ + --gpu 1 \ + --snapshot_dir models/pretrains/ped2
\ No newline at end of file diff --git a/Codes/train.py b/Codes/train.py new file mode 100644 index 0000000..42a8fc9 --- /dev/null +++ b/Codes/train.py @@ -0,0 +1,215 @@ +import tensorflow as tf +import os + +from models import generator, discriminator, flownet, initialize_flownet +from loss_functions import intensity_loss, gradient_loss +from utils import DataLoader, load, save, psnr_error +from constant import const + + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = const.GPU + +dataset_name = const.DATASET +train_folder = const.TRAIN_FOLDER +test_folder = const.TEST_FOLDER + +batch_size = const.BATCH_SIZE +iterations = const.ITERATIONS +num_his = const.NUM_HIS +height, width = 256, 256 +flow_height, flow_width = const.FLOW_HEIGHT, const.FLOW_WIDTH + +l_num = const.L_NUM +alpha_num = const.ALPHA_NUM +lam_lp = const.LAM_LP +lam_gdl = const.LAM_GDL +lam_adv = const.LAM_ADV +lam_flow = const.LAM_FLOW +adversarial = (lam_adv != 0) + +summary_dir = const.SUMMARY_DIR +snapshot_dir = const.SNAPSHOT_DIR + + +print(const) + +# define dataset +with tf.name_scope('dataset'): + train_loader = DataLoader(train_folder, resize_height=height, resize_width=width) + train_dataset = train_loader(batch_size=batch_size, time_steps=num_his, num_pred=1) + + train_it = train_dataset.make_one_shot_iterator() + train_videos_clips_tensor = train_it.get_next() + train_videos_clips_tensor.set_shape([batch_size, height, width, 3*(num_his + 1)]) + + train_inputs = train_videos_clips_tensor[..., 0:num_his*3] + train_gt = train_videos_clips_tensor[..., -3:] + + print('train inputs = {}'.format(train_inputs)) + print('train prediction gt = {}'.format(train_gt)) + + test_loader = DataLoader(test_folder, resize_height=height, resize_width=width) + test_dataset = test_loader(batch_size=batch_size, time_steps=num_his, num_pred=1) + test_it = test_dataset.make_one_shot_iterator() + test_videos_clips_tensor = test_it.get_next() + test_videos_clips_tensor.set_shape([batch_size, height, width, 3*(num_his + 1)]) + + test_inputs = test_videos_clips_tensor[..., 0:num_his*3] + test_gt = test_videos_clips_tensor[..., -3:] + + print('test inputs = {}'.format(test_inputs)) + print('test prediction gt = {}'.format(test_gt)) + +# define training generator function +with tf.variable_scope('generator', reuse=None): + print('training = {}'.format(tf.get_variable_scope().name)) + train_outputs = generator(train_inputs, layers=4, output_channel=3) + train_psnr_error = psnr_error(gen_frames=train_outputs, gt_frames=train_gt) + +# define testing generator function +with tf.variable_scope('generator', reuse=True): + print('testing = {}'.format(tf.get_variable_scope().name)) + test_outputs = generator(test_inputs, layers=4, output_channel=3) + test_psnr_error = psnr_error(gen_frames=test_outputs, gt_frames=test_gt) + + +# define intensity loss +if lam_lp != 0: + lp_loss = intensity_loss(gen_frames=train_outputs, gt_frames=train_gt, l_num=l_num) +else: + lp_loss = tf.constant(0.0, dtype=tf.float32) + + +# define gdl loss +if lam_gdl != 0: + gdl_loss = gradient_loss(gen_frames=train_outputs, gt_frames=train_gt, alpha=alpha_num) +else: + gdl_loss = tf.constant(0.0, dtype=tf.float32) + + +# define flow loss +if lam_flow != 0: + train_gt_flow = flownet(input_a=train_inputs[..., -3:], input_b=train_gt, + height=flow_height, width=flow_width, reuse=None) + train_pred_flow = flownet(input_a=train_inputs[..., -3:], input_b=train_outputs, + height=flow_height, width=flow_width, reuse=True) + flow_loss = 
tf.reduce_mean(tf.abs(train_gt_flow - train_pred_flow)) +else: + flow_loss = tf.constant(0.0, dtype=tf.float32) + + +# define adversarial loss +if adversarial: + with tf.variable_scope('discriminator', reuse=None): + real_logits, real_outputs = discriminator(inputs=train_gt) + with tf.variable_scope('discriminator', reuse=True): + fake_logits, fake_outputs = discriminator(inputs=train_outputs) + + print('real_outputs = {}'.format(real_outputs)) + print('fake_outputs = {}'.format(fake_outputs)) + + adv_loss = tf.reduce_mean(tf.square(fake_outputs - 1) / 2) + dis_loss = tf.reduce_mean(tf.square(real_outputs - 1) / 2) + tf.reduce_mean(tf.square(fake_outputs) / 2) +else: + adv_loss = tf.constant(0.0, dtype=tf.float32) + dis_loss = tf.constant(0.0, dtype=tf.float32) + + +with tf.name_scope('training'): + g_loss = tf.add_n([lp_loss * lam_lp, gdl_loss * lam_gdl, adv_loss * lam_adv, flow_loss * lam_flow], name='g_loss') + + g_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='g_step') + g_lrate = tf.train.piecewise_constant(g_step, boundaries=const.LRATE_G_BOUNDARIES, values=const.LRATE_G) + g_optimizer = tf.train.AdamOptimizer(learning_rate=g_lrate, name='g_optimizer') + g_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator') + + g_train_op = g_optimizer.minimize(g_loss, global_step=g_step, var_list=g_vars, name='g_train_op') + + if adversarial: + # training discriminator + d_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='d_step') + d_lrate = tf.train.piecewise_constant(d_step, boundaries=const.LRATE_D_BOUNDARIES, values=const.LRATE_D) + d_optimizer = tf.train.AdamOptimizer(learning_rate=d_lrate, name='g_optimizer') + d_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator') + + d_train_op = d_optimizer.minimize(dis_loss, global_step=d_step, var_list=d_vars, name='d_optimizer') + else: + d_step = None + d_lrate = None + d_train_op = None + +# add all to summaries +tf.summary.scalar(tensor=train_psnr_error, name='train_psnr_error') +tf.summary.scalar(tensor=test_psnr_error, name='test_psnr_error') +tf.summary.scalar(tensor=g_loss, name='g_loss') +tf.summary.scalar(tensor=adv_loss, name='adv_loss') +tf.summary.scalar(tensor=dis_loss, name='dis_loss') +tf.summary.image(tensor=train_outputs, name='train_outputs') +tf.summary.image(tensor=train_gt, name='train_gt') +tf.summary.image(tensor=test_outputs, name='test_outputs') +tf.summary.image(tensor=test_gt, name='test_gt') +summary_op = tf.summary.merge_all() + +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +with tf.Session(config=config) as sess: + # summaries + summary_writer = tf.summary.FileWriter(summary_dir, graph=sess.graph) + + # initialize weights + sess.run(tf.global_variables_initializer()) + print('Init successfully!') + + if lam_flow != 0: + # initialize flownet + initialize_flownet(sess, const.FLOWNET_CHECKPOINT) + + # tf saver + saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=None) + restore_var = [v for v in tf.global_variables()] + loader = tf.train.Saver(var_list=restore_var) + if os.path.isdir(snapshot_dir): + ckpt = tf.train.get_checkpoint_state(snapshot_dir) + if ckpt and ckpt.model_checkpoint_path: + load(loader, sess, ckpt.model_checkpoint_path) + else: + print('No checkpoint file found.') + else: + load(loader, sess, snapshot_dir) + + _step, _loss, _summaries = 0, None, None + while _step < iterations: + try: + if adversarial: + print('Training discriminator...') + _, _d_lr, _d_step, _dis_loss = 
sess.run([d_train_op, d_lrate, d_step, dis_loss]) + else: + _d_step = 0 + _d_lr = 0 + _dis_loss = 0 + + print('Training generator...') + _, _g_lr, _step, _lp_loss, _gdl_loss, _adv_loss, _flow_loss, _g_loss, _train_psnr, _summaries = sess.run( + [g_train_op, g_lrate, g_step, lp_loss, gdl_loss, adv_loss, flow_loss, g_loss, train_psnr_error, summary_op]) + + if _step % 10 == 0: + print('DiscriminatorModel: Step {} | Global Loss: {:.6f}, lr = {:.6f}'.format(_d_step, _dis_loss, _d_lr)) + print('GeneratorModel : Step {}, lr = {:.6f}'.format(_step, _g_lr)) + print(' Global Loss : ', _g_loss) + print(' intensity Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_lp_loss, lam_lp, _lp_loss * lam_lp)) + print(' gradient Loss : ({:.4f} * {:.4f} = {:.4f})'.format( _gdl_loss, lam_gdl, _gdl_loss * lam_gdl)) + print(' adversarial Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_adv_loss, lam_adv, _adv_loss * lam_adv)) + print(' flownet Loss : ({:.4f} * {:.4f} = {:.4f})'.format(_flow_loss, lam_flow, _flow_loss * lam_flow)) + print(' PSNR Error : ', _train_psnr) + if _step % 100 == 0: + summary_writer.add_summary(_summaries, global_step=_step) + print('Save summaries...') + + if _step % 1000 == 0: + save(saver, sess, snapshot_dir, _step) + + except tf.errors.OutOfRangeError: + print('Finish successfully!') + save(saver, sess, snapshot_dir, _step) + break diff --git a/Codes/training_hyper_params/hyper_params.ini b/Codes/training_hyper_params/hyper_params.ini new file mode 100644 index 0000000..99dbf00 --- /dev/null +++ b/Codes/training_hyper_params/hyper_params.ini @@ -0,0 +1,103 @@ +[ped2] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [7000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [7000] + +[ped1] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 0.01 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [40000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [40000] + + +[avenue] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss, +# we found in smaller lp is slightly better in avenue, but not too much difference. +LAM_LP = 0 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0002, 0.00002] +LRATE_G_BOUNDARIES = [100000] + +LRATE_D = [0.00002, 0.000002] +LRATE_D_BOUNDARIES = [100000] + + +[shanghaitech] +# for lp loss. 
e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0002, 0.00002] +LRATE_G_BOUNDARIES = [50000] + +LRATE_D = [0.00002, 0.000002] +LRATE_D_BOUNDARIES = [50000] + + +[toydata] +# for lp loss. e.g, 1 or 2 for l1 and l2 loss, respectively) +L_NUM = 2 +# the power to which each gradient term is raised in GDL loss +ALPHA_NUM = 1 +# the percentage of the adversarial loss to use in the combined loss +LAM_ADV = 0.05 +# the percentage of the lp loss to use in the combined loss +LAM_LP = 1 +# the percentage of the GDL loss to use in the combined loss +LAM_GDL = 1 +# the percentage of the different frame loss +LAM_FLOW = 2 + +LRATE_G = [0.0001, 0.00001] +LRATE_G_BOUNDARIES = [7000] + +LRATE_D = [0.00001, 0.000001] +LRATE_D_BOUNDARIES = [7000] diff --git a/Codes/unet.py b/Codes/unet.py new file mode 100644 index 0000000..ac4c6aa --- /dev/null +++ b/Codes/unet.py @@ -0,0 +1,42 @@ +import tensorflow as tf +from tensorflow.contrib.layers import conv2d, max_pool2d, conv2d_transpose + + +def unet(inputs, layers, features_root=64, filter_size=3, pool_size=2, output_channel=1): + """ + :param inputs: input tensor, shape[None, height, width, channel] + :param layers: number of layers + :param features_root: number of features in the first layer + :param filter_size: size of each conv layer + :param pool_size: size of each max pooling layer + :param output_channel: number of channel for output tensor + :return: a tensor, shape[None, height, width, output_channel] + """ + + in_node = inputs + conv = [] + for layer in range(0, layers): + features = 2**layer*features_root + + conv1 = conv2d(inputs=in_node, num_outputs=features, kernel_size=filter_size) + conv2 = conv2d(inputs=conv1, num_outputs=features, kernel_size=filter_size) + conv.append(conv2) + + if layer < layers - 1: + in_node = max_pool2d(inputs=conv2, kernel_size=pool_size, padding='SAME') + # in_node = conv2d(inputs=conv2, num_outputs=features, kernel_size=filter_size, stride=2) + + in_node = conv[-1] + + for layer in range(layers-2, -1, -1): + features = 2**(layer+1)*features_root + + h_deconv = conv2d_transpose(inputs=in_node, num_outputs=features//2, kernel_size=pool_size, stride=pool_size) + h_deconv_concat = tf.concat([conv[layer], h_deconv], axis=3) + + conv1 = conv2d(inputs=h_deconv_concat, num_outputs=features//2, kernel_size=filter_size) + in_node = conv2d(inputs=conv1, num_outputs=features//2, kernel_size=filter_size) + + output = conv2d(inputs=in_node, num_outputs=output_channel, kernel_size=filter_size, activation_fn=None) + output = tf.tanh(output) + return output diff --git a/Codes/utils.py b/Codes/utils.py new file mode 100644 index 0000000..efeab8e --- /dev/null +++ b/Codes/utils.py @@ -0,0 +1,227 @@ +import tensorflow as tf +import numpy as np +from collections import OrderedDict +import os +import glob +import cv2 + + +rng = np.random.RandomState(2017) + + +def np_load_frame(filename, resize_height, resize_width): + image_decoded = cv2.imread(filename) + image_resized = cv2.resize(image_decoded, (resize_width, resize_height)) + image_resized = image_resized.astype(dtype=np.float32) + image_resized = (image_resized / 127.5) - 1.0 + return 
image_resized + + +class DataLoader(object): + def __init__(self, video_folder, resize_height=256, resize_width=256): + self.dir = video_folder + self.videos = {} + self._resize_height = resize_height + self._resize_width = resize_width + self.setup() + + def __call__(self, batch_size, time_steps, num_pred=1): + video_info_list = list(self.videos.values()) + num_videos = len(video_info_list) + + clip_length = time_steps + num_pred + resize_height, resize_width = self._resize_height, self._resize_width + + def video_clip_generator(): + v_id = -1 + while True: + v_id = (v_id + 1) % num_videos + + video_info = video_info_list[v_id] + start = rng.randint(0, video_info['length'] - clip_length) + video_clip = [] + for frame_id in range(start, start + clip_length): + video_clip.append(np_load_frame(video_info['frame'][frame_id], resize_height, resize_width)) + video_clip = np.concatenate(video_clip, axis=2) + + yield video_clip + + # video clip paths + dataset = tf.data.Dataset.from_generator(generator=video_clip_generator, + output_types=tf.float32, + output_shapes=[resize_height, resize_width, clip_length * 3]) + print('generator dataset, {}'.format(dataset)) + dataset = dataset.prefetch(buffer_size=1000) + dataset = dataset.shuffle(buffer_size=1000).batch(batch_size) + print('epoch dataset, {}'.format(dataset)) + + return dataset + + def __getitem__(self, video_name): + assert video_name in self.videos.keys(), 'video = {} is not in {}!'.format(video_name, self.videos.keys()) + return self.videos[video_name] + + def setup(self): + videos = glob.glob(os.path.join(self.dir, '*')) + for video in sorted(videos): + video_name = video.split('/')[-1] + self.videos[video_name] = {} + self.videos[video_name]['path'] = video + self.videos[video_name]['frame'] = glob.glob(os.path.join(video, '*.jpg')) + self.videos[video_name]['frame'].sort() + self.videos[video_name]['length'] = len(self.videos[video_name]['frame']) + + def get_video_clips(self, video, start, end): + # assert video in self.videos, 'video = {} must in {}!'.format(video, self.videos.keys()) + # assert start >= 0, 'start = {} must >=0!'.format(start) + # assert end <= self.videos[video]['length'], 'end = {} must <= {}'.format(video, self.videos[video]['length']) + + batch = [] + for i in range(start, end): + image = np_load_frame(self.videos[video]['frame'][i], self._resize_height, self._resize_width) + batch.append(image) + + return np.concatenate(batch, axis=2) + + # def get_video_clips(self, video_name, start, end): + # video_idx = np.arange(start, end) + # video_clip = np.empty(shape=[self._resize_height, self._resize_height, 3*len(video_idx)], dtype=np.float32) + # for idx, v_idx in enumerate(video_idx): + # filename = self.videos[video_name]['frame'][v_idx] + # video_clip[..., idx*3:(idx+1)*3] = np_load_frame(filename, self._resize_height, self._resize_width) + # + # return video_clip + + +def log10(t): + """ + Calculates the base-10 log of each element in t. + + @param t: The tensor from which to calculate the base-10 log. + + @return: A tensor with the base-10 log of each element in t. + """ + + numerator = tf.log(t) + denominator = tf.log(tf.constant(10, dtype=numerator.dtype)) + return numerator / denominator + + +def psnr_error(gen_frames, gt_frames): + """ + Computes the Peak Signal to Noise Ratio error between the generated images and the ground + truth images. + + @param gen_frames: A tensor of shape [batch_size, height, width, 3]. The frames generated by the + generator model. 
+ @param gt_frames: A tensor of shape [batch_size, height, width, 3]. The ground-truth frames for + each frame in gen_frames. + + @return: A scalar tensor. The mean Peak Signal to Noise Ratio error over each frame in the + batch. + """ + shape = tf.shape(gen_frames) + num_pixels = tf.to_float(shape[1] * shape[2] * shape[3]) + gt_frames = (gt_frames + 1.0) / 2.0 + gen_frames = (gen_frames + 1.0) / 2.0 + square_diff = tf.square(gt_frames - gen_frames) + + batch_errors = 10 * log10(1 / ((1 / num_pixels) * tf.reduce_sum(square_diff, [1, 2, 3]))) + return tf.reduce_mean(batch_errors) + + +def sharp_diff_error(gen_frames, gt_frames, channels=3): + """ + Computes the Sharpness Difference error between the generated images and the ground truth + images. + + @param gen_frames: A tensor of shape [batch_size, height, width, 3]. The frames generated by the + generator model. + @param gt_frames: A tensor of shape [batch_size, height, width, 3]. The ground-truth frames for + each frame in gen_frames. + @param channels: The number of channels, 3 is RGB and 1 is Gray, default is 3. + + @return: A scalar tensor. The Sharpness Difference error over each frame in the batch. + """ + shape = tf.shape(gen_frames) + num_pixels = tf.to_float(shape[1] * shape[2] * shape[3]) + + # gradient difference + # create filters [-1, 1] and [[1],[-1]] for diffing to the left and down respectively. + # TODO: Could this be simplified with one filter [[-1, 2], [0, -1]]? + pos = tf.constant(np.identity(channels), dtype=tf.float32) + neg = -1 * pos + filter_x = tf.expand_dims(tf.stack([neg, pos]), 0) # [-1, 1] + filter_y = tf.stack([tf.expand_dims(pos, 0), tf.expand_dims(neg, 0)]) # [[1],[-1]] + strides = [1, 1, 1, 1] # stride of (1, 1) + padding = 'SAME' + + gen_dx = tf.abs(tf.nn.conv2d(gen_frames, filter_x, strides, padding=padding)) + gen_dy = tf.abs(tf.nn.conv2d(gen_frames, filter_y, strides, padding=padding)) + gt_dx = tf.abs(tf.nn.conv2d(gt_frames, filter_x, strides, padding=padding)) + gt_dy = tf.abs(tf.nn.conv2d(gt_frames, filter_y, strides, padding=padding)) + + gen_grad_sum = gen_dx + gen_dy + gt_grad_sum = gt_dx + gt_dy + + grad_diff = tf.abs(gt_grad_sum - gen_grad_sum) + + batch_errors = 10 * log10(1 / ((1 / num_pixels) * tf.reduce_sum(grad_diff, [1, 2, 3]))) + return tf.reduce_mean(batch_errors) + + +def diff_mask(gen_frames, gt_frames, min_value=-1, max_value=1): + # normalize to [0, 1] + delta = max_value - min_value + gen_frames = (gen_frames - min_value) / delta + gt_frames = (gt_frames - min_value) / delta + + gen_gray_frames = tf.image.rgb_to_grayscale(gen_frames) + gt_gray_frames = tf.image.rgb_to_grayscale(gt_frames) + + diff = tf.abs(gen_gray_frames - gt_gray_frames) + return diff + + +def load(saver, sess, ckpt_path): + saver.restore(sess, ckpt_path) + print("Restored model parameters from {}".format(ckpt_path)) + + +def save(saver, sess, logdir, step): + model_name = 'model.ckpt' + checkpoint_path = os.path.join(logdir, model_name) + if not os.path.exists(logdir): + os.makedirs(logdir) + saver.save(sess, checkpoint_path, global_step=step) + print('The checkpoint has been created.') + + +# if __name__ == '__main__': +# os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +# os.environ['CUDA_VISIBLE_DEVICES'] = '0' +# +# data_loader = DataLoader('/home/liuwen/ssd/datasets/avenue/training/frames') +# dataset, epoch_size = data_loader(10, 4, 1, 3, 1) +# +# # debug +# iteration = dataset.make_one_shot_iterator() +# batch_video_clip_tensor = iteration.get_next() +# +# config = tf.ConfigProto() +# 
config.gpu_options.allow_growth = True +# with tf.Session(config=config) as sess: +# # batch_video_clip = sess.run(next(it)) +# +# for i in range(100): +# batch_video_clip = sess.run(batch_video_clip_tensor) +# # print(batch_video_clip.shape) +# +# for vid, video_clip in enumerate(batch_video_clip): +# for fid, frame in enumerate(video_clip): +# print(i, vid, fid) +# cv2.imshow('visualization', frame + 0.5) +# cv2.waitKey(100) + + + diff --git a/Data/avenue.sh b/Data/avenue.sh new file mode 100644 index 0000000..ca3bac3 --- /dev/null +++ b/Data/avenue.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Downloading CUHK-Avenue dataset....." + +wget "https://ofhz9a.bn.files.1drv.com/y4mS8bbrfeD7Urmn0OWASYUcfrVLCTgcCwEBsTShdkWrrbfXTGLbKKMrT6KR94Nr9-DaFv1DBftJKqCzlCzG5phbgAPOy9V84BDtgzFceJpt0xwZstgPw_pZQR_E8jwmiw9QwhjMronyh2Yiy84huUbPEtFL6wt0TaN9KPQedwAMWaipj4w4di42BHwos5ESM5HZcim3Ng4xz5SPyN3btgrzg/avenue.tar.gz" +tar -xvf avenue.tar.gz +rm avenue.tar.gz + +echo "Download CUHK-Avenue successfully..." + + diff --git a/Data/ped1.sh b/Data/ped1.sh new file mode 100644 index 0000000..eb995eb --- /dev/null +++ b/Data/ped1.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Downloading UCSD-Ped1 dataset....." + +wget "https://ofhz9a.bn.files.1drv.com/y4m5ZrdN62Hy303ATx2p6Cogia4Ewpwnye8HgJ7qgWdFJ6gaNKGErugal2lfpdr2h65rjArbhrID9mxfSIf2WfXvh9AJf40xwcEWxEAuTp_-gSkfyLAt4Ef7xkJko4InRzUJz-3bdvV77dmBuYSl9LljLyP6908E4EyvPEkMI3pHrNP5QmiJSsHN6jFwtOHpZuUG8UeJGpqb-TwKjNxFrEpjQ/ped1.tar.gz" +tar -xvf ped1.tar.gz +rm ped1.tar.gz + +echo "Download UCSD-Ped1 successfully..." + + diff --git a/Data/ped2.sh b/Data/ped2.sh new file mode 100644 index 0000000..70a9cd7 --- /dev/null +++ b/Data/ped2.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Downloading UCSD-Ped2 dataset....." + +wget "https://ofhz9a.bn.files.1drv.com/y4mnifbgr-4ZbLb7e0dvvIoiaKFz5BdUTKRegB_vYHMYDO-BIDM0PYjIQupSNbSLFVjaZGfY9VPKS2ID5BooAqnlM5W4cmnrzsnflicxYq1H5Ne__ko4dNvrvijr4dXwJNzA0wBRN9evE0bUkct-u5VfY6pvcWtPNIPUm2NgeXpC9XFmWKG7oXL7b1-H11-C1hyho2BmWXpMqPDwo6cFqtZKA/ped2.tar.gz" +tar -xvf ped2.tar.gz +rm ped2.tar.gz + +echo "Download UCSD-Ped2 successfully..." + diff --git a/Data/shanghaitech.sh b/Data/shanghaitech.sh new file mode 100644 index 0000000..92639d5 --- /dev/null +++ b/Data/shanghaitech.sh @@ -0,0 +1,8 @@ +#!/bin/bash +echo "download ShanghaiTech-Campus dataset....." + +wget "https://ofhz9a.bn.files.1drv.com/y4mavKKJgKjjUPr3CnqA6u-xYxU3DIwYPAhAVv5UhQpf82uT31Ueljk3qxkPlcwlCuc0oSLhb5RfDN_vJKv3qvyOAoKP1NFNq3A6xiAtcYR0F2Xm2AXxWEabD-yPR49bwHMGWKPKItSiw_bPhvrretOmPf9QqxEoc7TqrN0A8ZGHwl5ASdtT2n2e3TwZyMIRQfSCQ0yfDZKfml7WM2so9G6Nw/shanghaitech.tar.gz" +tar -xvf shanghaitech.tar.gz +rm shanghaitech.tar.gz + +echo "download ShanghaiTech-Campus successfully..."
\ No newline at end of file
diff --git a/README.md b/README.md new file mode 100644 index 0000000..7985c76 --- /dev/null +++ b/README.md @@ -0,0 +1,130 @@
+# Future Frame Prediction for Anomaly Detection -- A New Baseline
+This repo is the official open-source implementation of [Future Frame Prediction for Anomaly Detection -- A New Baseline, CVPR 2018](https://arxiv.org/pdf/1712.09867.pdf) by Wen Liu, Weixin Luo, Dongze Lian and Shenghua Gao.
+A **demo** is available at *https://www.youtube.com/watch?v=M--wv-Y_h0A*. The code is implemented in TensorFlow. Please follow the instructions below to run it.
+
+#### 1. Installation (Anaconda with Python 3.6 is recommended)
+* Install the 3rd-party Python dependencies (listed in requirements.txt)
+```
+numpy==1.14.1
+scipy==1.0.0
+matplotlib==2.1.2
+tensorflow_gpu==1.4.1
+tensorflow==1.4.1
+Pillow==5.0.0
+pypng==0.0.18
+scikit_learn==0.19.1
+opencv-python==3.2.0.6
+```
+
+```shell
+pip install -r requirements.txt
+```
+* Other libraries
+```code
+CUDA 8.0
+cuDNN 6.0
+```
+#### 2. Download datasets
+cd into the **Data** folder of the project and run the shell scripts (**ped1.sh, ped2.sh, avenue.sh, shanghaitech.sh**) under that folder.
+```shell
+cd Data
+./ped2.sh
+./ped1.sh
+./avenue.sh
+./shanghaitech.sh
+```
+
+#### 3. Testing on saved models
+* Download the trained models
+```shell
+cd models
+./download_pretrains.sh
+```
+* cd into the **Codes** folder first, then run the script (ped2 and avenue are used as examples).
+```shell
+python inference.py --dataset ped2 \
+                    --test_folder ../Data/ped2/testing/frames \
+                    --gpu 1 \
+                    --snapshot_dir models/pretrains/ped2
+```
+
+```shell
+python inference.py --dataset avenue \
+                    --test_folder ../Data/avenue/testing/frames \
+                    --gpu 1 \
+                    --snapshot_dir models/pretrains/avenue
+```
+
+
+#### 4. Training from scratch (here we use ped2 and avenue datasets as examples)
+* Set hyper-parameters
+The default hyper-parameters, such as $\lambda_{int}$, $\lambda_{gd}$, $\lambda_{op}$, $\lambda_{adv}$ and the learning rates of G and D, are all initialized in **training_hyper_params/hyper_params.ini**.
+* cd into the **Codes** folder first, then run the training script (ped2 or avenue, for instance).
+```shell
+python train.py --dataset ped2 \
+                --train_folder ../Data/ped2/training/frames \
+                --test_folder ../Data/ped2/testing/frames \
+                --gpu 0 \
+                --iters 80000
+```
+* Model selection while training
+In order to do model selection, a common practice is to test the saved models after a number of iterations or epochs (since none of the above datasets provides a validation set, and in order to compare with other methods, we simply choose the best model on the testing set). Here, we can use another GPU to monitor the **snapshot_dir** folder: whenever a new model.ckpt-xxx arrives, it is loaded and tested, and finally we choose the best model. The script is as follows.
+```shell
+python inference.py --dataset ped2 \
+                    --test_folder ../Data/ped2/testing/frames \
+                    --gpu 1
+```
+Run **python train.py -h** to learn more about the flag options, or see the details in **constant.py**.
+```shell
+Options to run the network.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -g GPU, --gpu GPU     the device id of gpu.
+  -i ITERS, --iters ITERS
+                        set the number of iterations, default is 1
+  -b BATCH, --batch BATCH
+                        set the batch size, default is 4.
+  --num_his NUM_HIS     set the time steps, default is 4.
+  -d DATASET, --dataset DATASET
+                        the name of dataset.
+  --train_folder TRAIN_FOLDER
+                        set the training folder path.
+  --test_folder TEST_FOLDER
+                        set the testing folder path.
+  --config CONFIG       the path of training_hyper_params, default is
+                        training_hyper_params/hyper_params.ini
+  --snapshot_dir SNAPSHOT_DIR
+                        if it is folder, then it is the directory to save
+                        models, if it is a specific model.ckpt-xxx, then the
+                        system will load it for testing.
+  --summary_dir SUMMARY_DIR
+                        the directory to save summaries.
+  --psnr_dir PSNR_DIR   the directory to save psnrs results in testing.
+  --evaluate EVALUATE   the evaluation metric, default is compute_auc
+```
+* (Optional) TensorBoard visualization
+```shell
+tensorboard --logdir=./summary --port=10086
+```
+Open a browser and visit **http://ip:10086**. Below is a screenshot of avenue on TensorBoard.
+
+
+
+#### Notes
+The flow loss (temporal loss) module is based on [a TensorFlow implementation of FlowNet2](https://github.com/sampepose/flownet2-tf). Thanks for their nice work.
+#### Citation
+If you find this useful, please cite our work as follows:
+```code
+@article{liu2018ano_pred,
+Author = {Wen Liu and Weixin Luo and Dongze Lian and Shenghua Gao},
+Title = {Future Frame Prediction for Anomaly Detection -- A New Baseline},
+Journal = {ArXiv e-prints},
+Year = {2017},
+Eprint = {arXiv:1712.09867},
+}
+```
+Once the open-access CVPR 2018 version is available, you are welcome to cite the CVPR version instead.
+Please contact us if you have any questions.
diff --git a/assets/images.JPG b/assets/images.JPG Binary files differ new file mode 100644 index 0000000..e786864 --- /dev/null +++ b/assets/images.JPG
diff --git a/assets/scalars.JPG b/assets/scalars.JPG Binary files differ new file mode 100644 index 0000000..f5c6e5d --- /dev/null +++ b/assets/scalars.JPG
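For quick reference, the generator objective assembled in Codes/train.py weights four terms with the LAM_* values from training_hyper_params/hyper_params.ini. The snippet below is a minimal standalone sketch of that combination using hypothetical placeholder tensors (it is not part of the commit above); the real script feeds frames from the DataLoader and optical flow from FlowNetSD instead.

```python
import tensorflow as tf

from loss_functions import intensity_loss, gradient_loss

# Weights as in the [ped2] section of hyper_params.ini.
lam_lp, lam_gdl, lam_adv, lam_flow = 1.0, 1.0, 0.05, 2.0

# Hypothetical placeholders standing in for the generator output, the ground
# truth frame, the discriminator score of the prediction, and the difference
# between ground-truth and predicted optical flow.
gen = tf.placeholder(tf.float32, [4, 256, 256, 3])
gt = tf.placeholder(tf.float32, [4, 256, 256, 3])
fake_outputs = tf.placeholder(tf.float32, [4, 16, 16, 1])
flow_diff = tf.placeholder(tf.float32, [4, 384, 512, 2])

g_loss = tf.add_n([
    lam_lp * intensity_loss(gen, gt, l_num=2),                   # intensity (lp) term
    lam_gdl * gradient_loss(gen, gt, alpha=1),                   # gradient difference term
    lam_adv * tf.reduce_mean(tf.square(fake_outputs - 1) / 2),   # adversarial (least-squares) term
    lam_flow * tf.reduce_mean(tf.abs(flow_diff)),                # optical-flow term
], name='g_loss')
```

With the [ped2] weights, the intensity, gradient and flow terms dominate while the adversarial term is kept small (0.05); train.py builds exactly this weighted sum and minimizes it with Adam.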
