| field | value | date |
|---|---|---|
| author | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| committer | StevenLiuWen <liuwen@shanghaitech.edu.cn> | 2018-03-13 03:28:06 -0400 |
| commit | fede6ca1dd0077ff509d84bd24028cc7a93bb119 (patch) | |
| tree | af7f6e759b5dec4fc2964daed09e903958b919ed /Codes/flownet2/src | |
first commit
Diffstat (limited to 'Codes/flownet2/src')
62 files changed, 5848 insertions(+), 0 deletions(-)
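One usage note before the diff: each sub-network directory added in this commit (flownet_s, flownet_c, flownet_cs, flownet_css, flownet_sd, flownet2) ships its own `test.py` and `train.py` entry points. The sketch below mirrors what `src/flownet_c/test.py` does; the image paths are hypothetical placeholders, and it assumes the custom ops under `ops/build/` have already been compiled, the checkpoint exists at the path hard-coded in `test.py`, and `Codes/flownet2` is the working directory so the `src` package resolves.

```python
# Minimal sketch of the FlowNetC test entry point, based on src/flownet_c/test.py.
from src.net import Mode                      # assumes Codes/flownet2 is the working directory
from src.flownet_c.flownet_c import FlowNetC

net = FlowNetC(mode=Mode.TEST)
net.test(
    checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0',  # checkpoint path used by test.py
    input_a_path='frame_0.png',   # hypothetical first image
    input_b_path='frame_1.png',   # hypothetical second image
    out_path='./',                # existing directory for the predicted flow
)
```

Equivalently, under the same assumptions, the script can be run as a module, e.g. `python -m src.flownet_c.test --input_a <img0> --input_b <img1> --out ./`, since the test scripts use package-relative imports.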
diff --git a/Codes/flownet2/src/__init__.py b/Codes/flownet2/src/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/__init__.py diff --git a/Codes/flownet2/src/correlation.py b/Codes/flownet2/src/correlation.py new file mode 100644 index 0000000..60a5c37 --- /dev/null +++ b/Codes/flownet2/src/correlation.py @@ -0,0 +1,35 @@ +import tensorflow as tf + +_correlation_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/correlation.so")) + + +def correlation(input_a, input_b, kernel_size, max_displacement, stride_1, stride_2, padding): + return _correlation_ops.correlation(input_a, + input_b, + kernel_size, + max_displacement, + stride_1, + stride_2, + padding) + + +@tf.RegisterGradient("Correlation") +def _correlation_grad(corr_op, gradients): + kernel_size = corr_op.get_attr("kernel_size") + max_displacement = corr_op.get_attr("max_displacement") + stride_1 = corr_op.get_attr("stride_1") + stride_2 = corr_op.get_attr("stride_2") + pad = corr_op.get_attr("pad") + + corr_grads = _correlation_ops.correlation_grad(gradients, + corr_op.inputs[0], + corr_op.inputs[1], + kernel_size, + max_displacement, + stride_1, + stride_2, + pad) + + # Return the gradients with respect to input_a and input_b + return corr_grads.backprops_a, corr_grads.backprops_b diff --git a/Codes/flownet2/src/dataloader.py b/Codes/flownet2/src/dataloader.py new file mode 100644 index 0000000..22a6ddb --- /dev/null +++ b/Codes/flownet2/src/dataloader.py @@ -0,0 +1,329 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +import copy +slim = tf.contrib.slim + +_preprocessing_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/preprocessing.so")) + + +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +class Image(slim.tfexample_decoder.ItemHandler): + """An ItemHandler that decodes a parsed Tensor as an image.""" + + def __init__(self, + image_key=None, + format_key=None, + shape=None, + channels=3, + dtype=tf.uint8, + repeated=False): + """Initializes the image. + Args: + image_key: the name of the TF-Example feature in which the encoded image + is stored. + shape: the output shape of the image as 1-D `Tensor` + [height, width, channels]. If provided, the image is reshaped + accordingly. If left as None, no reshaping is done. A shape should + be supplied only if all the stored images have the same shape. + channels: the number of channels in the image. + dtype: images will be decoded at this bit depth. Different formats + support different bit depths. + See tf.image.decode_image, + tf.decode_raw, + repeated: if False, decodes a single image. If True, decodes a + variable number of image strings from a 1D tensor of strings. + """ + if not image_key: + image_key = 'image/encoded' + + super(Image, self).__init__([image_key]) + self._image_key = image_key + self._shape = shape + self._channels = channels + self._dtype = dtype + self._repeated = repeated + + def tensors_to_item(self, keys_to_tensors): + """See base class.""" + image_buffer = keys_to_tensors[self._image_key] + + if self._repeated: + return functional_ops.map_fn(lambda x: self._decode(x), + image_buffer, dtype=self._dtype) + else: + return self._decode(image_buffer) + + def _decode(self, image_buffer): + """Decodes the image buffer. + Args: + image_buffer: The tensor representing the encoded image tensor. 
+ Returns: + A tensor that represents decoded image of self._shape, or + (?, ?, self._channels) if self._shape is not specified. + """ + def decode_raw(): + """Decodes a raw image.""" + return tf.decode_raw(image_buffer, out_type=self._dtype) + + image = decode_raw() + # image.set_shape([None, None, self._channels]) + if self._shape is not None: + image = tf.reshape(image, self._shape) + + return image + + +def __get_dataset(dataset_config, split_name): + """ + dataset_config: A dataset_config defined in datasets.py + split_name: 'train'/'validate' + """ + with tf.name_scope('__get_dataset'): + if split_name not in dataset_config['SIZES']: + raise ValueError('split name %s not recognized' % split_name) + + IMAGE_HEIGHT, IMAGE_WIDTH = dataset_config['IMAGE_HEIGHT'], dataset_config['IMAGE_WIDTH'] + reader = tf.TFRecordReader + keys_to_features = { + 'image_a': tf.FixedLenFeature((), tf.string), + 'image_b': tf.FixedLenFeature((), tf.string), + 'flow': tf.FixedLenFeature((), tf.string), + } + items_to_handlers = { + 'image_a': Image( + image_key='image_a', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'image_b': Image( + image_key='image_b', + dtype=tf.float64, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 3], + channels=3), + 'flow': Image( + image_key='flow', + dtype=tf.float32, + shape=[IMAGE_HEIGHT, IMAGE_WIDTH, 2], + channels=2), + } + decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) + return slim.dataset.Dataset( + data_sources=dataset_config['PATHS'][split_name], + reader=reader, + decoder=decoder, + num_samples=dataset_config['SIZES'][split_name], + items_to_descriptions=dataset_config['ITEMS_TO_DESCRIPTIONS']) + + +def config_to_arrays(dataset_config): + output = { + 'name': [], + 'rand_type': [], + 'exp': [], + 'mean': [], + 'spread': [], + 'prob': [], + 'coeff_schedule': [], + } + config = copy.deepcopy(dataset_config) + + if 'coeff_schedule_param' in config: + del config['coeff_schedule_param'] + + # Get all attributes + for (name, value) in config.iteritems(): + if name == 'coeff_schedule_param': + output['coeff_schedule'] = [value['half_life'], + value['initial_coeff'], + value['final_coeff']] + else: + output['name'].append(name) + output['rand_type'].append(value['rand_type']) + output['exp'].append(value['exp']) + output['mean'].append(value['mean']) + output['spread'].append(value['spread']) + output['prob'].append(value['prob']) + + return output + + +# https://github.com/tgebru/transform/blob/master/src/caffe/layers/data_augmentation_layer.cpp#L34 +def _generate_coeff(param, discount_coeff=tf.constant(1.0), default_value=tf.constant(0.0)): + if not all(name in param for name in ['rand_type', 'exp', 'mean', 'spread', 'prob']): + raise RuntimeError('Expected rand_type, exp, mean, spread, prob in `param`') + + rand_type = param['rand_type'] + exp = float(param['exp']) + mean = tf.convert_to_tensor(param['mean'], dtype=tf.float32) + spread = float(param['spread']) # AKA standard deviation + prob = float(param['prob']) + + # Multiply spread by our discount_coeff so it changes over time + spread = spread * discount_coeff + + if rand_type == 'uniform': + value = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'gaussian': + value = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + value = tf.exp(value) + elif rand_type == 'bernoulli': + if prob > 0.0: + value = 
tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + value = 0.0 + elif rand_type == 'uniform_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_uniform([], mean - spread, mean + spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + elif rand_type == 'gaussian_bernoulli': + tmp1 = 0.0 + tmp2 = 0 + if prob > 0.0: + tmp2 = tf.contrib.distributions.Bernoulli(probs=prob).sample([]) + else: + tmp2 = 0 + + if tmp2 == 0: + if default_value is not None: + return default_value + else: + tmp1 = tf.cond(spread > 0.0, + lambda: tf.random_normal([], mean, spread), + lambda: mean) + if exp: + tmp1 = tf.exp(tmp1) + value = tmp1 + else: + raise ValueError('Unknown distribution type %s.' % rand_type) + return value + + +def load_batch(dataset_config, split_name, global_step): + num_threads = 32 + reader_kwargs = {'options': tf.python_io.TFRecordOptions( + tf.python_io.TFRecordCompressionType.ZLIB)} + + with tf.name_scope('load_batch'): + dataset = __get_dataset(dataset_config, split_name) + data_provider = slim.dataset_data_provider.DatasetDataProvider( + dataset, + num_readers=num_threads, + common_queue_capacity=2048, + common_queue_min=1024, + reader_kwargs=reader_kwargs) + image_a, image_b, flow = data_provider.get(['image_a', 'image_b', 'flow']) + image_a, image_b, flow = map(tf.to_float, [image_a, image_b, flow]) + + if dataset_config['PREPROCESS']['scale']: + image_a = image_a / 255.0 + image_b = image_b / 255.0 + + crop = [dataset_config['PREPROCESS']['crop_height'], + dataset_config['PREPROCESS']['crop_width']] + config_a = config_to_arrays(dataset_config['PREPROCESS']['image_a']) + config_b = config_to_arrays(dataset_config['PREPROCESS']['image_b']) + + image_as, image_bs, flows = map(lambda x: tf.expand_dims(x, 0), [image_a, image_b, flow]) + + # Perform data augmentation on GPU + with tf.device('/cpu:0'): + image_as, image_bs, transforms_from_a, transforms_from_b = \ + _preprocessing_ops.data_augmentation(image_as, + image_bs, + global_step, + crop, + config_a['name'], + config_a['rand_type'], + config_a['exp'], + config_a['mean'], + config_a['spread'], + config_a['prob'], + config_a['coeff_schedule'], + config_b['name'], + config_b['rand_type'], + config_b['exp'], + config_b['mean'], + config_b['spread'], + config_b['prob'], + config_b['coeff_schedule']) + + noise_coeff_a = None + noise_coeff_b = None + + # Generate and apply noise coeff for A if defined in A params + if 'noise' in dataset_config['PREPROCESS']['image_a']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_a']: + initial_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_a']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + + noise_coeff_a = _generate_coeff( + dataset_config['PREPROCESS']['image_a']['noise'], discount_coeff) + noise_a = tf.random_normal(shape=tf.shape(image_as), + mean=0.0, stddev=noise_coeff_a, + dtype=tf.float32) + image_as = tf.clip_by_value(image_as + noise_a, 0.0, 1.0) + + # Generate noise coeff for B 
if defined in B params + if 'noise' in dataset_config['PREPROCESS']['image_b']: + discount_coeff = tf.constant(1.0) + if 'coeff_schedule_param' in dataset_config['PREPROCESS']['image_b']: + initial_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['initial_coeff'] + final_coeff = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['final_coeff'] + half_life = dataset_config['PREPROCESS']['image_b']['coeff_schedule_param']['half_life'] + discount_coeff = initial_coeff + \ + (final_coeff - initial_coeff) * \ + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0) + noise_coeff_b = _generate_coeff( + dataset_config['PREPROCESS']['image_b']['noise'], discount_coeff) + + # Combine coeff from a with coeff from b + if noise_coeff_a is not None: + if noise_coeff_b is not None: + noise_coeff_b = noise_coeff_a * noise_coeff_b + else: + noise_coeff_b = noise_coeff_a + + # Add noise to B if needed + if noise_coeff_b is not None: + noise_b = tf.random_normal(shape=tf.shape(image_bs), + mean=0.0, stddev=noise_coeff_b, + dtype=tf.float32) + image_bs = tf.clip_by_value(image_bs + noise_b, 0.0, 1.0) + + # Perform flow augmentation using spatial parameters from data augmentation + flows = _preprocessing_ops.flow_augmentation( + flows, transforms_from_a, transforms_from_b, crop) + + return tf.train.batch([image_as, image_bs, flows], + enqueue_many=True, + batch_size=dataset_config['BATCH_SIZE'], + capacity=dataset_config['BATCH_SIZE'] * 4, + num_threads=num_threads, + allow_smaller_final_batch=False) diff --git a/Codes/flownet2/src/dataset_configs.py b/Codes/flownet2/src/dataset_configs.py new file mode 100644 index 0000000..fbda5d0 --- /dev/null +++ b/Codes/flownet2/src/dataset_configs.py @@ -0,0 +1,153 @@ +""" +Add dataset configurations here. Each dataset must have the following structure: + +NAME = { + IMAGE_HEIGHT: int, + IMAGE_WIDTH: int, + ITEMS_TO_DESCRIPTIONS: { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + SIZES: { + 'train': int, + 'validate': int, (optional) + ... + }, + BATCH_SIZE: int, + PATHS: { + 'train': '', + 'validate': '', (optional) + ... 
+ } +} +""" + +""" +note that one step = one batch of data processed, ~not~ an entire epoch +'coeff_schedule_param': { + 'half_life': 50000, after this many steps, the value will be i + (f - i)/2 + 'initial_coeff': 0.5, initial value + 'final_coeff': 1, final value +}, +""" + +FLYING_CHAIRS_DATASET_CONFIG = { + 'IMAGE_HEIGHT': 384, + 'IMAGE_WIDTH': 512, + 'ITEMS_TO_DESCRIPTIONS': { + 'image_a': 'A 3-channel image.', + 'image_b': 'A 3-channel image.', + 'flow': 'A 2-channel optical flow field', + }, + 'SIZES': { + 'train': 22232, + 'validate': 640, + 'sample': 8, + }, + 'BATCH_SIZE': 8, + 'PATHS': { + 'train': './data/tfrecords/fc_train.tfrecords', + 'validate': './data/tfrecords/fc_val.tfrecords', + 'sample': './data/tfrecords/fc_sample.tfrecords', + }, + 'PREPROCESS': { + 'scale': False, + 'crop_height': 320, + 'crop_width': 448, + 'image_a': { + 'translate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.4, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0.2, + 'spread': 0.4, + 'prob': 1.0, + }, + 'squeeze': { + 'rand_type': "uniform_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.3, + 'prob': 1.0, + }, + 'noise': { + 'rand_type': "uniform_bernoulli", + 'exp': False, + 'mean': 0.03, + 'spread': 0.03, + 'prob': 1.0, + }, + }, + # All preprocessing to image A will be applied to image B in addition to the following. + 'image_b': { + 'translate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'rotate': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'zoom': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.03, + 'prob': 1.0, + }, + 'gamma': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'brightness': { + 'rand_type': "gaussian_bernoulli", + 'exp': False, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'contrast': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'color': { + 'rand_type': "gaussian_bernoulli", + 'exp': True, + 'mean': 0, + 'spread': 0.02, + 'prob': 1.0, + }, + 'coeff_schedule_param': { + 'half_life': 50000, + 'initial_coeff': 0.5, + 'final_coeff': 1, + }, + } + }, +} diff --git a/Codes/flownet2/src/downsample.py b/Codes/flownet2/src/downsample.py new file mode 100644 index 0000000..5e6fc95 --- /dev/null +++ b/Codes/flownet2/src/downsample.py @@ -0,0 +1,8 @@ +import tensorflow as tf + +_downsample = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/downsample.so")) + + +def downsample(tensor, size): + return _downsample.downsample(tensor, size) diff --git a/Codes/flownet2/src/flow_warp.py b/Codes/flownet2/src/flow_warp.py new file mode 100644 index 0000000..fe5fd4d --- /dev/null +++ b/Codes/flownet2/src/flow_warp.py @@ -0,0 +1,15 @@ +import tensorflow as tf + +_flow_warp_ops = tf.load_op_library( + tf.resource_loader.get_path_to_datafile("./ops/build/flow_warp.so")) + + +def flow_warp(image, flow): + return _flow_warp_ops.flow_warp(image, flow) + + +@tf.RegisterGradient("FlowWarp") +def _flow_warp_grad(flow_warp_op, gradients): + return _flow_warp_ops.flow_warp_grad(flow_warp_op.inputs[0], + flow_warp_op.inputs[1], + gradients) diff --git a/Codes/flownet2/src/flowlib.py 
b/Codes/flownet2/src/flowlib.py new file mode 100644 index 0000000..36c56d4 --- /dev/null +++ b/Codes/flownet2/src/flowlib.py @@ -0,0 +1,554 @@ +#!/usr/bin/python +""" +# ============================== +# flowlib.py +# library for optical flow processing +# Author: Ruoteng Li +# Date: 6th Aug 2016 +# ============================== +""" +import png +import numpy as np +import matplotlib.colors as cl +import matplotlib.pyplot as plt +from PIL import Image +import tensorflow as tf + + +UNKNOWN_FLOW_THRESH = 1e7 +SMALLFLOW = 0.0 +LARGEFLOW = 1e8 + +""" +============= +Flow Section +============= +""" + + +def show_flow(filename): + """ + visualize optical flow map using matplotlib + :param filename: optical flow file + :return: None + """ + flow = read_flow(filename) + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + + +def visualize_flow(flow, mode='Y'): + """ + this function visualize the input flow + :param flow: input flow in array + :param mode: choose which color mode to visualize the flow (Y: Ccbcr, RGB: RGB color) + :return: None + """ + if mode == 'Y': + # Ccbcr color wheel + img = flow_to_image(flow) + plt.imshow(img) + plt.show() + elif mode == 'RGB': + (h, w) = flow.shape[0:2] + du = flow[:, :, 0] + dv = flow[:, :, 1] + valid = flow[:, :, 2] + max_flow = max(np.max(du), np.max(dv)) + img = np.zeros((h, w, 3), dtype=np.float64) + # angle layer + img[:, :, 0] = np.arctan2(dv, du) / (2 * np.pi) + # magnitude layer, normalized to 1 + img[:, :, 1] = np.sqrt(du * du + dv * dv) * 8 / max_flow + # phase layer + img[:, :, 2] = 8 - img[:, :, 1] + # clip to [0,1] + small_idx = img[:, :, 0:3] < 0 + large_idx = img[:, :, 0:3] > 1 + img[small_idx] = 0 + img[large_idx] = 1 + # convert to rgb + img = cl.hsv_to_rgb(img) + # remove invalid point + img[:, :, 0] = img[:, :, 0] * valid + img[:, :, 1] = img[:, :, 1] * valid + img[:, :, 2] = img[:, :, 2] * valid + # show + plt.imshow(img) + plt.show() + + return None + + +def read_flow(filename): + """ + read optical flow from Middlebury .flo file + :param filename: name of the flow file + :return: optical flow data in matrix + """ + f = open(filename, 'rb') + magic = np.fromfile(f, np.float32, count=1) + data2d = None + + if 202021.25 != magic: + print('Magic number incorrect. 
Invalid .flo file') + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + print("Reading %d x %d flo file" % (h, w)) + data2d = np.fromfile(f, np.float32, count=2 * w * h) + # reshape data into 3D array (columns, rows, channels) + data2d = np.resize(data2d, (h[0], w[0], 2)) + f.close() + return data2d + + +def read_flow_png(flow_file): + """ + Read optical flow from KITTI .png file + :param flow_file: name of the flow file + :return: optical flow data in matrix + """ + flow_object = png.Reader(filename=flow_file) + flow_direct = flow_object.asDirect() + flow_data = list(flow_direct[2]) + (w, h) = flow_direct[3]['size'] + flow = np.zeros((h, w, 3), dtype=np.float64) + for i in range(len(flow_data)): + flow[i, :, 0] = flow_data[i][0::3] + flow[i, :, 1] = flow_data[i][1::3] + flow[i, :, 2] = flow_data[i][2::3] + + invalid_idx = (flow[:, :, 2] == 0) + flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0 + flow[invalid_idx, 0] = 0 + flow[invalid_idx, 1] = 0 + return flow + + +def write_flow(flow, filename): + """ + write optical flow in Middlebury .flo format + :param flow: optical flow map + :param filename: optical flow file path to be saved + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = flow.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + flow.tofile(f) + f.close() + + +def segment_flow(flow): + h = flow.shape[0] + w = flow.shape[1] + u = flow[:, :, 0] + v = flow[:, :, 1] + + idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW)) + idx2 = (abs(u) == SMALLFLOW) + class0 = (v == 0) & (u == 0) + u[idx2] = 0.00001 + tan_value = v / u + + class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0) + class2 = (tan_value >= 1) & (u >= 0) & (v >= 0) + class3 = (tan_value < -1) & (u <= 0) & (v >= 0) + class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0) + class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0) + class7 = (tan_value < -1) & (u >= 0) & (v <= 0) + class6 = (tan_value >= 1) & (u <= 0) & (v <= 0) + class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0) + + seg = np.zeros((h, w)) + + seg[class1] = 1 + seg[class2] = 2 + seg[class3] = 3 + seg[class4] = 4 + seg[class5] = 5 + seg[class6] = 6 + seg[class7] = 7 + seg[class8] = 8 + seg[class0] = 0 + seg[idx] = 0 + + return seg + + +def flow_error(tu, tv, u, v): + """ + Calculate average end point error + :param tu: ground-truth horizontal flow map + :param tv: ground-truth vertical flow map + :param u: estimated horizontal flow map + :param v: estimated vertical flow map + :return: End point error of the estimated flow + """ + smallflow = 0.0 + ''' + stu = tu[bord+1:end-bord,bord+1:end-bord] + stv = tv[bord+1:end-bord,bord+1:end-bord] + su = u[bord+1:end-bord,bord+1:end-bord] + sv = v[bord+1:end-bord,bord+1:end-bord] + ''' + stu = tu[:] + stv = tv[:] + su = u[:] + sv = v[:] + + idxUnknow = (abs(stu) > UNKNOWN_FLOW_THRESH) | (abs(stv) > UNKNOWN_FLOW_THRESH) + stu[idxUnknow] = 0 + stv[idxUnknow] = 0 + su[idxUnknow] = 0 + sv[idxUnknow] = 0 + + ind2 = [(np.absolute(stu) > smallflow) | (np.absolute(stv) > smallflow)] + index_su = su[ind2] + index_sv = sv[ind2] + an = 1.0 / np.sqrt(index_su ** 2 + index_sv ** 2 + 1) + un = index_su * an + vn = index_sv * an + + index_stu = stu[ind2] + index_stv = stv[ind2] + tn = 1.0 / np.sqrt(index_stu ** 2 + index_stv ** 2 + 1) + tun = index_stu * tn + tvn = index_stv * tn + + ''' + 
angle = un * tun + vn * tvn + (an * tn) + index = [angle == 1.0] + angle[index] = 0.999 + ang = np.arccos(angle) + mang = np.mean(ang) + mang = mang * 180 / np.pi + ''' + + epe = np.sqrt((stu - su) ** 2 + (stv - sv) ** 2) + epe = epe[ind2] + mepe = np.mean(epe) + return mepe + + +def flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, 0] + v = flow[:, :, 1] + + maxu = -999. + maxv = -999. + minu = 999. + minv = 999. + + idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) + u[idxUnknow] = 0 + v[idxUnknow] = 0 + + maxu = max(maxu, np.max(u)) + minu = min(minu, np.min(u)) + + maxv = max(maxv, np.max(v)) + minv = min(minv, np.min(v)) + + rad = np.sqrt(u ** 2 + v ** 2) + maxrad = max(-1, np.max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. %.3f" % (maxrad, minu,maxu, minv, maxv)) + + u = u/(maxrad + np.finfo(float).eps) + v = v/(maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + img[idx] = 0 + + return np.uint8(img) + + +def tf_flow_to_image(flow): + """ + Convert flow into middlebury color code image + :param flow: optical flow map + :return: optical flow image in middlebury color + """ + u = flow[:, :, :, 0] + v = flow[:, :, :, 1] + + maxu = tf.constant(-999.) + maxv = tf.constant(-999.) + minu = tf.constant(999.) + minv = tf.constant(999.) + + zeros = tf.zeros_like(u, dtype=tf.float32) + u = tf.where(tf.greater(u, UNKNOWN_FLOW_THRESH), zeros, u) + v = tf.where(tf.greater(v, UNKNOWN_FLOW_THRESH), zeros, v) + + rad = tf.sqrt(u ** 2 + v ** 2) + maxrad = tf.reduce_max(-1, tf.reduce_max(rad)) + + # print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. 
%.3f" % (maxrad, minu, maxu, minv, maxv)) + + u = u / (maxrad + np.finfo(float).eps) + v = v / (maxrad + np.finfo(float).eps) + + img = compute_color(u, v) + + # idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) + # img[idx] = 0 + + return np.uint8(img) + + +def evaluate_flow_file(gt, pred): + """ + evaluate the estimated optical flow end point error according to ground truth provided + :param gt: ground truth file path + :param pred: estimated optical flow file path + :return: end point error, float32 + """ + # Read flow files and calculate the errors + gt_flow = read_flow(gt) # ground truth flow + eva_flow = read_flow(pred) # predicted flow + # Calculate errors + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], eva_flow[:, :, 0], eva_flow[:, :, 1]) + return average_pe + + +def evaluate_flow(gt_flow, pred_flow): + """ + gt: ground-truth flow + pred: estimated flow + """ + average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], pred_flow[:, :, 0], pred_flow[:, :, 1]) + return average_pe + + +""" +============== +Disparity Section +============== +""" + + +def read_disp_png(file_name): + """ + Read optical flow from KITTI .png file + :param file_name: name of the flow file + :return: optical flow data in matrix + """ + image_object = png.Reader(filename=file_name) + image_direct = image_object.asDirect() + image_data = list(image_direct[2]) + (w, h) = image_direct[3]['size'] + channel = len(image_data[0]) / w + flow = np.zeros((h, w, channel), dtype=np.uint16) + for i in range(len(image_data)): + for j in range(channel): + flow[i, :, j] = image_data[i][j::channel] + return flow[:, :, 0] / 256 + + +def disp_to_flowfile(disp, filename): + """ + Read KITTI disparity file in png format + :param disp: disparity matrix + :param filename: the flow file name to save + :return: None + """ + f = open(filename, 'wb') + magic = np.array([202021.25], dtype=np.float32) + (height, width) = disp.shape[0:2] + w = np.array([width], dtype=np.int32) + h = np.array([height], dtype=np.int32) + empty_map = np.zeros((height, width), dtype=np.float32) + data = np.dstack((disp, empty_map)) + magic.tofile(f) + w.tofile(f) + h.tofile(f) + data.tofile(f) + f.close() + + +""" +============== +Image Section +============== +""" + + +def read_image(filename): + """ + Read normal image of any format + :param filename: name of the image file + :return: image data in matrix uint8 type + """ + img = Image.open(filename) + im = np.array(img) + return im + + +def warp_image(im, flow): + """ + Use optical flow to warp image to the next + :param im: image to warp + :param flow: optical flow + :return: warped image + """ + from scipy import interpolate + image_height = im.shape[0] + image_width = im.shape[1] + flow_height = flow.shape[0] + flow_width = flow.shape[1] + n = image_height * image_width + (iy, ix) = np.mgrid[0:image_height, 0:image_width] + (fy, fx) = np.mgrid[0:flow_height, 0:flow_width] + fx += flow[:,:,0] + fy += flow[:,:,1] + mask = np.logical_or(fx <0 , fx > flow_width) + mask = np.logical_or(mask, fy < 0) + mask = np.logical_or(mask, fy > flow_height) + fx = np.minimum(np.maximum(fx, 0), flow_width) + fy = np.minimum(np.maximum(fy, 0), flow_height) + points = np.concatenate((ix.reshape(n,1), iy.reshape(n,1)), axis=1) + xi = np.concatenate((fx.reshape(n, 1), fy.reshape(n,1)), axis=1) + warp = np.zeros((image_height, image_width, im.shape[2])) + for i in range(im.shape[2]): + channel = im[:, :, i] + plt.imshow(channel, cmap='gray') + values = channel.reshape(n, 1) + new_channel = 
interpolate.griddata(points, values, xi, method='cubic') + new_channel = np.reshape(new_channel, [flow_height, flow_width]) + new_channel[mask] = 1 + warp[:, :, i] = new_channel.astype(np.uint8) + + return warp.astype(np.uint8) + + +""" +============== +Others +============== +""" + + +def scale_image(image, new_range): + """ + Linearly scale the image into desired range + :param image: input image + :param new_range: the new range to be aligned + :return: image normalized in new range + """ + min_val = np.min(image).astype(np.float32) + max_val = np.max(image).astype(np.float32) + min_val_new = np.array(min(new_range), dtype=np.float32) + max_val_new = np.array(max(new_range), dtype=np.float32) + scaled_image = (image - min_val) / (max_val - min_val) * (max_val_new - min_val_new) + min_val_new + return scaled_image.astype(np.uint8) + + +def compute_color(u, v): + """ + compute optical flow color map + :param u: optical flow horizontal map + :param v: optical flow vertical map + :return: optical flow in color code + """ + [h, w] = u.shape + img = np.zeros([h, w, 3]) + nanIdx = np.isnan(u) | np.isnan(v) + u[nanIdx] = 0 + v[nanIdx] = 0 + + colorwheel = make_color_wheel() + # ncols = np.size(colorwheel, 0) + ncols = colorwheel.shape[0] + + rad = np.sqrt(u**2+v**2) + + a = np.arctan2(-v, -u) / np.pi + + fk = (a+1) / 2 * (ncols - 1) + 1 + + k0 = np.floor(fk).astype(int) + + k1 = k0 + 1 + k1[k1 == ncols+1] = 1 + f = fk - k0 + + for i in range(0, np.size(colorwheel, 1)): + tmp = colorwheel[:, i] + col0 = tmp[k0-1] / 255 + col1 = tmp[k1-1] / 255 + col = (1-f) * col0 + f * col1 + + idx = rad <= 1 + col[idx] = 1-rad[idx]*(1-col[idx]) + notidx = np.logical_not(idx) + + col[notidx] *= 0.75 + img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) + + return img + + +def make_color_wheel(): + """ + Generate color wheel according Middlebury color code + :return: Color wheel + """ + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + + colorwheel = np.zeros([ncols, 3]) + + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) + col += RY + + # YG + colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) + colorwheel[col:col+YG, 1] = 255 + col += YG + + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) + col += GC + + # CB + colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) + colorwheel[col:col+CB, 2] = 255 + col += CB + + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) + col += + BM + + # MR + colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) + colorwheel[col:col+MR, 0] = 255 + + return colorwheel diff --git a/Codes/flownet2/src/flownet2/__init__.py b/Codes/flownet2/src/flownet2/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet2/__init__.py diff --git a/Codes/flownet2/src/flownet2/flownet2.py b/Codes/flownet2/src/flownet2/flownet2.py new file mode 100644 index 0000000..d44ed10 --- /dev/null +++ b/Codes/flownet2/src/flownet2/flownet2.py @@ -0,0 +1,118 @@ +from ..net import Net, Mode +from ..flownet_css.flownet_css import FlowNetCSS +from ..flownet_sd.flownet_sd import FlowNetSD +from ..flow_warp import flow_warp +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import 
downsample +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNet2(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_css = FlowNetCSS(mode, debug) + self.net_sd = FlowNetSD(mode, debug) + super(FlowNet2, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNet2'): + # Forward pass through FlowNetCSS and FlowNetSD with weights frozen + net_css_predictions = self.net_css.model(inputs, training_schedule, trainable=True) + net_sd_predictions = self.net_sd.model(inputs, training_schedule, trainable=True) + + def ChannelNorm(tensor): + sq = tf.square(tensor) + r_sum = tf.reduce_sum(sq, keep_dims=True, axis=3) + return tf.sqrt(r_sum) + + sd_flow_norm = ChannelNorm(net_sd_predictions['flow']) + css_flow_norm = ChannelNorm(net_css_predictions['flow']) + + flow_warp_sd = flow_warp(inputs['input_b'], net_sd_predictions['flow']) + img_diff_sd = inputs['input_a'] - flow_warp_sd + img_diff_sd_norm = ChannelNorm(img_diff_sd) + + flow_warp_css = flow_warp(inputs['input_b'], net_css_predictions['flow']) + img_diff_css = inputs['input_a'] - flow_warp_css + img_diff_css_norm = ChannelNorm(img_diff_css) + + input_to_fusion = tf.concat([inputs['input_a'], + net_sd_predictions['flow'], + net_css_predictions['flow'], + sd_flow_norm, + css_flow_norm, + img_diff_sd_norm, + img_diff_css_norm], axis=3) + + # Fusion Network + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + fuse_conv0 = slim.conv2d(pad(input_to_fusion), 64, 3, scope='fuse_conv0') + fuse_conv1 = slim.conv2d(pad(fuse_conv0), 64, 3, stride=2, scope='fuse_conv1') + fuse_conv1_1 = slim.conv2d(pad(fuse_conv1), 128, 3, scope='fuse_conv1_1') + fuse_conv2 = slim.conv2d(pad(fuse_conv1_1), 128, 3, + stride=2, scope='fuse_conv2') + fuse_conv2_1 = slim.conv2d(pad(fuse_conv2), 128, 3, scope='fuse_conv2_1') + + predict_flow2 = slim.conv2d(pad(fuse_conv2_1), 2, 3, + scope='predict_flow2', + activation_fn=None) + fuse_deconv1 = antipad(slim.conv2d_transpose(fuse_conv2_1, 32, 4, + stride=2, + scope='fuse_deconv1')) + fuse_upsample_flow2to1 = antipad(slim.conv2d_transpose(predict_flow2, 2, 4, + stride=2, + scope='fuse_upsample_flow2to1', + activation_fn=None)) + concat1 = tf.concat([fuse_conv1_1, fuse_deconv1, + fuse_upsample_flow2to1], axis=3) + fuse_interconv1 = slim.conv2d(pad(concat1), 32, 3, + activation_fn=None, scope='fuse_interconv1') + + predict_flow1 = slim.conv2d(pad(fuse_interconv1), 2, 3, + scope='predict_flow1', + activation_fn=None) + fuse_deconv0 = antipad(slim.conv2d_transpose(concat1, 16, 4, + stride=2, + scope='fuse_deconv0')) + fuse_upsample_flow1to0 = antipad(slim.conv2d_transpose(predict_flow1, 2, 4, + stride=2, + scope='fuse_upsample_flow1to0', + activation_fn=None)) + concat0 = tf.concat([fuse_conv0, fuse_deconv0, fuse_upsample_flow1to0], axis=3) + fuse_interconv0 = slim.conv2d(pad(concat0), 16, 3, + activation_fn=None, scope='fuse_interconv0') + + predict_flow0 = slim.conv2d(pad(fuse_interconv0), 2, + 3, activation_fn=None, 
scope='predict_flow0') + + flow = tf.image.resize_bilinear( + predict_flow0, tf.stack([height, width]), align_corners=True) + print(predict_flow0) + print(flow) + return { + 'predict_flow0': predict_flow0, + 'flow': flow, + } + + def loss(self, flow, predictions): + # L2 loss between predict_flow0, true flow (weighted w/ 0.005) + predict_flow0 = predictions['predict_flow0'] + size = [predict_flow0.shape[1], predict_flow0.shape[2]] + downsampled_flow0 = downsample(flow, size) + loss = average_endpoint_error(downsampled_flow0, predict_flow0) + tf.losses.add_loss(loss) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet2/test.py b/Codes/flownet2/src/flownet2/test.py new file mode 100644 index 0000000..3177614 --- /dev/null +++ b/Codes/flownet2/src/flownet2/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet2 import FlowNet2 + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNet2(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNet2/flownet-2.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet2/train.py b/Codes/flownet2/src/flownet2/train.py new file mode 100644 index 0000000..40c028d --- /dev/null +++ b/Codes/flownet2/src/flownet2/train.py @@ -0,0 +1,24 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet2 import FlowNet2 + +# Create a new network +net = FlowNet2() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_2', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CSS and SD parts of network + checkpoints={ + './checkpoints/FlowNetCSS-ft-sd/flownet-CSS-ft-sd.ckpt-0': ('FlowNet2/FlowNetCSS', 'FlowNet2'), + './checkpoints/FlowNetSD/flownet-SD.ckpt-0': ('FlowNet2/FlowNetSD', 'FlowNet2') + } +) diff --git a/Codes/flownet2/src/flownet_c/__init__.py b/Codes/flownet2/src/flownet_c/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/__init__.py diff --git a/Codes/flownet2/src/flownet_c/flownet_c.py b/Codes/flownet2/src/flownet_c/flownet_c.py new file mode 100644 index 0000000..d333ee2 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/flownet_c.py @@ -0,0 +1,167 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..correlation import correlation +from ..downsample import downsample +import math +import tensorflow as tf +slim = 
tf.contrib.slim + + +class FlowNetC(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetC, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetC'): + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_a_1 = slim.conv2d(pad(inputs['input_a'], 3), 64, 7, scope='conv1') + conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5, scope='conv2') + conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5, scope='conv3') + + conv_b_1 = slim.conv2d(pad(inputs['input_b'], 3), + 64, 7, scope='conv1', reuse=True) + conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5, scope='conv2', reuse=True) + conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5, scope='conv3', reuse=True) + + # Compute cross correlation with leaky relu activation + cc = correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20) + cc_relu = LeakyReLU(cc) + + # Combine cross correlation results with convolution of feature map A + netA_conv = slim.conv2d(conv_a_3, 32, 1, scope='conv_redir') + # Concatenate along the channels axis + net = tf.concat([netA_conv, cc_relu], axis=3) + + conv3_1 = slim.conv2d(pad(net), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + 
activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_a_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_c/test.py b/Codes/flownet2/src/flownet_c/test.py new file mode 100644 index 0000000..692f22d --- /dev/null +++ b/Codes/flownet2/src/flownet_c/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_c import FlowNetC + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetC(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetC/flownet-C.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + 
required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_c/train.py b/Codes/flownet2/src/flownet_c/train.py new file mode 100644 index 0000000..9296ac7 --- /dev/null +++ b/Codes/flownet2/src/flownet_c/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_c import FlowNetC + +# Create a new network +net = FlowNetC() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_c', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/flownet_cs/__init__.py b/Codes/flownet2/src/flownet_cs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/__init__.py diff --git a/Codes/flownet2/src/flownet_cs/flownet_cs.py b/Codes/flownet2/src/flownet_cs/flownet_cs.py new file mode 100644 index 0000000..aeaea47 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/flownet_cs.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_c.flownet_c import FlowNetC +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_c = FlowNetC(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCS'): + # Forward pass through FlowNetC with weights frozen + net_c_predictions = self.net_c.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_c_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_c_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git a/Codes/flownet2/src/flownet_cs/test.py b/Codes/flownet2/src/flownet_cs/test.py new file mode 100644 index 0000000..ae00ff4 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_cs import FlowNetCS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCS/flownet-CS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + 
out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_cs/train.py b/Codes/flownet2/src/flownet_cs/train.py new file mode 100644 index 0000000..9376132 --- /dev/null +++ b/Codes/flownet2/src/flownet_cs/train.py @@ -0,0 +1,21 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_cs import FlowNetCS + +# Create a new network +net = FlowNetCS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_cs', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for C part of network + checkpoints={'./checkpoints/FlowNetC/flownet-C.ckpt-0': ('FlowNetCS/FlowNetC', 'FlowNetCS')} +) diff --git a/Codes/flownet2/src/flownet_css/__init__.py b/Codes/flownet2/src/flownet_css/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/__init__.py diff --git a/Codes/flownet2/src/flownet_css/flownet_css.py b/Codes/flownet2/src/flownet_css/flownet_css.py new file mode 100644 index 0000000..93d9db2 --- /dev/null +++ b/Codes/flownet2/src/flownet_css/flownet_css.py @@ -0,0 +1,41 @@ +from ..net import Net, Mode +from ..flownet_cs.flownet_cs import FlowNetCS +from ..flownet_s.flownet_s import FlowNetS +from ..flow_warp import flow_warp +import tensorflow as tf + + +class FlowNetCSS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.net_cs = FlowNetCS(mode, debug) + self.net_s = FlowNetS(mode, debug) + super(FlowNetCSS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + with tf.variable_scope('FlowNetCSS'): + # Forward pass through FlowNetCS with weights frozen + net_cs_predictions = self.net_cs.model(inputs, training_schedule, trainable=True) + + # Perform flow warping (to move image B closer to image A based on flow prediction) + warped = flow_warp(inputs['input_b'], net_cs_predictions['flow']) + + # Compute brightness error: sqrt(sum (input_a - warped)^2 over channels) + brightness_error = inputs['input_a'] - warped + brightness_error = tf.square(brightness_error) + brightness_error = tf.reduce_sum(brightness_error, keep_dims=True, axis=3) + brightness_error = tf.sqrt(brightness_error) + + # Gather all inputs to FlowNetS + inputs_to_s = { + 'input_a': inputs['input_a'], + 'input_b': inputs['input_b'], + 'warped': warped, + 'flow': net_cs_predictions['flow'] * 0.05, + 'brightness_error': brightness_error, + } + + return self.net_s.model(inputs_to_s, training_schedule, trainable=trainable) + + def loss(self, flow, predictions): + return self.net_s.loss(flow, predictions) diff --git 
a/Codes/flownet2/src/flownet_css/test.py b/Codes/flownet2/src/flownet_css/test.py new file mode 100644 index 0000000..9d1249e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_css import FlowNetCSS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetCSS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetCSS/flownet-CSS.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_css/train.py b/Codes/flownet2/src/flownet_css/train.py new file mode 100644 index 0000000..2964f3e --- /dev/null +++ b/Codes/flownet2/src/flownet_css/train.py @@ -0,0 +1,22 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_css import FlowNetCSS + +# Create a new network +net = FlowNetCSS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_css', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow, + # Load trained weights for CS part of network + checkpoints={ + './checkpoints/FlowNetCS/flownet-CS.ckpt-0': ('FlowNetCSS/FlowNetCS', 'FlowNetCSS')} +) diff --git a/Codes/flownet2/src/flownet_s/__init__.py b/Codes/flownet2/src/flownet_s/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/__init__.py diff --git a/Codes/flownet2/src/flownet_s/flownet_s.py b/Codes/flownet2/src/flownet_s/flownet_s.py new file mode 100644 index 0000000..f6704b1 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/flownet_s.py @@ -0,0 +1,161 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetS(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetS, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True): + _, height, width, _ = inputs['input_a'].shape.as_list() + stacked = False + with tf.variable_scope('FlowNetS'): + if 'warped' in inputs and 'flow' in inputs and 'brightness_error' in inputs: + stacked = True + concat_inputs = tf.concat([inputs['input_a'], + inputs['input_b'], + inputs['warped'], + inputs['flow'], + inputs['brightness_error']], axis=3) + else: + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight 
initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + with slim.arg_scope([slim.conv2d], stride=2): + conv_1 = slim.conv2d(pad(concat_inputs, 3), 64, 7, scope='conv1') + conv_2 = slim.conv2d(pad(conv_1, 2), 128, 5, scope='conv2') + conv_3 = slim.conv2d(pad(conv_2, 2), 256, 5, scope='conv3') + + conv3_1 = slim.conv2d(pad(conv_3), 256, 3, scope='conv3_1') + with slim.arg_scope([slim.conv2d], num_outputs=512, kernel_size=3): + conv4 = slim.conv2d(pad(conv3_1), stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + + predict_flow5 = slim.conv2d(pad(concat5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + + predict_flow4 = slim.conv2d(pad(concat4), 2, 3, + scope='predict_flow4', + activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + + predict_flow3 = slim.conv2d(pad(concat3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv_2, deconv2, upsample_flow3to2], axis=3) + + predict_flow2 = slim.conv2d(pad(concat2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 20.0 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + def loss(self, flow, predictions): + flow = flow * 0.05 + + losses = [] + INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + + # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + 
predict_flow6 = predictions['predict_flow6'] + size = [predict_flow6.shape[1], predict_flow6.shape[2]] + downsampled_flow6 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + + # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + predict_flow5 = predictions['predict_flow5'] + size = [predict_flow5.shape[1], predict_flow5.shape[2]] + downsampled_flow5 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + + # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + predict_flow4 = predictions['predict_flow4'] + size = [predict_flow4.shape[1], predict_flow4.shape[2]] + downsampled_flow4 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + + # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + predict_flow3 = predictions['predict_flow3'] + size = [predict_flow3.shape[1], predict_flow3.shape[2]] + downsampled_flow3 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + + # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + predict_flow2 = predictions['predict_flow2'] + size = [predict_flow2.shape[1], predict_flow2.shape[2]] + downsampled_flow2 = downsample(flow, size) + losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + + loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + + # Return the 'total' loss: loss fns + regularization terms defined in the model + return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_s/test.py b/Codes/flownet2/src/flownet_s/test.py new file mode 100644 index 0000000..ae1b2f3 --- /dev/null +++ b/Codes/flownet2/src/flownet_s/test.py @@ -0,0 +1,51 @@ +import argparse +import os +from ..net import Mode +from .flownet_s import FlowNetS + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetS(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetS/flownet-S.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_s/train.py b/Codes/flownet2/src/flownet_s/train.py new file mode 100644 index 0000000..13a792a --- /dev/null +++ b/Codes/flownet2/src/flownet_s/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_s import FlowNetS + +# Create a new network +net = FlowNetS() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_s_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff 
--git a/Codes/flownet2/src/flownet_sd/__init__.py b/Codes/flownet2/src/flownet_sd/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/__init__.py diff --git a/Codes/flownet2/src/flownet_sd/flownet_sd.py b/Codes/flownet2/src/flownet_sd/flownet_sd.py new file mode 100644 index 0000000..2f5c9e4 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/flownet_sd.py @@ -0,0 +1,160 @@ +from ..net import Net, Mode +from ..utils import LeakyReLU, average_endpoint_error, pad, antipad +# from ..downsample import downsample +import math +import tensorflow as tf +slim = tf.contrib.slim + + +class FlowNetSD(Net): + + def __init__(self, mode=Mode.TRAIN, debug=False): + super(FlowNetSD, self).__init__(mode=mode, debug=debug) + + def model(self, inputs, training_schedule, trainable=True, reuse=None): + _, height, width, _ = inputs['input_a'].shape.as_list() + with tf.variable_scope('FlowNetSD', reuse=reuse): + concat_inputs = tf.concat([inputs['input_a'], inputs['input_b']], axis=3) + with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], + # Only backprop this network if trainable + trainable=trainable, + # He (aka MSRA) weight initialization + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=LeakyReLU, + # We will do our own padding to match the original Caffe code + padding='VALID'): + + weights_regularizer = slim.l2_regularizer(training_schedule['weight_decay']) + with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): + conv0 = slim.conv2d(pad(concat_inputs), 64, 3, scope='conv0') + conv1 = slim.conv2d(pad(conv0), 64, 3, stride=2, scope='conv1') + conv1_1 = slim.conv2d(pad(conv1), 128, 3, scope='conv1_1') + conv2 = slim.conv2d(pad(conv1_1), 128, 3, stride=2, scope='conv2') + conv2_1 = slim.conv2d(pad(conv2), 128, 3, scope='conv2_1') + conv3 = slim.conv2d(pad(conv2_1), 256, 3, stride=2, scope='conv3') + conv3_1 = slim.conv2d(pad(conv3), 256, 3, scope='conv3_1') + conv4 = slim.conv2d(pad(conv3_1), 512, 3, stride=2, scope='conv4') + conv4_1 = slim.conv2d(pad(conv4), 512, 3, scope='conv4_1') + conv5 = slim.conv2d(pad(conv4_1), 512, 3, stride=2, scope='conv5') + conv5_1 = slim.conv2d(pad(conv5), 512, 3, scope='conv5_1') + conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2, scope='conv6') + conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope='conv6_1') + + """ START: Refinement Network """ + with slim.arg_scope([slim.conv2d_transpose], biases_initializer=None): + predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3, + scope='predict_flow6', + activation_fn=None) + deconv5 = antipad(slim.conv2d_transpose(conv6_1, 512, 4, + stride=2, + scope='deconv5')) + upsample_flow6to5 = antipad(slim.conv2d_transpose(predict_flow6, 2, 4, + stride=2, + scope='upsample_flow6to5', + activation_fn=None)) + concat5 = tf.concat([conv5_1, deconv5, upsample_flow6to5], axis=3) + interconv5 = slim.conv2d(pad(concat5), 512, 3, + activation_fn=None, scope='interconv5') + + predict_flow5 = slim.conv2d(pad(interconv5), 2, 3, + scope='predict_flow5', + activation_fn=None) + deconv4 = antipad(slim.conv2d_transpose(concat5, 256, 4, + stride=2, + scope='deconv4')) + upsample_flow5to4 = antipad(slim.conv2d_transpose(predict_flow5, 2, 4, + stride=2, + scope='upsample_flow5to4', + activation_fn=None)) + concat4 = tf.concat([conv4_1, deconv4, upsample_flow5to4], axis=3) + interconv4 = slim.conv2d(pad(concat4), 256, 3, + activation_fn=None, scope='interconv4') + + predict_flow4 = slim.conv2d(pad(interconv4), 2, 3, + scope='predict_flow4', + 
activation_fn=None) + deconv3 = antipad(slim.conv2d_transpose(concat4, 128, 4, + stride=2, + scope='deconv3')) + upsample_flow4to3 = antipad(slim.conv2d_transpose(predict_flow4, 2, 4, + stride=2, + scope='upsample_flow4to3', + activation_fn=None)) + concat3 = tf.concat([conv3_1, deconv3, upsample_flow4to3], axis=3) + interconv3 = slim.conv2d(pad(concat3), 128, 3, + activation_fn=None, scope='interconv3') + + predict_flow3 = slim.conv2d(pad(interconv3), 2, 3, + scope='predict_flow3', + activation_fn=None) + deconv2 = antipad(slim.conv2d_transpose(concat3, 64, 4, + stride=2, + scope='deconv2')) + upsample_flow3to2 = antipad(slim.conv2d_transpose(predict_flow3, 2, 4, + stride=2, + scope='upsample_flow3to2', + activation_fn=None)) + concat2 = tf.concat([conv2, deconv2, upsample_flow3to2], axis=3) + interconv2 = slim.conv2d(pad(concat2), 64, 3, + activation_fn=None, scope='interconv2') + + predict_flow2 = slim.conv2d(pad(interconv2), 2, 3, + scope='predict_flow2', + activation_fn=None) + """ END: Refinement Network """ + + flow = predict_flow2 * 0.05 + # TODO: Look at Accum (train) or Resample (deploy) to see if we need to do something different + flow = tf.image.resize_bilinear(flow, + tf.stack([height, width]), + align_corners=True) + + return { + 'predict_flow6': predict_flow6, + 'predict_flow5': predict_flow5, + 'predict_flow4': predict_flow4, + 'predict_flow3': predict_flow3, + 'predict_flow2': predict_flow2, + 'flow': flow, + } + + # def loss(self, flow, predictions): + # flow = flow * 20.0 + # + # losses = [] + # INPUT_HEIGHT, INPUT_WIDTH = float(flow.shape[1].value), float(flow.shape[2].value) + # + # # L2 loss between predict_flow6, blob23 (weighted w/ 0.32) + # predict_flow6 = predictions['predict_flow6'] + # size = [predict_flow6.shape[1], predict_flow6.shape[2]] + # downsampled_flow6 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow6, predict_flow6)) + # + # # L2 loss between predict_flow5, blob28 (weighted w/ 0.08) + # predict_flow5 = predictions['predict_flow5'] + # size = [predict_flow5.shape[1], predict_flow5.shape[2]] + # downsampled_flow5 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow5, predict_flow5)) + # + # # L2 loss between predict_flow4, blob33 (weighted w/ 0.02) + # predict_flow4 = predictions['predict_flow4'] + # size = [predict_flow4.shape[1], predict_flow4.shape[2]] + # downsampled_flow4 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow4, predict_flow4)) + # + # # L2 loss between predict_flow3, blob38 (weighted w/ 0.01) + # predict_flow3 = predictions['predict_flow3'] + # size = [predict_flow3.shape[1], predict_flow3.shape[2]] + # downsampled_flow3 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow3, predict_flow3)) + # + # # L2 loss between predict_flow2, blob43 (weighted w/ 0.005) + # predict_flow2 = predictions['predict_flow2'] + # size = [predict_flow2.shape[1], predict_flow2.shape[2]] + # downsampled_flow2 = downsample(flow, size) + # losses.append(average_endpoint_error(downsampled_flow2, predict_flow2)) + # + # loss = tf.losses.compute_weighted_loss(losses, [0.32, 0.08, 0.02, 0.01, 0.005]) + # + # # Return the 'total' loss: loss fns + regularization terms defined in the model + # return tf.losses.get_total_loss() diff --git a/Codes/flownet2/src/flownet_sd/test.py b/Codes/flownet2/src/flownet_sd/test.py new file mode 100644 index 0000000..b2ac285 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/test.py @@ -0,0 +1,51 @@ +import 
argparse +import os +from ..net import Mode +from .flownet_sd import FlowNetSD + +FLAGS = None + + +def main(): + # Create a new network + net = FlowNetSD(mode=Mode.TEST) + + # Train on the data + net.test( + checkpoint='./checkpoints/FlowNetSD/flownet-SD.ckpt-0', + input_a_path=FLAGS.input_a, + input_b_path=FLAGS.input_b, + out_path=FLAGS.out, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_a', + type=str, + required=True, + help='Path to first image' + ) + parser.add_argument( + '--input_b', + type=str, + required=True, + help='Path to second image' + ) + parser.add_argument( + '--out', + type=str, + required=True, + help='Path to output flow result' + ) + FLAGS = parser.parse_args() + + # Verify arguments are valid + if not os.path.exists(FLAGS.input_a): + raise ValueError('image_a path must exist') + if not os.path.exists(FLAGS.input_b): + raise ValueError('image_b path must exist') + if not os.path.isdir(FLAGS.out): + raise ValueError('out directory must exist') + main() diff --git a/Codes/flownet2/src/flownet_sd/train.py b/Codes/flownet2/src/flownet_sd/train.py new file mode 100644 index 0000000..86c64e5 --- /dev/null +++ b/Codes/flownet2/src/flownet_sd/train.py @@ -0,0 +1,19 @@ +from ..dataloader import load_batch +from ..dataset_configs import FLYING_CHAIRS_DATASET_CONFIG +from ..training_schedules import LONG_SCHEDULE +from .flownet_sd import FlowNetSD + +# Create a new network +net = FlowNetSD() + +# Load a batch of data +input_a, input_b, flow = load_batch(FLYING_CHAIRS_DATASET_CONFIG, 'sample', net.global_step) + +# Train on the data +net.train( + log_dir='./logs/flownet_sd_sample', + training_schedule=LONG_SCHEDULE, + input_a=input_a, + input_b=input_b, + flow=flow +) diff --git a/Codes/flownet2/src/net.py b/Codes/flownet2/src/net.py new file mode 100644 index 0000000..43b2193 --- /dev/null +++ b/Codes/flownet2/src/net.py @@ -0,0 +1,177 @@ +import abc +from enum import Enum +import os +import tensorflow as tf +from .flowlib import flow_to_image, write_flow +import numpy as np +# from scipy.misc import imread, imsave, imresize +import cv2 +import uuid +from .training_schedules import LONG_SCHEDULE +slim = tf.contrib.slim + +os.environ['CUDA_DEVICES_ORDER'] = "PCI_BUS_ID" +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + + +class Mode(Enum): + TRAIN = 1 + TEST = 2 + + +class Net(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, mode=Mode.TRAIN, debug=False): + self.global_step = slim.get_or_create_global_step() + self.mode = mode + self.debug = debug + + @abc.abstractmethod + def model(self, inputs, training_schedule, trainable=True): + """ + Defines the model and returns a tuple of Tensors needed for calculating the loss. + """ + return + + @abc.abstractmethod + def loss(self, **kwargs): + """ + Accepts prediction Tensors from the output of `model`. + Returns a single Tensor representing the total loss of the model. 
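+ In the concrete networks this is the weighted sum of the multi-scale
+ endpoint-error terms plus any regularization losses collected through tf.losses.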
+ """ + return + """ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_6.png \ + --input_b /home/liuwen/ssd/videogan/Save_2017_05_31/Images/ped1_adv/Evaluate/model.ckpt-100000/01/gen_7.png \ + --out ./ + python -m src.flownet_sd.test --input_a 006.png --input_b 007.png --out ./ + python -m src.flownet_sd.test --input_a /home/liuwen/ssd/videogan/ped1/frames/testing/01/006.jpg \ + --input_b /home/liuwen/ssd/videogan/ped1/frames/testing/01/007.jpg \ + --out ./ + """ + def test(self, checkpoint, input_a_path, input_b_path, out_path, save_image=True, save_flo=False): + input_a = cv2.imread(input_a_path) + input_b = cv2.imread(input_b_path) + + input_a = cv2.resize(input_a, (512, 384)) + input_b = cv2.resize(input_b, (512, 384)) + print(input_a.shape, input_b.shape) + + # Convert from RGB -> BGR + # input_a = input_a[..., [2, 1, 0]] + # input_b = input_b[..., [2, 1, 0]] + + # Scale from [0, 255] -> [0.0, 1.0] if needed + if input_a.max() > 1.0: + input_a = input_a / 255.0 + if input_b.max() > 1.0: + input_b = input_b / 255.0 + + # TODO: This is a hack, we should get rid of this + training_schedule = LONG_SCHEDULE + + inputs = { + 'input_a': tf.expand_dims(tf.constant(input_a, dtype=tf.float32), 0), + 'input_b': tf.expand_dims(tf.constant(input_b, dtype=tf.float32), 0), + } + predictions = self.model(inputs, training_schedule) + pred_flow = predictions['flow'] + + saver = tf.train.Saver() + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + saver.restore(sess, checkpoint) + pred_flow = sess.run(pred_flow)[0, :, :, :] + + np.save('temporal_ped1', pred_flow) + + unique_name = 'flow-' + str(uuid.uuid4()) + if save_image: + flow_img = flow_to_image(pred_flow) + full_out_path = os.path.join(out_path, unique_name + '.png') + cv2.imwrite(full_out_path, flow_img) + + if save_flo: + full_out_path = os.path.join(out_path, unique_name + '.flo') + write_flow(pred_flow, full_out_path) + + def train(self, log_dir, training_schedule, input_a, input_b, flow, checkpoints=None): + tf.summary.image("image_a", input_a, max_outputs=2) + tf.summary.image("image_b", input_b, max_outputs=2) + + self.learning_rate = tf.train.piecewise_constant( + self.global_step, + [tf.cast(v, tf.int64) for v in training_schedule['step_values']], + training_schedule['learning_rates']) + + optimizer = tf.train.AdamOptimizer( + self.learning_rate, + training_schedule['momentum'], + training_schedule['momentum2']) + + inputs = { + 'input_a': input_a, + 'input_b': input_b, + } + predictions = self.model(inputs, training_schedule) + total_loss = self.loss(flow, predictions) + tf.summary.scalar('loss', total_loss) + + if checkpoints: + for (checkpoint_path, (scope, new_scope)) in checkpoints.iteritems(): + variables_to_restore = slim.get_variables(scope=scope) + renamed_variables = { + var.op.name.split(new_scope + '/')[1]: var + for var in variables_to_restore + } + restorer = tf.train.Saver(renamed_variables) + with tf.Session() as sess: + restorer.restore(sess, checkpoint_path) + + # Show the generated flow in TensorBoard + if 'flow' in predictions: + pred_flow_0 = predictions['flow'][0, :, :, :] + pred_flow_0 = tf.py_func(flow_to_image, [pred_flow_0], tf.uint8) + pred_flow_1 = predictions['flow'][1, :, :, :] + pred_flow_1 = tf.py_func(flow_to_image, [pred_flow_1], tf.uint8) + pred_flow_img = tf.stack([pred_flow_0, pred_flow_1], 0) + tf.summary.image('pred_flow', pred_flow_img, max_outputs=2) + + 
true_flow_0 = flow[0, :, :, :] + true_flow_0 = tf.py_func(flow_to_image, [true_flow_0], tf.uint8) + true_flow_1 = flow[1, :, :, :] + true_flow_1 = tf.py_func(flow_to_image, [true_flow_1], tf.uint8) + true_flow_img = tf.stack([true_flow_0, true_flow_1], 0) + tf.summary.image('true_flow', true_flow_img, max_outputs=2) + + train_op = slim.learning.create_train_op( + total_loss, + optimizer, + summarize_gradients=True) + + if self.debug: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf.train.start_queue_runners(sess) + slim.learning.train_step( + sess, + train_op, + self.global_step, + { + 'should_trace': tf.constant(1), + 'should_log': tf.constant(1), + 'logdir': log_dir + '/debug', + } + ) + else: + slim.learning.train( + train_op, + log_dir, + # session_config=tf.ConfigProto(allow_soft_placement=True), + global_step=self.global_step, + save_summaries_secs=60, + number_of_steps=training_schedule['max_iter'] + ) diff --git a/Codes/flownet2/src/ops/build/.gitkeep b/Codes/flownet2/src/ops/build/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Codes/flownet2/src/ops/build/.gitkeep diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc new file mode 100644 index 0000000..4e92f45 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cc @@ -0,0 +1,160 @@ +#define EIGEN_USE_THREADS + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class CorrelationGradKernel : public OpKernel { + public: + explicit CorrelationGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and verify their dimensions + const Tensor& gradients_t = ctx->input(0); + const Tensor& input_a_t = ctx->input(1); + const Tensor& input_b_t = ctx->input(2); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input + const int batch_size = input_a_t.dim_size(0); + const int in_height = input_a_t.dim_size(1); + const int in_width = input_a_t.dim_size(2); + const int in_channels = input_a_t.dim_size(3); + const int in_count_per_sample = in_height * in_width * in_channels; + const int padded_height = in_height + 2 * pad; + const int padded_width = in_width + 2 * pad; + + // The size of unreachable border region on each side + const int kernel_radius = (kernel_size - 1) / 2; + const int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + const int out_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + const int out_width = ceil((float)(padded_width - border_size * 
2) / (float)stride_1); + + const int neighborhood_grid_radius = max_displacement / stride_2; + const int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + const int out_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the outputs + Tensor *output_a_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_a_t.shape(), &output_a_gradient_t)); + Tensor *output_b_gradient_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, input_b_t.shape(), &output_b_gradient_t)); + + // Get the tensors + auto gradients = gradients_t.tensor<float, 4>(); + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output_a_gradient = output_a_gradient_t->tensor<float, 4>(); + auto output_b_gradient = output_b_gradient_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, in_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + in_height, + in_width, + in_channels, + padded_height, + padded_width, + padded_input_b.data()); + + CorrelationGradA(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_b.data(), + gradients.data(), + output_a_gradient.data()); + + CorrelationGradB(ctx->eigen_gpu_device(), + batch_size, + out_width, + out_height, + out_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + stride_1, + stride_2, + in_width, + in_height, + padded_width, + padded_height, + in_channels, + in_count_per_sample, + pad, + padded_input_a.data(), + gradients.data(), + output_b_gradient.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("CorrelationGrad") + .Device(DEVICE_GPU), + CorrelationGradKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc new file mode 100644 index 0000000..19e3a40 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_grad_kernel.cu.cc @@ -0,0 +1,262 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define ROUND_OFF 50000 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void 
CorrelateDataBackward0(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_a_gradient, + const float *input_b, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + + // Get X,Y ranges and clamp + // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + // We add round_off before_s1 the int division and subtract round_off after + // it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + int ymin = (y - 2 * kernel_radius - max_displacement + round_off_s1 - 1) / stride_1 + 1 - + round_off; + + // Same here: + int xmax = (x - max_displacement + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement + round_off_s1) / stride_1 - round_off; + + float sum = 0; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get input_b data: + int s2o = stride_2 * o; + int s2p = stride_2 * p; + int idx_input_b = ((item * padded_in_height + (y + s2p)) * padded_in_width + (x + s2o)) * + in_channels + k; + float input_b_tmp = input_b[idx_input_b]; // input_b[x+s2o,y+s2p,k] + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + // gradient[x,y,o,p] + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_b_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_a_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_a_gradient[input_a_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +__global__ void CorrelateDataBackward1(const int nthreads, + int item, + int out_width, + int out_height, + int out_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int stride_1, + int stride_2, + int in_width, + int in_height, + int padded_in_width, + int padded_in_height, + int in_channels, + int in_count_per_sample, + int pad_size, + float *output_b_gradient, + const float *input_a, + const float *gradient) +{ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int k = index % in_channels; // channels + int x = (index / in_channels) % in_width + pad_size; // w-pos + int y = (index / in_channels / in_width) % in_height + pad_size; // h-pos + 
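+ // As in CorrelateDataBackward0, the xmin/ymin/xmax/ymax computations below add a
+ // large positive offset (ROUND_OFF * stride_1) so that truncating integer division
+ // emulates ceil() even when the numerator is negative; e.g. with stride_1 = 1 the
+ // expression reduces to xmin = x - 2*kernel_radius - max_displacement - s2o.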
+ // round_off is a trick to enable integer division with ceil, even for + // negative numbers + // We use a large offset, for the inner part not to become negative. + const int round_off = ROUND_OFF; + const int round_off_s1 = stride_1 * round_off; + + float sum = 0; + + // Height (y) + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + // Width (x) + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + int s2o = stride_2 * o; + int s2p = stride_2 * p; + + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off + // after it, to ensure the formula matches ceil behavior: + int xmin = (x - 2 * kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride_1 + + 1 - round_off; + int ymin = (y - 2 * kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride_1 + + 1 - round_off; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + + // Same here: + int xmax = (x - max_displacement - s2o + round_off_s1) / stride_1 - round_off; + int ymax = (y - max_displacement - s2p + round_off_s1) / stride_1 - round_off; + + if ((xmax >= 0) && (ymax >= 0) && (xmin <= out_width - 1) && (ymin <= out_height - 1)) { + xmin = max(0, xmin); + xmax = min(out_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(out_height - 1, ymax); + + // Get input_a data: + int idx_input_a = ((item * padded_in_height + (y - s2p)) * padded_in_width + (x - s2o)) * + in_channels + k; + float input_a_tmp = input_a[idx_input_a]; + + // Index offset for gradient in following loops: + int op = (p + neighborhood_grid_radius) * neighborhood_grid_width + + (o + neighborhood_grid_radius); // index [o,p] + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idx_gradient = ((item * out_height + y) * out_width + x) * out_channels + op; + sum += gradient[idx_gradient] * input_a_tmp; + } + } + } + } + } + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * in_channels; + const int input_b_idx = ((y - pad_size) * in_width + (x - pad_size)) * in_channels + k; + output_b_gradient[input_b_idx + item * in_count_per_sample] = sum / (float)sumelems; + } +} + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, // h * w * ch + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward0 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_a_gradient, input_b, gradient); + } +} + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const 
int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient) { + CudaLaunchConfig config = GetCudaLaunchConfig(in_count_per_sample, device); + + for (int n = 0; n < batch_size; n++) { + CorrelateDataBackward1 << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + in_count_per_sample, + n, out_width, out_height, out_channels, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + stride_1, stride_2, + in_width, in_height, padded_in_width, padded_in_height, in_channels, in_count_per_sample, pad, + output_b_gradient, input_a, gradient); + } +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc new file mode 100644 index 0000000..f8a5193 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cc @@ -0,0 +1,137 @@ +#define EIGEN_USE_THREADS + +#include <utility> + +#include "correlation_kernel.h" +#include "pad.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<typename Device> +class CorrelationKernel : public OpKernel { + public: + explicit CorrelationKernel(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the attributes + OP_REQUIRES_OK(ctx, ctx->GetAttr("kernel_size", &kernel_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_displacement", &max_displacement)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_1", &stride_1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("stride_2", &stride_2)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pad", &pad)); + + OP_REQUIRES(ctx, kernel_size % 2 != 0, errors::InvalidArgument("kernel_size must be odd")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + OP_REQUIRES(ctx, input_a_t.dims() == 4, errors::InvalidArgument("input_a must have rank 4")); + OP_REQUIRES(ctx, input_b_t.dims() == 4, errors::InvalidArgument("input_b must have rank 4")); + + // Get dimensions of input (already padded) + int batch_size = input_a_t.dim_size(0); + int input_height = input_a_t.dim_size(1); + int input_width = input_a_t.dim_size(2); + int input_channels = input_a_t.dim_size(3); + int padded_height = input_height + 2 * pad; + int padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int output_height = ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int output_width = ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + OP_REQUIRES(ctx, output_height >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input height.")); + OP_REQUIRES(ctx, output_width >= 1, + errors::InvalidArgument("Neighborhood and kernel don't fit in input width.")); + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = 
neighborhood_grid_radius * 2 + 1; + int output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ batch_size, output_height, output_width, output_channels }), + &output_t)); + + // Get the tensors + auto input_a = input_a_t.tensor<float, 4>(); + auto input_b = input_b_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + // Create temporary tensors for padded inputs + Tensor padded_input_a_t, padded_input_b_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, padded_height, padded_width, input_channels }), + &padded_input_b_t)); + auto padded_input_a = padded_input_a_t.tensor<float, 4>(); + auto padded_input_b = padded_input_b_t.tensor<float, 4>(); + + // Pad the inputs + Pad(ctx->eigen_device<Device>(), + input_a.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_a.data()); + Pad(ctx->eigen_device<Device>(), + input_b.data(), + batch_size, + input_height, + input_width, + input_channels, + padded_height, + padded_width, + padded_input_b.data()); + + // Perform cross correlation + Correlation(ctx->eigen_device<Device>(), + padded_input_a.data(), + padded_input_b.data(), + batch_size, + output_height, + output_width, + output_channels, + output_height * output_width * output_channels, + padded_height, + padded_width, + input_channels, + max_displacement, + neighborhood_grid_radius, + neighborhood_grid_width, + kernel_radius, + kernel_size, + stride_1, + stride_2, + output.data()); + } + + private: + int kernel_size; + int max_displacement; + int stride_1; + int stride_2; + int pad; +}; + +REGISTER_KERNEL_BUILDER(Name("Correlation") + .Device(DEVICE_GPU), + CorrelationKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc new file mode 100644 index 0000000..c63e489 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_kernel.cu.cc @@ -0,0 +1,153 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 + +#include <stdio.h> +#include <iostream> + +#include "correlation_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void CorrelateData(int batch_size, + int out_width, + int out_height, + int out_channels, + int out_count, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + int in_width_padded, + int in_height_padded, + int in_channels, + const float *input_a, + const float *input_b, + float *output) { + extern __shared__ char patch_data_char[]; + + float *patch_data = (float *)patch_data_char; + + // First (upper left) position of kernel upper-left corner in current center + // position of neighborhood in image 1 + int x1 = blockIdx.x * stride_1 + max_displacement; + int y1 = 
blockIdx.y * stride_1 + max_displacement; + int item = blockIdx.z; + int ch_off = threadIdx.x; + + // Load 3D patch into shared shared memory + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idx1 = ((item * in_height_padded + y1 + j) * in_width_padded + x1 + i) * + in_channels + ch; + int idxPatchData = ji_off + ch; + patch_data[idxPatchData] = input_a[idx1]; + } + } + } + + __syncthreads(); + + __shared__ float sum[WARPS_PER_BLOCK * THREADS_PER_WARP]; + + // Compute correlation + for (int out_channel = 0; out_channel < out_channels; out_channel++) { + sum[ch_off] = 0; + + int s2o = (out_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int s2p = (out_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride_2; + int x2 = x1 + s2o; + int y2 = y1 + s2p; + + // HEIGHT + for (int j = 0; j < kernel_size; j++) { + // WIDTH + for (int i = 0; i < kernel_size; i++) { + int ji_off = ((j * kernel_size) + i) * in_channels; + + // CHANNELS + for (int ch = ch_off; ch < in_channels; ch += (WARPS_PER_BLOCK * THREADS_PER_WARP)) { + int idxPatchData = ji_off + ch; + int idx2 = ((item * in_height_padded + y2 + j) * in_width_padded + x2 + i) * + in_channels + ch; + + sum[ch_off] += patch_data[idxPatchData] * input_b[idx2]; + } + } + } + + __syncthreads(); + + if (ch_off == 0) { + float total_sum = 0; + + for (int idx = 0; idx < WARPS_PER_BLOCK * THREADS_PER_WARP; idx++) { + total_sum += sum[idx]; + } + const int sumelems = kernel_size * kernel_size * in_channels; + const int index = (blockIdx.y * out_width + blockIdx.x) * out_channels + out_channel; + + /* from Caffe: const int index = ((out_channel * out_height + + blockIdx.y) * out_width) + blockIdx.x; */ + output[index + item * out_count] = total_sum / (float)sumelems; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + // n = 0 + // caffe: ((k * H + h) * W + w) + n * K * H * W + // tf: (h * W + w) * K + k + n * H * W * K + } + } +} + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output) { + dim3 totalBlocksCorr(out_width, out_height, batch_size); + dim3 threadsPerBlock(THREADS_PER_WARP *WARPS_PER_BLOCK); + const int shared_memory_per_block = (kernel_size * kernel_size) * in_channels; + + CorrelateData << < totalBlocksCorr, threadsPerBlock, shared_memory_per_block * sizeof(float), + device.stream() >> > ( + batch_size, out_width, out_height, out_channels, out_count, + max_displacement, neighborhood_grid_radius, neighborhood_grid_width, kernel_radius, + kernel_size, stride_1, stride_2, in_width_padded, in_height_padded, in_channels, + input_a, input_b, output); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/correlation_kernel.h b/Codes/flownet2/src/ops/correlation/correlation_kernel.h new file mode 100644 index 0000000..a1dfb62 --- /dev/null +++ 
b/Codes/flownet2/src/ops/correlation/correlation_kernel.h @@ -0,0 +1,77 @@ +#ifndef FLOWNET_CORRELATION_H_ +#define FLOWNET_CORRELATION_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Correlation(const GPUDevice& device, + const float *input_a, + const float *input_b, + const int batch_size, + const int out_height, + const int out_width, + const int out_channels, + const int out_count, + const int in_height_padded, + const int in_width_padded, + const int in_channels, + int max_displacement, + int neighborhood_grid_radius, + int neighborhood_grid_width, + int kernel_radius, + int kernel_size, + int stride_1, + int stride_2, + float *output); + + +void CorrelationGradA(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_b, + const float *gradient, + float *output_a_gradient); + +void CorrelationGradB(const GPUDevice& device, + const int batch_size, + const int out_width, + const int out_height, + const int out_channels, + const int max_displacement, + const int neighborhood_grid_radius, + const int neighborhood_grid_width, + const int kernel_radius, + const int stride_1, + const int stride_2, + const int in_width, + const int in_height, + const int padded_in_width, + const int padded_in_height, + const int in_channels, + const int in_count_per_sample, + const int pad, + const float *input_a, + const float *gradient, + float *output_b_gradient); +} // end namespace tensorflow + +#endif // FLOWNET_CORRELATION_H_ diff --git a/Codes/flownet2/src/ops/correlation/correlation_op.cc b/Codes/flownet2/src/ops/correlation/correlation_op.cc new file mode 100644 index 0000000..4f420f0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/correlation_op.cc @@ -0,0 +1,83 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +Status SetOutput(InferenceContext *c) { + ShapeHandle input_a, input_b, input; + + // Get shapes of both inputs and verify they are rank 4 + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_b)); + + // Verify inputs are same dimensions + TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input)); + + // Get the attributes + int kernel_size, max_displacement, stride_1, stride_2, pad; + TF_RETURN_IF_ERROR(c->GetAttr("kernel_size", &kernel_size)); + TF_RETURN_IF_ERROR(c->GetAttr("max_displacement", &max_displacement)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_1", &stride_1)); + TF_RETURN_IF_ERROR(c->GetAttr("stride_2", &stride_2)); + TF_RETURN_IF_ERROR(c->GetAttr("pad", &pad)); + + // Get dimensions of input (already padded) + int64 batch = c->Value(c->Dim(input, 0)); + int64 input_height = c->Value(c->Dim(input, 1)); + int64 input_width = c->Value(c->Dim(input, 2)); + int64 padded_height = 
input_height + 2 * pad; + int64 padded_width = input_width + 2 * pad; + + // The size of unreachable border region on each side + int kernel_radius = (kernel_size - 1) / 2; + int border_size = max_displacement + kernel_radius; + + // Calculate the output dimensions + int64 output_height = (int64)ceil((float)(padded_height - border_size * 2) / (float)stride_1); + int64 output_width = (int64)ceil((float)(padded_width - border_size * 2) / (float)stride_1); + + // TODO: Verify output size >= 1 + + int neighborhood_grid_radius = max_displacement / stride_2; + int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1; + int64 output_channels = neighborhood_grid_width * neighborhood_grid_width; + + // Set output shape + c->set_output(0, c->MakeShape({ batch, output_height, output_width, output_channels })); + return Status::OK(); +} + +REGISTER_OP("Correlation") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("output: float32") +.SetShapeFn(SetOutput); + +REGISTER_OP("CorrelationGrad") +.Input("gradients: float32") +.Input("input_a: float32") +.Input("input_b: float32") +.Attr("kernel_size: int") +.Attr("max_displacement: int") +.Attr("stride_1: int") +.Attr("stride_2: int") +.Attr("pad: int") +.Output("backprops_a: float32") +.Output("backprops_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Output gradients should be the same dimensions as the inputs + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->input(2), &out)); + c->set_output(0, out); + c->set_output(1, out); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/correlation/pad.cu.cc b/Codes/flownet2/src/ops/correlation/pad.cu.cc new file mode 100644 index 0000000..0b6c93d --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.cu.cc @@ -0,0 +1,76 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "pad.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void PadData( + const float *in, + int in_widthheight, + int in_width, + int in_height, + int out_width, + int out_height, + int channels, + int padding, + float *out) { + int xy = blockIdx.x * blockDim.x + threadIdx.x; + + int x = xy % in_width; + int y = xy / in_width; + int ch = blockIdx.y; + int n = blockIdx.z; + + if (xy >= in_widthheight) { + out[((n * out_height + y) * out_width + x) * channels + ch] = 0.0; + return; + } + + float value = in[((n * in_height + y) * in_width + x) * channels + ch]; + + __syncthreads(); + + int xpad = x + padding; + int ypad = y + padding; + + out[((n * out_height + ypad) * out_width + xpad) * channels + ch] = value; +} + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output) { + int in_widthheight = input_width * input_height; + int threads_per_block = 16; + dim3 totalBlocks((in_widthheight - 1) / threads_per_block + 1, input_channels, batch_size); + + cudaMemset(output, 0, batch_size * output_height * output_width * input_channels * sizeof(float)); + + int padding = (output_height - input_height) / 2; + + // LAUNCH KERNEL + PadData << < totalBlocks, threads_per_block, 0, device.stream() >> > ( + input, + in_widthheight, + input_width, + input_height, + output_width, + 
output_height, + input_channels, + padding, + output); +} +} +#endif // if GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/correlation/pad.h b/Codes/flownet2/src/ops/correlation/pad.h new file mode 100644 index 0000000..afb4df0 --- /dev/null +++ b/Codes/flownet2/src/ops/correlation/pad.h @@ -0,0 +1,20 @@ +#ifndef FLOWNET_PAD_H_ +#define FLOWNET_PAD_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void Pad(const GPUDevice& device, + const float *input, + int batch_size, + int input_height, + int input_width, + int input_channels, + int output_height, + int output_width, + float *output); +} // end namespace tensorflow + +#endif // ifndef FLOWNET_PAD_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc new file mode 100644 index 0000000..eefe247 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.cc @@ -0,0 +1,47 @@ +#define EIGEN_USE_THREADS + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +class DownsampleKernel : public OpKernel { + public: + explicit DownsampleKernel(OpKernelConstruction* ctx) : OpKernel(ctx) { + // Get the size [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("size", &size_)); + OP_REQUIRES(ctx, size_.size() == 2, errors::InvalidArgument("size must be 2 dimensions")); + } + + void Compute(OpKernelContext* ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& input_t = ctx->input(0); + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + + // Allocate the memory for the output + Tensor* output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({input_t.dim_size(0), size_[0], size_[1], input_t.dim_size(3)}), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + Downsample(ctx->eigen_gpu_device(), input, output); + } + + private: + std::vector<int32> size_; +}; + +REGISTER_KERNEL_BUILDER(Name("Downsample") + .Device(DEVICE_GPU), + DownsampleKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel.h b/Codes/flownet2/src/ops/downsample/downsample_kernel.h new file mode 100644 index 0000000..bcc4e3f --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel.h @@ -0,0 +1,18 @@ +#ifndef FLOWNET_DOWNSAMPLE_H_ +#define FLOWNET_DOWNSAMPLE_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output); + +} // end namespace tensorflow + +#endif // FLOWNET_DOWNSAMPLE_H_ diff --git a/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc new file mode 100644 index 0000000..b7629a0 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_kernel_gpu.cu.cc @@ -0,0 +1,108 
@@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "downsample_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define CUDART_NAN_F __int_as_float(0x7fffffff) + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ void DownsampleKernel( + const int32 nthreads, + const float* input_ptr, + float* output_ptr, + const int in_width, + const int in_height, + const int out_width, + const int out_height, + const int channels, + const float width_scale, + const float height_scale, + const int wradius, + const int hradius) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int c = index % channels; + const int destx = (index / channels) % out_width; + const int desty = (index / channels / out_width) % out_height; + const int n = (index / channels / out_width) / out_height; + + const float srcx = ((float)destx / (float)(out_width - 1)) * (float)(in_width - 1); + const float srcy = ((float)desty / (float)(out_height - 1)) * (float)(in_height - 1); + + const int isrcx = round(srcx); + const int isrcy = round(srcy); + + float accum_value = 0; + float accum_weight = 0; + float accum_nan = 0; + + for (int dy = -hradius; dy <= hradius; dy++) { + int yoff = isrcy + dy; + // + for (int dx = -wradius; dx <= wradius; dx++) { + int xoff = isrcx + dx; + + if (xoff >= 0 && yoff >= 0 && xoff < in_width && yoff < in_height) { + int idx = ((n * in_height + yoff) * in_width + xoff) * channels + c; + float sample = input_ptr[idx]; + float weight = fmaxf(0.0f, 1.0f - (fabsf((float)xoff - srcx) / width_scale)) + * fmaxf(0.0f, 1.0f - (fabsf((float)yoff - srcy) / height_scale)); + if (sample != sample) { // isnan + accum_nan += weight; + sample = 0; + weight = 0; + } + accum_value += sample * weight; + accum_weight += weight; + } + } + } + + if (accum_nan / accum_weight > 0.5) { + output_ptr[index] = CUDART_NAN_F; + } else { + output_ptr[index] = accum_value / accum_weight; + } + } +} + +bool Downsample(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int out_channels = output.dimension(3); + const int total_count = batch_size * out_height * out_width * out_channels; + + const int in_height = input.dimension(1); + const int in_width = input.dimension(2); + + const float width_scale = (float)(in_width - 1) / (float)(out_width - 1); + const float height_scale = (float)(in_height - 1) / (float)(out_height - 1); + + const int wradius = ceil(width_scale); + const int hradius = ceil(height_scale); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, device); + DownsampleKernel<<<config.block_count, config.thread_per_block, 0, + device.stream()>>>(total_count, input.data(), output.data(), + in_width, in_height, out_width, out_height, out_channels, + width_scale, height_scale, wradius, hradius); + return device.ok(); +} + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/downsample/downsample_op.cc b/Codes/flownet2/src/ops/downsample/downsample_op.cc new file mode 100644 index 0000000..6980dc7 --- /dev/null +++ b/Codes/flownet2/src/ops/downsample/downsample_op.cc @@ -0,0 +1,30 @@ 
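+// Shape function for the Downsample op: the output keeps the input's batch and
+// channel dimensions and takes its height/width from the "size" attribute,
+// mirroring the output allocation in downsample_kernel.cc.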
+#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> size_; + c->GetAttr("size", &size_); + DimensionHandle height = c->MakeDim(size_[0]); + DimensionHandle width = c->MakeDim(size_[1]); + c->set_output(0, c->MakeShape({batch, height, width, depth})); + return Status::OK(); +} + +REGISTER_OP("Downsample") + .Input("input: float32") + .Attr("size: list(int) >= 2") + .Output("output: float32") + .SetShapeFn(SetOutputToSizedImage); + +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc new file mode 100644 index 0000000..b5d9602 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cc @@ -0,0 +1,48 @@ +#define EIGEN_USE_THREADS + +#include "flow_warp.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +template<typename Device> +class FlowWarpKernel : public OpKernel { + public: + explicit FlowWarpKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext *ctx) override { + // Get the input image and flow and verify dimensions + const Tensor& input_t = ctx->input(0); + const Tensor& flow_t = ctx->input(1); + + OP_REQUIRES(ctx, input_t.dims() == 4, + errors::InvalidArgument("Input image must have rank 4")); + OP_REQUIRES(ctx, flow_t.dims() == 4, + errors::InvalidArgument("Input flow must have rank 4")); + OP_REQUIRES(ctx, + input_t.dim_size(0) == flow_t.dim_size(0) && input_t.dim_size( + 1) == flow_t.dim_size(1) && input_t.dim_size(2) == flow_t.dim_size(2), + errors::InvalidArgument( + "Input image and flow must have same N x H x W dimensions")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input_t.shape(), &output_t)); + + // Perform flow augmentation + auto input = input_t.tensor<float, 4>(); + auto flow = flow_t.tensor<float, 4>(); + auto output = output_t->tensor<float, 4>(); + + FlowWarp(ctx->eigen_gpu_device(), input, flow, output); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FlowWarp") + .Device(DEVICE_GPU), + FlowWarpKernel<GPUDevice>) +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc new file mode 100644 index 0000000..2007151 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.cu.cc @@ -0,0 +1,130 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_warp.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define RA_TILE 32 +#define RA_ROWS 8 + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +__global__ void FlowWarpKernel( + const float 
*image, + const float *flow, + float *warped, + const int batch_size, + const int channels, + const int cblocks, + const int width, + const int wblocks, + const int height, + const int width_height) { + int y = blockIdx.y; + int n = blockIdx.z; + + __shared__ float x2_buf[FW_TILE_X], y2_buf[FW_TILE_X]; + __shared__ float buffer[FW_TILE_C][FW_TILE_X + 1]; + + int x; + int c; + + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((threadIdx.y == 0) && (x < width)) { + const int idx = ((n * height + y) * width + x) * 2; + x2_buf[threadIdx.x] = float(x) + flow[idx]; + y2_buf[threadIdx.x] = float(y) + flow[idx + 1]; + } + + __syncthreads(); + + float x2 = x2_buf[threadIdx.y]; + float y2 = y2_buf[threadIdx.y]; + + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + int off_TL = ((n * height + iy2_T) * width + ix2_L) * channels; + int off_TR = ((n * height + iy2_T) * width + ix2_R) * channels; + int off_BL = ((n * height + iy2_B) * width + ix2_L) * channels; + int off_BR = ((n * height + iy2_B) * width + ix2_R) * channels; + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + float coeffTL = (1 - alpha) * (1 - beta); + float coeffTR = alpha * (1 - beta); + float coeffBL = (1 - alpha) * beta; + float coeffBR = alpha * beta; + + for (int cb = 0; cb < cblocks; cb++) { + __syncthreads(); + + buffer[threadIdx.y][threadIdx.x] = 0.0; + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.x; + + if ((x2 >= 0) && (y2 >= 0) && (x2 < width) && (y2 < height) && (c < channels)) { + buffer[threadIdx.y][threadIdx.x] = // buffer [x][c] + coeffTL * image[off_TL + c] + + coeffTR * image[off_TR + c] + + coeffBL * image[off_BL + c] + + coeffBR * image[off_BR + c]; + } + + __syncthreads(); + + c = cb * FW_TILE_C + threadIdx.y; + x = blockIdx.x * FW_TILE_X + threadIdx.x; + + if ((c < channels) && (x < width)) { + warped[((n * height + y) * width + x) * channels + c] = buffer[threadIdx.x][threadIdx.y]; + } + } +} + +void FlowWarp(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor input, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::Tensor output) { + const int batch_size = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + const int width_height = width * height; + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, FW_TILE_C); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(output.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + input.data(), + flow.data(), + output.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp.h b/Codes/flownet2/src/ops/flow_warp/flow_warp.h new file mode 100644 index 0000000..2780316 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp.h @@ -0,0 +1,28 @@ +#ifndef FLOWNET_FLOWWARP_H_ +#define FLOWNET_FLOWWARP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +#define FW_THREADS 32 +#define FW_TILE_X FW_THREADS +#define FW_TILE_C FW_THREADS + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +void 
FlowWarp(const GPUDevice& device,
+              typename TTypes<float, 4>::ConstTensor input,
+              typename TTypes<float, 4>::ConstTensor flow,
+              typename TTypes<float, 4>::Tensor output);
+
+void FlowWarpGrad(const GPUDevice& device,
+                  typename TTypes<float, 4>::ConstTensor image,
+                  typename TTypes<float, 4>::ConstTensor flow,
+                  typename TTypes<float, 4>::ConstTensor gradient,
+                  typename TTypes<float, 4>::Tensor image_grad,
+                  typename TTypes<float, 4>::Tensor flow_grad);
+} // end namespace tensorflow
+
+#endif // FLOWNET_FLOWWARP_H_
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc
new file mode 100644
index 0000000..9f3e7ea
--- /dev/null
+++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cc
@@ -0,0 +1,57 @@
+#define EIGEN_USE_THREADS
+
+#include "flow_warp.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+template<typename Device>
+class FlowWarpGradKernel : public OpKernel {
+  public:
+    explicit FlowWarpGradKernel(OpKernelConstruction *ctx) : OpKernel(ctx) {}
+
+    void Compute(OpKernelContext *ctx) override {
+      // Get the input image and flow and verify dimensions
+      const Tensor& image_t = ctx->input(0);
+      const Tensor& flow_t = ctx->input(1);
+      const Tensor& grad_t = ctx->input(2);
+
+      OP_REQUIRES(ctx, image_t.dims() == 4,
+                  errors::InvalidArgument("Input image must have rank 4"));
+      OP_REQUIRES(ctx, flow_t.dims() == 4,
+                  errors::InvalidArgument("Input flow must have rank 4"));
+      OP_REQUIRES(ctx,
+                  image_t.dim_size(0) == flow_t.dim_size(0) && image_t.dim_size(
+                    1) == flow_t.dim_size(1) && image_t.dim_size(2) == flow_t.dim_size(2),
+                  errors::InvalidArgument(
+                    "Input image and flow must have same N x H x W dimensions"));
+
+      // Allocate the memory for the outputs: output 0 is the image gradient,
+      // output 1 is the flow gradient
+      Tensor *image_grad_t;
+      Tensor *flow_grad_t;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, image_t.shape(), &image_grad_t));
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, flow_t.shape(), &flow_grad_t));
+
+      auto image = image_t.tensor<float, 4>();
+      auto flow = flow_t.tensor<float, 4>();
+      auto gradient = grad_t.tensor<float, 4>();
+      auto image_grad = image_grad_t->tensor<float, 4>();
+      auto flow_grad = flow_grad_t->tensor<float, 4>();
+
+      FlowWarpGrad(ctx->eigen_gpu_device(),
+                   image,
+                   flow,
+                   gradient,
+                   image_grad,
+                   flow_grad);
+    }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FlowWarpGrad")
+                        .Device(DEVICE_GPU),
+                        FlowWarpGradKernel<GPUDevice>)
+} // end namespace tensorflow
diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc
new file mode 100644
index 0000000..25248c8
--- /dev/null
+++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_grad.cu.cc
@@ -0,0 +1,126 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "flow_warp.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+__global__ void FlowWarpGradKernel(
+  const float *image,
+  float *image_grad,
+  const float *flow,
+  float *flow_grad,
+  const float *gradient,
+  int batch_size,
+  int channels,
+  int cblocks,
+  int width,
+  int wblocks,
+  int height,
+  int widthheight) {
+  int x = blockIdx.x * FW_TILE_X + threadIdx.x;
+
+  if (x >= width) return;
+
+  int y = blockIdx.y;
+  int n = blockIdx.z;
+
+  const int flow_idx = ((n * height + y) * width + x) * 2;
+  float x2 = float(x) + flow[flow_idx];
+  float y2 = float(y) +
flow[flow_idx + 1]; + + if ((x2 >= 0.f) && (y2 >= 0.f) && (x2 < width) && (y2 < height)) { + int ix2_L = int(x2); + int iy2_T = int(y2); + int ix2_R = min(ix2_L + 1, width - 1); + int iy2_B = min(iy2_T + 1, height - 1); + + float alpha = x2 - ix2_L; + float beta = y2 - iy2_T; + + for (int c = 0; c < channels; c++) { + float warped_diff_value = gradient[((n * height + y) * width + x) * channels + c]; + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_T) * width + ix2_R) * channels + c], + warped_diff_value * alpha * (1 - beta)); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_L) * channels + c], + warped_diff_value * (1 - alpha) * beta); + atomicAdd(&image_grad[((n * height + iy2_B) * width + ix2_R) * channels + c], + warped_diff_value * alpha * beta); + } + + float gamma = iy2_B - y2; + float bot_diff = 0; + + for (int c = 0; c < channels; c++) { + int ch_off = (n * channels + c) * height; + float temp = 0; + temp += gamma * + (image[((n * height + iy2_T) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_B) * width + ix2_L) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2] = bot_diff; + + gamma = ix2_R - x2; + bot_diff = 0; + + for (int c = 0; c < channels; c++) { + float temp = 0; + temp += gamma * + (image[((n * height + iy2_B) * width + ix2_L) * channels + c] - + image[((n * height + iy2_T) * width + ix2_L) * channels + c]); + temp += (1 - gamma) * + (image[((n * height + iy2_B) * width + ix2_R) * channels + c] - + image[((n * height + iy2_T) * width + ix2_R) * channels + c]); + + bot_diff += gradient[((n * height + y) * width + x) * channels + c] * temp; + } + flow_grad[((n * height + y) * width + x) * 2 + 1] = bot_diff; + } +} + +void FlowWarpGrad(const GPUDevice& device, + typename TTypes<float, 4>::ConstTensor image, + typename TTypes<float, 4>::ConstTensor flow, + typename TTypes<float, 4>::ConstTensor gradient, + typename TTypes<float, 4>::Tensor image_grad, + typename TTypes<float, 4>::Tensor flow_grad) { + const int batch_size = image.dimension(0); + const int height = image.dimension(1); + const int width = image.dimension(2); + const int channels = image.dimension(3); + const int width_height = width * height; + + int wblocks = ((width - 1) / FW_TILE_X + 1); + int cblocks = ((channels - 1) / FW_TILE_C + 1); + dim3 warpThreads(FW_TILE_X, 1); + dim3 warpBlocks(wblocks, height, batch_size); + + cudaMemset(image_grad.data(), 0, batch_size * height * width * channels * sizeof(float)); + cudaMemset(flow_grad.data(), 0, batch_size * height * width * 2 * sizeof(float)); + + FlowWarpGradKernel << < warpBlocks, warpThreads, 0, device.stream() >> > ( + image.data(), + image_grad.data(), + flow.data(), + flow_grad.data(), + gradient.data(), + batch_size, + channels, + cblocks, + width, + wblocks, + height, + width_height); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc new file mode 100644 index 0000000..aef9c74 --- /dev/null +++ b/Codes/flownet2/src/ops/flow_warp/flow_warp_op.cc @@ -0,0 +1,23 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include 
"tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +REGISTER_OP("FlowWarp") +.Input("image: float32") +.Input("flow: float32") +.Output("output: float32") +.SetShapeFn(::tensorflow::shape_inference::UnchangedShape); + +REGISTER_OP("FlowWarpGrad") +.Input("image: float32") +.Input("flow: float32") +.Input("gradient: float32") +.Output("image_grad: float32") +.Output("flow_grad: float32") +.SetShapeFn([](shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + return Status::OK(); + }); +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc new file mode 100644 index 0000000..b93dfa6 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.cc @@ -0,0 +1,420 @@ +#include "augmentation_base.h" + +#include <math.h> +#include <random> + +namespace tensorflow { +/** TransMat Functions **/ +void AugmentationLayerBase::TransMat::fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height) { + leftMultiply(1, 0, -0.5 * out_width, + 0, 1, -0.5 * out_height); + + if (coeff->angle) { + leftMultiply(cos(coeff->angle()), -sin(coeff->angle()), 0, + sin(coeff->angle()), cos(coeff->angle()), 0); + } + + if (coeff->dx || coeff->dy) { + leftMultiply(1, 0, coeff->dx() * out_width, + 0, 1, coeff->dy() * out_height); + } + + if (coeff->zoom_x || coeff->zoom_y) { + leftMultiply(1.0 / coeff->zoom_x(), 0, 0, + 0, 1.0 / coeff->zoom_y(), 0); + } + + leftMultiply(1, 0, 0.5 * src_width, + 0, 1, 0.5 * src_height); +} + +void AugmentationLayerBase::TransMat::fromTensor(const float *tensor_data) { + t0 = tensor_data[0]; + t1 = tensor_data[1]; + t2 = tensor_data[2]; + t3 = tensor_data[3]; + t4 = tensor_data[4]; + t5 = tensor_data[5]; +} + +AugmentationLayerBase::TransMat AugmentationLayerBase::TransMat::inverse() { + float a = this->t0, b = this->t1, c = this->t2; + float d = this->t3, e = this->t4, f = this->t5; + + float denom = a * e - b * d; + + TransMat result; + + result.t0 = e / denom; + result.t1 = b / -denom; + result.t2 = (c * e - b * f) / -denom; + result.t3 = d / -denom; + result.t4 = a / denom; + result.t5 = (c * d - a * f) / denom; + + return result; +} + +void AugmentationLayerBase::TransMat::leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5) { + float t0 = this->t0, t1 = this->t1, t2 = this->t2; + float t3 = this->t3, t4 = this->t4, t5 = this->t5; + + this->t0 = t0 * u0 + t3 * u1; + this->t1 = t1 * u0 + t4 * u1; + this->t2 = t2 * u0 + t5 * u1 + u2; + this->t3 = t0 * u3 + t3 * u4; + this->t4 = t1 * u3 + t4 * u4; + this->t5 = t2 * u3 + t5 * u4 + u5; +} + +void AugmentationLayerBase::TransMat::toIdentity() { + t0 = 1; t1 = 0; t2 = 0; + t3 = 0; t4 = 1; t5 = 0; +} + +/** AugmentationCoeff Functions **/ +void AugmentationCoeff::clear() { + // Spatial variables + dx.clear(); + dy.clear(); + angle.clear(); + zoom_x.clear(); + zoom_y.clear(); + + // Chromatic variables + gamma.clear(); + brightness.clear(); + contrast.clear(); + color1.clear(); + color2.clear(); + color3.clear(); +} + +void AugmentationCoeff::combine_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = dx() * coeff.dx(); + } + + if (coeff.dy) { + dy = dy() * coeff.dy(); + } + + if (coeff.angle) { + angle = angle() * coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = zoom_x() * 
coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = zoom_y() * coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = brightness() * coeff.brightness(); + } + + if (coeff.contrast) { + contrast = contrast() * coeff.contrast(); + } + + if (coeff.color1) { + color1 = color1() * coeff.color1(); + } + + if (coeff.color2) { + color2 = color2() * coeff.color2(); + } + + if (coeff.color3) { + color3 = color3() * coeff.color3(); + } +} + +void AugmentationCoeff::replace_with(const AugmentationCoeff& coeff) { + // Spatial types + if (coeff.dx) { + dx = coeff.dx(); + } + + if (coeff.dy) { + dy = coeff.dy(); + } + + if (coeff.angle) { + angle = coeff.angle(); + } + + if (coeff.zoom_x) { + zoom_x = coeff.zoom_x(); + } + + if (coeff.zoom_y) { + zoom_y = coeff.zoom_y(); + } + + // Chromatic types + if (coeff.gamma) { + gamma = gamma() * coeff.gamma(); + } + + if (coeff.brightness) { + brightness = coeff.brightness(); + } + + if (coeff.contrast) { + contrast = coeff.contrast(); + } + + if (coeff.color1) { + color1 = coeff.color1(); + } + + if (coeff.color2) { + color2 = coeff.color2(); + } + + if (coeff.color3) { + color3 = coeff.color3(); + } +} + +/** AugmentationLayerBase Functions **/ +float AugmentationLayerBase::rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value) { + std::random_device rd; // Will be used to obtain a seed for the random number + // engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + + float spread = param.spread * discount_coeff; + + if (param.rand_type == "uniform_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (param.spread > 0.0) { + std::uniform_real_distribution<> uniform(param.mean - spread, + param.mean + spread); + tmp1 = uniform(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else if (param.rand_type == "gaussian_bernoulli") { + float tmp1 = 0.0; + bool tmp2 = false; + + if (param.prob > 0.0) { + std::bernoulli_distribution bernoulli(param.prob); + tmp2 = bernoulli(gen); + } + + if (!tmp2) { + return default_value; + } + + if (spread > 0.0) { + std::normal_distribution<> normal(param.mean, spread); + tmp1 = normal(gen); + } else { + tmp1 = param.mean; + } + + if (param.should_exp) { + tmp1 = exp(tmp1); + } + + return tmp1; + } else { + throw "Unknown random type: " + param.rand_type; + } +} + +void AugmentationLayerBase::generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff) { + if (aug.gamma) { + coeff.gamma = rng_generate(aug.gamma(), discount_coeff, coeff.gamma.get_default()); + } + + if (aug.brightness) { + coeff.brightness = + rng_generate(aug.brightness(), discount_coeff, coeff.brightness.get_default()); + } + + if (aug.contrast) { + coeff.contrast = rng_generate(aug.contrast(), discount_coeff, coeff.contrast.get_default()); + } + + if (aug.color) { + coeff.color1 = rng_generate(aug.color(), discount_coeff, coeff.color1.get_default()); + coeff.color2 = rng_generate(aug.color(), discount_coeff, coeff.color2.get_default()); + coeff.color3 = rng_generate(aug.color(), discount_coeff, coeff.color3.get_default()); + } +} + +void AugmentationLayerBase::generate_spatial_coeffs(float discount_coeff, + const 
AugmentationParams& aug,
+                                                   AugmentationCoeff & coeff) {
+  if (aug.translate) {
+    coeff.dx = rng_generate(aug.translate(), discount_coeff, coeff.dx.get_default());
+    coeff.dy = rng_generate(aug.translate(), discount_coeff, coeff.dy.get_default());
+  }
+
+  if (aug.rotate) {
+    coeff.angle = rng_generate(aug.rotate(), discount_coeff, coeff.angle.get_default());
+  }
+
+  if (aug.zoom) {
+    coeff.zoom_x = rng_generate(aug.zoom(), discount_coeff, coeff.zoom_x.get_default());
+    coeff.zoom_y = coeff.zoom_x();
+  }
+
+  if (aug.squeeze) {
+    float squeeze_coeff = rng_generate(aug.squeeze(), discount_coeff, 1.0);
+    coeff.zoom_x = coeff.zoom_x() * squeeze_coeff;
+    coeff.zoom_y = coeff.zoom_y() * squeeze_coeff;
+  }
+}
+
+void AugmentationLayerBase::generate_valid_spatial_coeffs(
+  float discount_coeff,
+  const AugmentationParams& aug,
+  AugmentationCoeff & coeff,
+  int src_width,
+  int src_height,
+  int out_width,
+  int out_height) {
+  int x, y;
+  float x1, y1, x2, y2;
+  int counter = 0;
+  int good_params = 0;
+  AugmentationCoeff incoming_coeff(coeff);
+
+  while (good_params < 4 && counter < 50) {
+    coeff.clear();
+    AugmentationLayerBase::generate_spatial_coeffs(discount_coeff, aug, coeff);
+    coeff.combine_with(incoming_coeff);
+
+    // Check if all 4 corners of the transformed image fit into the original
+    // image
+    good_params = 0;
+
+    for (x = 0; x < out_width; x += out_width - 1) {
+      for (y = 0; y < out_height; y += out_height - 1) {
+        // move the origin
+        x1 = x - 0.5 * out_width;
+        y1 = y - 0.5 * out_height;
+
+        // rotate
+        x2 = cos(coeff.angle()) * x1 - sin(coeff.angle()) * y1;
+        y2 = sin(coeff.angle()) * x1 + cos(coeff.angle()) * y1;
+
+        // translate
+        x2 = x2 + coeff.dx() * out_width;
+        y2 = y2 + coeff.dy() * out_height;
+
+        // zoom
+        x2 = x2 / coeff.zoom_x();
+        y2 = y2 / coeff.zoom_y();
+
+        // move the origin back
+        x2 = x2 + 0.5 * src_width;
+        y2 = y2 + 0.5 * src_height;
+
+        if (!((floor(x2) < 0) || (floor(x2) > src_width - 2.0) ||
+              (floor(y2) < 0) || (floor(y2) > src_height - 2.0))) {
+          good_params++;
+        }
+      }
+    }
+    counter++;
+  }
+
+  if (counter >= 50) {
+    printf("Warning: No suitable spatial transformation after %d attempts.\n", counter);
+    coeff.clear();
+    coeff.replace_with(incoming_coeff);
+  }
+}
+
+void AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(
+  const std::vector<AugmentationCoeff>& coeff_arr,
+  typename TTypes<float, 2>::Tensor& out)
+{
+  float *out_ptr = out.data();
+  int counter = 0;
+
+  for (AugmentationCoeff coeff : coeff_arr) {
+    out_ptr[counter + 0] = coeff.gamma();
+    out_ptr[counter + 1] = coeff.brightness();
+    out_ptr[counter + 2] = coeff.contrast();
+    out_ptr[counter + 3] = coeff.color1();
+    out_ptr[counter + 4] = coeff.color2();
+    out_ptr[counter + 5] = coeff.color3();
+    counter += 6;
+  }
+}
+
+void AugmentationLayerBase::copy_spatial_coeffs_to_tensor(
+  const std::vector<AugmentationCoeff>& coeff_arr,
+  const int out_width,
+  const int out_height,
+  const int src_width,
+  const int src_height,
+  typename TTypes<float, 2>::Tensor& out,
+  const bool invert)
+{
+  float *out_ptr = out.data();
+  int counter = 0;
+  TransMat t;
+
+  for (AugmentationCoeff coeff : coeff_arr) {
+    t.toIdentity();
+    t.fromCoeff(&coeff, out_width, out_height, src_width, src_height);
+
+    if (invert) {
+      t = t.inverse();
+    }
+
+    out_ptr[counter + 0] = t.t0;
+    out_ptr[counter + 1] = t.t1;
+    out_ptr[counter + 2] = t.t2;
+    out_ptr[counter + 3] = t.t3;
+    out_ptr[counter + 4] = t.t4;
+    out_ptr[counter + 5] = t.t5;
+    counter += 6;
+  }
+}
+}
diff --git 
a/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h new file mode 100644 index 0000000..d2aba2c --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/augmentation_base.h @@ -0,0 +1,228 @@ +#ifndef AUGMENTATION_LAYER_BASE_H_ +#define AUGMENTATION_LAYER_BASE_H_ + +#include "tensorflow/core/framework/tensor_types.h" + +#include <iostream> +#include <string> +#include <vector> + +namespace tensorflow { +template<typename T> +class OptionalType { + public: + OptionalType(const T default_value) : default_value(default_value), has_value(false) {} + + operator bool() const { + return has_value; + } + + OptionalType& operator=(T val) { + has_value = true; + value = val; + return *this; + } + + const T operator()() const { + return has_value ? value : default_value; + } + + void clear() { + has_value = false; + } + + const T get_default() { + return default_value; + } + + private: + T value; + bool has_value; + const T default_value; +}; + +class AugmentationCoeff { + public: + // Spatial Types + OptionalType<float>dx; + OptionalType<float>dy; + OptionalType<float>angle; + OptionalType<float>zoom_x; + OptionalType<float>zoom_y; + + // Chromatic Types + OptionalType<float>gamma; + OptionalType<float>brightness; + OptionalType<float>contrast; + OptionalType<float>color1; + OptionalType<float>color2; + OptionalType<float>color3; + + AugmentationCoeff() : dx(0.0), dy(0.0), angle(0.0), zoom_x(1.0), zoom_y(1.0), gamma(1.0), + brightness(0.0), contrast(1.0), color1(1.0), color2(1.0), color3(1.0) {} + + AugmentationCoeff(const AugmentationCoeff& coeff) : AugmentationCoeff() { + replace_with(coeff); + } + + void clear(); + + void combine_with(const AugmentationCoeff& coeff); + + void replace_with(const AugmentationCoeff& coeff); +}; + +typedef struct AugmentationParam { + std::string rand_type; + bool should_exp; + float mean; + float spread; + float prob; +} AugmentationParam; + +class AugmentationParams { + public: + int crop_height; + int crop_width; + + // Spatial options + OptionalType<struct AugmentationParam>translate; + OptionalType<struct AugmentationParam>rotate; + OptionalType<struct AugmentationParam>zoom; + OptionalType<struct AugmentationParam>squeeze; + + // Chromatic options + OptionalType<struct AugmentationParam>gamma; + OptionalType<struct AugmentationParam>brightness; + OptionalType<struct AugmentationParam>contrast; + OptionalType<struct AugmentationParam>color; + + inline AugmentationParams(int crop_height, + int crop_width, + std::vector<std::string>params_name, + std::vector<std::string>params_rand_type, + std::vector<bool> params_exp, + std::vector<float> params_mean, + std::vector<float> params_spread, + std::vector<float> params_prob) : + crop_height(crop_height), + crop_width(crop_width), + translate(AugmentationParam()), + rotate(AugmentationParam()), + zoom(AugmentationParam()), + squeeze(AugmentationParam()), + gamma(AugmentationParam()), + brightness(AugmentationParam()), + contrast(AugmentationParam()), + color(AugmentationParam()) { + for (int i = 0; i < params_name.size(); i++) { + const std::string name = params_name[i]; + const std::string rand_type = params_rand_type[i]; + const bool should_exp = params_exp[i]; + const float mean = params_mean[i]; + const float spread = params_spread[i]; + const float prob = params_prob[i]; + + struct AugmentationParam param = { rand_type, should_exp, mean, spread, prob }; + + if (name == "translate") { + this->translate = param; + } else if 
(name == "rotate") { + this->rotate = param; + } else if (name == "zoom") { + this->zoom = param; + } else if (name == "squeeze") { + this->squeeze = param; + } else if (name == "noise") { + // NoOp: We handle noise on the Python side + } else if (name == "gamma") { + this->gamma = param; + } else if (name == "brightness") { + this->brightness = param; + } else if (name == "contrast") { + this->contrast = param; + } else if (name == "color") { + this->color = param; + } else { + std::cout << "Ignoring unknown augmentation parameter: " << name << std::endl; + } + } + } + + bool should_do_spatial_transform() { + return this->translate || this->rotate || this->zoom || this->squeeze; + } + + bool should_do_chromatic_transform() { + return this->gamma || this->brightness || this->contrast || this->color; + } +}; + +class AugmentationLayerBase { + public: + class TransMat { + /** + * Translation matrix class for spatial augmentation + * | 0 1 2 | + * | 3 4 5 | + */ + + public: + float t0, t1, t2; + float t3, t4, t5; + + + void fromCoeff(AugmentationCoeff *coeff, + int out_width, + int out_height, + int src_width, + int src_height); + + void fromTensor(const float *tensor_data); + + TransMat inverse(); + + void leftMultiply(float u0, + float u1, + float u2, + float u3, + float u4, + float u5); + + void toIdentity(); + }; + + // TODO: Class ChromaticCoeffs + + static float rng_generate(const AugmentationParam& param, + float discount_coeff, + const float default_value); + + static void clear_spatial_coeffs(AugmentationCoeff& coeff); + static void generate_chromatic_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff); + static void generate_valid_spatial_coeffs(float discount_coeff, + const AugmentationParams& aug, + AugmentationCoeff & coeff, + int src_width, + int src_height, + int out_width, + int out_height); + + static void copy_chromatic_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + typename TTypes<float, 2>::Tensor& out); + static void copy_spatial_coeffs_to_tensor(const std::vector<AugmentationCoeff>& coeff_arr, + const int out_width, + const int out_height, + const int src_width, + const int src_height, + typename TTypes<float, 2>::Tensor& out, + const bool invert = false); +}; +} // namespace tensorflow + +#endif // AUGMENTATION_LAYER_BASE_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc new file mode 100644 index 0000000..77b8c83 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cc @@ -0,0 +1,461 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <iostream> +#include <random> +#include <vector> + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline float clamp(float f, float a, float b) { + return 
fmaxf(a, fminf(f, b)); +} + +template<> +void Augment(OpKernelContext *context, + const CPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int64 channel_count = batch_size * out_height * out_width; + const int kCostPerChannel = 10; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + + Shard(worker_threads.num_threads, + worker_threads.workers, + channel_count, + kCostPerChannel, + [batch_size, channels, src_width, + src_height, src_count, out_width, out_height, src_data, + out_data, transMats, chromatic_coeffs]( + int64 start_channel, int64 end_channel) { + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + for (int index = start_channel; index < end_channel; index++) { + int x = index % out_width; + int y = (index / out_width) % out_height; + int n = index / out_width / out_height; + + const float *transMat = transMats + n * 6; + + float gamma, brightness, contrast; + + if (chromatic_coeffs) { + gamma = chromatic_coeffs[n * 6 + 0]; + brightness = chromatic_coeffs[n * 6 + 1]; + contrast = chromatic_coeffs[n * 6 + 2]; + } + + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + int srcTLIdxOffset = ((n * src_height + (int)tly) * src_width + (int)tlx) * channels; + + // ((n * src_height + tly) * src_width + (tlx + 1)) * channels + int srcTRIdxOffset = srcTLIdxOffset + channels; + + // ((n * src_height + (tly + 1)) * src_width + tlx) * channels + int srcBLIdxOffset = srcTLIdxOffset + channels * src_width; + + // ((n * src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + int srcBRIdxOffset = srcTLIdxOffset + channels + channels * src_width; + + // Variables for chromatic transform + int data_index[3]; + float rgb[3]; + float mean_in = 0; + float mean_out = 0; + + for (int c = 0; c < channels; c++) { + // Bilinear interpolation + int srcTLIdx = srcTLIdxOffset + c; + int srcTRIdx = std::min(srcTRIdxOffset + c, src_count); + int srcBLIdx = std::min(srcBLIdxOffset + c, src_count); + int srcBRIdx = std::min(srcBRIdxOffset + c, src_count); + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + if (chromatic_coeffs) { + // Gather data for chromatic transform + data_index[c] = index * channels + c; + rgb[c] = dest; + mean_in += rgb[c]; + + // Note: coeff[3] == color1, coeff[4] == color2, ... 
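+            // Layout of chromatic_coeffs (6 floats per image n):
+            //   [gamma, brightness, contrast, color1, color2, color3]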
+ rgb[c] *= chromatic_coeffs[n * 6 + (3 + c)]; + + mean_out += rgb[c]; + } else { + out_data[index * channels + c] = dest; + } + } + + float brightness_coeff = mean_in / (mean_out + 0.01f); + + if (chromatic_coeffs) { + // Chromatic transformation + for (int c = 0; c < channels; c++) { + // compensate brightness + rgb[c] = clamp(rgb[c] * brightness_coeff, 0.0f, 1.0f); + + // gamma change + rgb[c] = pow(rgb[c], gamma); + + // brightness change + rgb[c] = rgb[c] + brightness; + + // contrast change + rgb[c] = 0.5f + (rgb[c] - 0.5f) * contrast; + + out_data[data_index[c]] = clamp(rgb[c], 0.0f, 1.0f); + } + } + } + }); +} + +template<typename Device> +class DataAugmentation : public OpKernel { + public: + explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + + // TODO: Verify params are all the same length + + // Get the tensors for params_a and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_a_rand_type", ¶ms_a_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", ¶ms_a_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_coeff_schedule", ¶ms_a_coeff_schedule_)); + + // Get the tensors for params_b and verify their dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("params_b_rand_type", ¶ms_b_rand_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", ¶ms_b_spread_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_coeff_schedule", ¶ms_b_coeff_schedule_)); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images + const Tensor& input_a_t = ctx->input(0); + const Tensor& input_b_t = ctx->input(1); + + // Get the global step value + const Tensor& global_step_t = ctx->input(2); + auto global_step_eigen = global_step_t.tensor<int64, 0>(); + const int64 global_step = global_step_eigen.data()[0]; + + // Dimension constants + const int batch_size = input_a_t.dim_size(0); + const int src_height = input_a_t.dim_size(1); + const int src_width = input_a_t.dim_size(2); + const int channels = input_a_t.dim_size(3); + const int src_count = batch_size * src_height * src_width * channels; + const int out_height = crop_[0]; + const int out_width = crop_[1]; + const int out_count = batch_size * out_height * out_width * channels; + + // All tensors for this op + Tensor chromatic_coeffs_a_t; + Tensor chromatic_coeffs_b_t; + + // Allocate the memory for the output images + Tensor *output_a_t; + Tensor *output_b_t; + + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(1, TensorShape({ batch_size, crop_[0], crop_[1], + channels }), &output_b_t)); + + // Allocate the memory for the output spatial transforms + Tensor *spat_transform_a_t; + Tensor *spat_transform_b_t; + + 
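+    // Outputs 2 and 3 hold one flattened 2x3 affine transform (6 floats) per
+    // image; FlowAugmentation consumes these to transform the ground-truth flow
+    // consistently with the image crops.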
OP_REQUIRES_OK(ctx, + ctx->allocate_output(2, TensorShape({ batch_size, 6 }), + &spat_transform_a_t)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(3, TensorShape({ batch_size, 6 }), + &spat_transform_b_t)); + + // Compute discount for coefficients if using a schedule + float discount_coeff_a = 1.0; + float discount_coeff_b = 1.0; + + if (params_a_coeff_schedule_.size() == 3) { + float half_life = params_a_coeff_schedule_[0]; + float initial_coeff = params_a_coeff_schedule_[1]; + float final_coeff = params_a_coeff_schedule_[2]; + discount_coeff_a = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + + if (params_b_coeff_schedule_.size() == 3) { + if (params_a_coeff_schedule_.size() == 3) { + discount_coeff_b = discount_coeff_a; + } else { + float half_life = params_b_coeff_schedule_[0]; + float initial_coeff = params_b_coeff_schedule_[1]; + float final_coeff = params_b_coeff_schedule_[2]; + discount_coeff_b = initial_coeff + (final_coeff - initial_coeff) * + (2.0 / (1.0 + exp(-1.0986 * global_step / half_life)) - 1.0); + } + } + + /*** BEGIN AUGMENTATION TO IMAGE A ***/ + auto input_a = input_a_t.tensor<float, 4>(); + auto output_a = output_a_t->tensor<float, 4>(); + + // Load augmentation parameters for image A + AugmentationParams aug_a = AugmentationParams(out_height, out_width, + params_a_name_, + params_a_rand_type_, + params_a_exp_, + params_a_mean_, + params_a_spread_, + params_a_prob_); + + std::vector<AugmentationCoeff> coeffs_a; + + + bool gen_spatial_transform = aug_a.should_do_spatial_transform(); + bool gen_chromatic_transform = aug_a.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff; + + if (gen_spatial_transform) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_a, aug_a, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_a, aug_a, coeff); + } + + coeffs_a.push_back(coeff); + } + + // Copy spatial coefficients A to the output Tensor on the CPU + // (output for FlowAugmentation) + auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, + out_width, out_height, + src_width, src_height, + spat_transform_a); + + float *chromatic_coeffs_a_data = NULL; + + if (gen_chromatic_transform) { + // Allocate a temporary tensor to hold the chromatic coefficients + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_a_t)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_a = chromatic_coeffs_a_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_a, chromatic_coeffs_a); + chromatic_coeffs_a_data = chromatic_coeffs_a.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_a.data(), + output_a.data(), + spat_transform_a.data(), + chromatic_coeffs_a_data); + + /*** END AUGMENTATION TO IMAGE A ***/ + + /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + AugmentationParams aug_b = AugmentationParams(out_height, out_width, + params_b_name_, + params_b_rand_type_, + params_b_exp_, + params_b_mean_, + params_b_spread_, + params_b_prob_); + + 
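+    // Coefficients for image B start from image A's coefficients (copied via the
+    // AugmentationCoeff copy constructor below) and are combined with freshly
+    // sampled ones, so B's augmentation is applied on top of A's.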
std::vector<AugmentationCoeff> coeffs_b; + + bool gen_spatial_transform_b = aug_b.should_do_spatial_transform(); + bool gen_chromatic_transform_b = aug_b.should_do_chromatic_transform(); + + for (int n = 0; n < batch_size; n++) { + AugmentationCoeff coeff(coeffs_a[n]); + + // If we did a spatial transform on image A, we need to do the same one + // (+ possibly more) on image B + if (gen_spatial_transform_b) { + AugmentationLayerBase::generate_valid_spatial_coeffs(discount_coeff_b, aug_b, coeff, + src_width, src_height, + out_width, out_height); + } + + if (gen_chromatic_transform_b) { + AugmentationLayerBase::generate_chromatic_coeffs(discount_coeff_b, aug_b, coeff); + } + + coeffs_b.push_back(coeff); + } + + /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ + + /*** BEGIN AUGMENTATION TO IMAGE B ***/ + auto input_b = input_b_t.tensor<float, 4>(); + auto output_b = output_b_t->tensor<float, 4>(); + + // Copy spatial coefficients B to the output Tensor on the CPU + auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); + AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b); + + float *chromatic_coeffs_b_data = NULL; + + if (gen_chromatic_transform || gen_chromatic_transform_b) { + // Allocate a temporary tensor to hold the chromatic coefficients + tensorflow::AllocatorAttributes pinned_allocator; + pinned_allocator.set_on_host(true); + pinned_allocator.set_gpu_compatible(true); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({ batch_size, 6 }), + &chromatic_coeffs_b_t, pinned_allocator)); + + // Copy the chromatic coefficients A to a temporary Tensor on the CPU + auto chromatic_coeffs_b = chromatic_coeffs_b_t.tensor<float, 2>(); + AugmentationLayerBase::copy_chromatic_coeffs_to_tensor(coeffs_b, chromatic_coeffs_b); + chromatic_coeffs_b_data = chromatic_coeffs_b.data(); + } + + // Perform augmentation either on CPU or GPU + Augment<Device>( + ctx, + ctx->eigen_device<Device>(), + batch_size, + channels, + src_width, + src_height, + src_count, + out_width, + out_height, + input_b.data(), + output_b.data(), + spat_transform_b.data(), + chromatic_coeffs_b_data); + + // FlowAugmentation needs the inverse + // TODO: To avoid rewriting, can we invert when we read on the + // FlowAugmentation side? 
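+    // The call below overwrites output 3 with the inverse of each image-B
+    // transform (invert = true), which is the form FlowAugmentation reads.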
+ AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, + out_width, out_height, + src_width, src_height, + spat_transform_b, + true); + + /*** END AUGMENTATION TO IMAGE B ***/ + } + + private: + std::vector<int32>crop_; + + // Params A + std::vector<string>params_a_name_; + std::vector<string>params_a_rand_type_; + std::vector<bool>params_a_exp_; + std::vector<float>params_a_mean_; + std::vector<float>params_a_spread_; + std::vector<float>params_a_prob_; + std::vector<float>params_a_coeff_schedule_; + + // Params B + std::vector<string>params_b_name_; + std::vector<string>params_b_rand_type_; + std::vector<bool>params_b_exp_; + std::vector<float>params_b_mean_; + std::vector<float>params_b_spread_; + std::vector<float>params_b_prob_; + std::vector<float>params_b_coeff_schedule_; +}; + + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_CPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<CPUDevice>) + +#if GOOGLE_CUDA + +REGISTER_KERNEL_BUILDER(Name("DataAugmentation") + .Device(DEVICE_GPU) + .HostMemory("global_step") + .HostMemory("transforms_from_a") + .HostMemory("transforms_from_b"), + DataAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc new file mode 100644 index 0000000..7a2101d --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.cu.cc @@ -0,0 +1,348 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "augmentation_base.h" +#include "data_augmentation.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +inline __device__ __host__ float clamp(float f, float a, float b) { + return fmaxf(a, fminf(f, b)); +} + +__global__ void SpatialAugmentation( + const int32 nthreads, + const int src_width, + const int src_height, + const int channels, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + int c = index % channels; + int x = (index / channels) % out_width; + int y = (index / channels / out_width) % out_height; + int n = index / channels / out_width / out_height; + + const float *transMat = transMats + n * 6; + float xpos = x * transMat[0] + y * transMat[1] + transMat[2]; + float ypos = x * transMat[3] + y * transMat[4] + transMat[5]; + + xpos = clamp(xpos, 0.0f, (float)(src_width) - 1.05f); + ypos = clamp(ypos, 0.0f, (float)(src_height) - 1.05f); + + float tlx = floor(xpos); + float tly = floor(ypos); + + // Bilinear interpolation + int srcTLIdx = ((n * src_height + tly) * src_width + tlx) * channels + c; + int srcTRIdx = min((int)(((n * src_height + tly) * src_width + (tlx + 1)) * channels + c), + src_count); + int srcBLIdx = min((int)(((n * src_height + (tly + 1)) * src_width + tlx) * channels + c), + src_count); + int srcBRIdx = min((int)(((n 
* src_height + (tly + 1)) * src_width + (tlx + 1)) * channels + c), + src_count); + + float xdist = xpos - tlx; + float ydist = ypos - tly; + + float dest = (1 - xdist) * (1 - ydist) * src_data[srcTLIdx] + + (xdist) * (ydist) * src_data[srcBRIdx] + + (1 - xdist) * (ydist) * src_data[srcBLIdx] + + (xdist) * (1 - ydist) * src_data[srcTRIdx]; + + out_data[index] = dest; + } +} + +typedef Eigen::GpuDevice GPUDevice; + +template<> +void Augment(OpKernelContext *context, + const GPUDevice& d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs) { + const int out_count = batch_size * out_height * out_width * channels; + CudaLaunchConfig config = GetCudaLaunchConfig(out_count, d); + + printf("Chromatic transform not yet implemented on GPU, ignoring."); + + SpatialAugmentation << < config.block_count, config.thread_per_block, 0, d.stream() >> > ( + config.virtual_thread_count, src_width, src_height, channels, src_count, + out_width, out_height, + src_data, out_data, transMats); +} + +// +// template<typename Device> +// class DataAugmentation : public OpKernel { +// public: +// explicit DataAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { +// // Get the crop [height, width] tensor and verify its dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); +// OP_REQUIRES(ctx, crop_.size() == 2, +// errors::InvalidArgument("crop must be 2 dimensions")); +// +// // TODO: Verify params are all the same length +// +// // Get the tensors for params_a and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_name", ¶ms_a_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_a_rand_type", +// ¶ms_a_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_exp", ¶ms_a_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_mean", ¶ms_a_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_spread", +// ¶ms_a_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_a_prob", ¶ms_a_prob_)); +// +// // Get the tensors for params_b and verify their dimensions +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_name", ¶ms_b_name_)); +// OP_REQUIRES_OK(ctx, +// ctx->GetAttr("params_b_rand_type", +// ¶ms_b_rand_type_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_exp", ¶ms_b_exp_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_mean", ¶ms_b_mean_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_spread", +// ¶ms_b_spread_)); +// OP_REQUIRES_OK(ctx, ctx->GetAttr("params_b_prob", ¶ms_b_prob_)); +// } +// +// void Compute(OpKernelContext *ctx) override { +// const GPUDevice& device = ctx->eigen_gpu_device(); +// +// // Get the input images +// const Tensor& input_a_t = ctx->input(0); +// const Tensor& input_b_t = ctx->input(1); +// +// // Dimension constants +// const int batch_size = input_a_t.dim_size(0); +// const int src_height = input_a_t.dim_size(1); +// const int src_width = input_a_t.dim_size(2); +// const int channels = input_a_t.dim_size(3); +// const int src_count = batch_size * src_height * src_width * channels; +// const int out_height = crop_[0]; +// const int out_width = crop_[1]; +// const int out_count = batch_size * out_height * out_width * channels; +// +// // Allocate the memory for the output images +// Tensor *output_a_t; +// Tensor *output_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(0, TensorShape({ batch_size, +// crop_[0], crop_[1], +// 
channels }), +// &output_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(1, TensorShape({ batch_size, +// crop_[0], crop_[1], +// channels }), +// &output_b_t)); +// +// // Allocate the memory for the output spatial transforms +// Tensor *spat_transform_a_t; +// Tensor *spat_transform_b_t; +// +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(2, TensorShape({ batch_size, 6 }), +// &spat_transform_a_t)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_output(3, TensorShape({ batch_size, 6 }), +// &spat_transform_b_t)); +// +// // Allocate temporary pinned memory for the spatial transforms to be +// used +// // on the GPU +// tensorflow::AllocatorAttributes pinned_allocator; +// pinned_allocator.set_on_host(true); +// pinned_allocator.set_gpu_compatible(true); +// +// Tensor spat_transform_a_pinned_t; +// Tensor spat_transform_b_pinned_t; +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_a_pinned_t, +// pinned_allocator)); +// OP_REQUIRES_OK(ctx, +// ctx->allocate_temp(DataTypeToEnum<float>::value, +// TensorShape({ batch_size, 6 }), +// &spat_transform_b_pinned_t, +// pinned_allocator)); +// auto spat_transform_a_pinned = spat_transform_a_pinned_t.tensor<float, +// 2>(); +// auto spat_transform_b_pinned = spat_transform_b_pinned_t.tensor<float, +// 2>(); +// +// /*** BEGIN AUGMENTATION TO IMAGE A ***/ +// auto input_a = input_a_t.tensor<float, 4>(); +// auto output_a = output_a_t->tensor<float, 4>(); +// +// // Load augmentation parameters for image A +// AugmentationParams aug_a = AugmentationParams(out_height, out_width, +// params_a_name_, +// params_a_rand_type_, +// params_a_exp_, +// params_a_mean_, +// params_a_spread_, +// params_a_prob_); +// +// std::vector<AugmentationCoeff> coeffs_a; +// +// bool gen_spatial_transform = aug_a.should_do_spatial_transform(); +// +// for (int n = 0; n < batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_a, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_a.push_back(coeff); +// } +// +// // Copy spatial coefficients A to the output Tensor on the CPU (output +// for +// // FlowAugmentation) +// auto spat_transform_a = spat_transform_a_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_a); +// +// // ...as well as a Tensor going to the GPU +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_a, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_a_pinned); +// +// CudaLaunchConfig config = GetCudaLaunchConfig(out_count, device); +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_a.data(), output_a.data(), spat_transform_a_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE A ***/ +// +// /*** BEGIN GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// AugmentationParams aug_b = AugmentationParams(out_height, out_width, +// params_b_name_, +// params_b_rand_type_, +// params_b_exp_, +// params_b_mean_, +// params_b_spread_, +// params_b_prob_); +// +// std::vector<AugmentationCoeff> coeffs_b; +// +// gen_spatial_transform = aug_b.should_do_spatial_transform(); +// +// for (int n = 0; n < 
batch_size; n++) { +// AugmentationCoeff coeff; +// +// if (gen_spatial_transform) { +// AugmentationLayerBase::generate_valid_spatial_coeffs(aug_b, coeff, +// src_width, +// src_height, +// out_width, +// out_height); +// } +// +// coeffs_b.push_back(coeff); +// } +// +// /*** END GENERATE NEW COEFFICIENTS FOR IMAGE B ***/ +// +// /*** BEGIN AUGMENTATION TO IMAGE B ***/ +// auto input_b = input_b_t.tensor<float, 4>(); +// auto output_b = output_b_t->tensor<float, 4>(); +// +// // Copy spatial coefficients B to the output Tensor on the CPU +// auto spat_transform_b = spat_transform_b_t->tensor<float, 2>(); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// spat_transform_b, +// true); +// AugmentationLayerBase::copy_spatial_coeffs_to_tensor(coeffs_b, +// out_width, +// out_height, +// src_width, +// src_height, +// +// +// +// spat_transform_b_pinned); +// +// SpatialAugmentation << < config.block_count, config.thread_per_block, +// 0, +// device.stream() >> > ( +// config.virtual_thread_count, src_width, src_height, channels, +// src_count, +// out_width, out_height, +// input_b.data(), output_b.data(), spat_transform_b_pinned.data()); +// +// /*** END AUGMENTATION TO IMAGE B ***/ +// } +// +// private: +// std::vector<int32>crop_; +// +// // Params A +// std::vector<string>params_a_name_; +// std::vector<string>params_a_rand_type_; +// std::vector<bool>params_a_exp_; +// std::vector<float>params_a_mean_; +// std::vector<float>params_a_spread_; +// std::vector<float>params_a_prob_; +// +// // Params B +// std::vector<string>params_b_name_; +// std::vector<string>params_b_rand_type_; +// std::vector<bool>params_b_exp_; +// std::vector<float>params_b_mean_; +// std::vector<float>params_b_spread_; +// std::vector<float>params_b_prob_; +// }; +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h new file mode 100644 index 0000000..545b8a0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/data_augmentation.h @@ -0,0 +1,22 @@ +#ifndef FLOWNET_DATA_AUGMENTATION_H_ +#define FLOWNET_DATA_AUGMENTATION_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +template<class Device> +void Augment(OpKernelContext *context, + const Device & d, + const int batch_size, + const int channels, + const int src_width, + const int src_height, + const int src_count, + const int out_width, + const int out_height, + const float *src_data, + float *out_data, + const float *transMats, + float *chromatic_coeffs); +} // namespace tensorflow +#endif // FLOWNET_DATA_AUGMENTATION_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc new file mode 100644 index 0000000..b5cc11f --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.cc @@ -0,0 +1,129 @@ +#define EIGEN_USE_THREADS + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +inline int clamp(int f, int a, int b) { + return std::max(a, std::min(f, b)); +} + +template<> +void FillFlowAugmentation(const 
CPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int src_height = flows.dimension(1); + const int src_width = flows.dimension(2); + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + float *output_ptr = output.data(); + const float *flow_ptr = flows.data(); + + for (int n = 0; n < batch_size; n++) { + const float *transMatA = transforms_from_a.data() + n * 6; + const float *transMatB = transforms_from_b.data() + n * 6; + + for (int y = 0; y < out_height; y++) { + int outputIdxOffset = (n * out_height + y) * out_width; + + for (int x = 0; x < out_width; x++) { + // Apply transformation matrix applied to first image + const float xpos1 = x * transMatA[0] + y * transMatA[1] + transMatA[2]; + const float ypos1 = x * transMatA[3] + y * transMatA[4] + transMatA[5]; + + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to second image + const float xpos3 = xpos2 * transMatB[0] + ypos2 * transMatB[1] + transMatB[2]; + const float ypos3 = xpos2 * transMatB[3] + ypos2 * transMatB[4] + transMatB[5]; + + output_ptr[(outputIdxOffset + x) * 2 + 0] = xpos3 - (float)x; + output_ptr[(outputIdxOffset + x) * 2 + 1] = ypos3 - (float)y; + } + } + } +} + +template<typename Device> +class FlowAugmentation : public OpKernel { + public: + explicit FlowAugmentation(OpKernelConstruction *ctx) : OpKernel(ctx) { + // Get the crop [height, width] tensor and verify its dimensions + OP_REQUIRES_OK(ctx, ctx->GetAttr("crop", &crop_)); + OP_REQUIRES(ctx, crop_.size() == 2, + errors::InvalidArgument("crop must be 2 dimensions")); + } + + void Compute(OpKernelContext *ctx) override { + // Get the input images and transforms and verify their dimensions + const Tensor& flows_t = ctx->input(0); + const Tensor& transforms_from_a_t = ctx->input(1); + const Tensor& transforms_from_b_t = ctx->input(2); + + OP_REQUIRES(ctx, flows_t.dims() == 4, + errors::InvalidArgument("Input images must have rank 4")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_a_t.shape()) && + transforms_from_a_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_a_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_a should be num_images x 6")); + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transforms_from_b_t.shape()) && + transforms_from_b_t.dim_size(0) == + flows_t.dim_size(0) && + transforms_from_b_t.dim_size(1) == 6), + errors::InvalidArgument( + "Input transforms_from_b should be num_images x 6")); + + // Allocate the memory for the output + Tensor *output_t; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({ flows_t.dim_size(0), crop_[0], crop_[1], + flows_t.dim_size(3) }), &output_t)); + + // Perform flow augmentation + auto flows = flows_t.tensor<float, 4>(); + auto transforms_from_a = transforms_from_a_t.tensor<float, 2>(); + auto transforms_from_b = transforms_from_b_t.tensor<float, 2>(); + auto 
output = output_t->tensor<float, 4>(); + + FillFlowAugmentation(ctx->eigen_device<Device>(), + output, + flows, + transforms_from_a, + transforms_from_b); + } + + private: + std::vector<int32>crop_; +}; + +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_CPU), + FlowAugmentation<CPUDevice>) + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("FlowAugmentation") + .Device(DEVICE_GPU), + FlowAugmentation<GPUDevice>) +#endif // GOOGLE_CUDA +} // end namespace tensorflow diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h new file mode 100644 index 0000000..7795991 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation.h @@ -0,0 +1,19 @@ +#ifndef FLOWNET_FLOW_AUG_H_ +#define FLOWNET_FLOW_AUG_H_ + +// See docs in ../ops/image_ops.cc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template<class Device> +void FillFlowAugmentation(const Device& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<float, 2>::ConstTensor transforms_from_a, + typename TTypes<float, 2>::ConstTensor transforms_from_b); +} // end namespace tensorflow + +#endif // FLOWNET_FLOW_AUG_H_ diff --git a/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc new file mode 100644 index 0000000..7e10864 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/kernels/flow_augmentation_gpu.cu.cc @@ -0,0 +1,95 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "flow_augmentation.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +inline __device__ __host__ int clamp(int f, int a, int b) { + return max(a, min(f, b)); +} + +__global__ void FillFlowAugmentationKernel( + const int32 nthreads, + const float *flow_ptr, + const float *transforms_from_a, + const float *inv_transforms_from_b, + const int src_total_count, const int src_height, const int src_width, + const int batch_size, const int out_height, + const int out_width, float *output_ptr) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const float x = (float)(index % out_width); + const float y = (float)((index / out_width) % out_height); + const int n = (index / out_width / out_height); + + const int transformIdx = n * 6; + + // Apply transformation matrix applied to second image + const float xpos1 = x * transforms_from_a[transformIdx + 0] + + y * transforms_from_a[transformIdx + 1] + + transforms_from_a[transformIdx + 2]; + const float ypos1 = x * transforms_from_a[transformIdx + 3] + + y * transforms_from_a[transformIdx + 4] + + transforms_from_a[transformIdx + 5]; + + // Caffe, NKHW: ((n * K + k) * H + h) * W + w at point (n, k, h, w) + // TF, NHWK: ((n * H + h) * W + w) * K + k at point (n, h, w, k) + const int srcXIdx = + ((n * src_height + (int)(ypos1 + 0.5)) * src_width + (int)(xpos1 + 0.5)) * + 2 + 0; + const int srcYIdx = srcXIdx + 1; + + const float xpos2 = xpos1 + flow_ptr[clamp(srcXIdx, 0, src_total_count - 1)]; + const float 
ypos2 = ypos1 + flow_ptr[clamp(srcYIdx, 0, src_total_count - 1)]; + + // Apply inverse of the transformation matrix applied to first image + const float xpos3 = xpos2 * inv_transforms_from_b[transformIdx + 0] + + ypos2 * inv_transforms_from_b[transformIdx + 1] + + inv_transforms_from_b[transformIdx + 2]; + const float ypos3 = xpos2 * inv_transforms_from_b[transformIdx + 3] + + ypos2 * inv_transforms_from_b[transformIdx + 4] + + inv_transforms_from_b[transformIdx + 5]; + + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 0] = xpos3 - + x; + output_ptr[((n * out_height + (int)y) * out_width + (int)x) * 2 + 1] = ypos3 - + y; + } +} + +template<> +void FillFlowAugmentation(const GPUDevice& device, + typename TTypes<float, 4>::Tensor output, + typename TTypes<float, 4>::ConstTensor flows, + typename TTypes<const float, 2>::ConstTensor transforms_from_a, + typename TTypes<const float, 2>::ConstTensor transforms_from_b) { + const int batch_size = output.dimension(0); + const int out_height = output.dimension(1); + const int out_width = output.dimension(2); + const int depth = 2; + const int total_count = batch_size * out_height * out_width * depth; + const int src_total_count = flows.dimension(0) * flows.dimension(1) * + flows.dimension(2) * flows.dimension(3); + + CudaLaunchConfig config = GetCudaLaunchConfig(total_count / 2, device); + + FillFlowAugmentationKernel << < config.block_count, config.thread_per_block, 0, + device.stream() >> > ( + total_count / 2, flows.data(), transforms_from_a.data(), + transforms_from_b.data(), + src_total_count, flows.dimension(1), flows.dimension(2), batch_size, + out_height, out_width, output.data()); +} +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/Codes/flownet2/src/ops/preprocessing/preprocessing.cc b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc new file mode 100644 index 0000000..086a0d0 --- /dev/null +++ b/Codes/flownet2/src/ops/preprocessing/preprocessing.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using shape_inference::DimensionHandle; + +Status SetOutputToSizedImage(InferenceContext *c) { + ShapeHandle input; + + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + DimensionHandle batch = c->Dim(input, 0); + DimensionHandle depth = c->Dim(input, 3); + std::vector<int32> crop_; + c->GetAttr("crop", &crop_); + DimensionHandle height = c->MakeDim(crop_[0]); + DimensionHandle width = c->MakeDim(crop_[1]); + c->set_output(0, c->MakeShape({ batch, height, width, depth })); + return Status::OK(); +} + +REGISTER_OP("DataAugmentation") +.Input("image_a: float32") +.Input("image_b: float32") +.Input("global_step: int64") +.Attr("crop: list(int) >= 2") +.Attr("params_a_name: list(string)") +.Attr("params_a_rand_type: list(string)") +.Attr("params_a_exp: list(bool)") +.Attr("params_a_mean: list(float)") +.Attr("params_a_spread: list(float)") +.Attr("params_a_prob: list(float)") +.Attr("params_a_coeff_schedule: list(float)") +.Attr("params_b_name: list(string)") +.Attr("params_b_rand_type: list(string)") +.Attr("params_b_exp: list(bool)") +.Attr("params_b_mean: list(float)") +.Attr("params_b_spread: list(float)") +.Attr("params_b_prob: list(float)") +.Attr("params_b_coeff_schedule: list(float)") +.Output("aug_image_a: float32") +.Output("aug_image_b: float32") 
+.Output("transforms_from_a: float32") +.Output("transforms_from_b: float32") +.SetShapeFn([](InferenceContext *c) { + // Verify input A and input B both have 4 dimensions + ShapeHandle input_shape_a, input_shape_b; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape_a)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &input_shape_b)); + + // TODO: Verify params vectors all have the same length + + // TODO: Move this out of here and into Compute + // Verify input A and input B are the same shape + DimensionHandle batch_size, unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 0), + c->Value(c->Dim(input_shape_b, 0)), + &batch_size)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 1), + c->Value(c->Dim(input_shape_b, 1)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 2), + c->Value(c->Dim(input_shape_b, 2)), &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(input_shape_a, 3), + c->Value(c->Dim(input_shape_b, 3)), &unused)); + + // Get cropping dimensions + std::vector<int32>crop_; + TF_RETURN_IF_ERROR(c->GetAttr("crop", &crop_)); + + // Reshape input shape to cropped shape + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 1, c->MakeDim(crop_[0]), + &input_shape_a)); + TF_RETURN_IF_ERROR(c->ReplaceDim(input_shape_a, 2, c->MakeDim(crop_[1]), + &input_shape_a)); + + // Set output images shapes + c->set_output(0, input_shape_a); + c->set_output(1, input_shape_a); + + // Set output spatial transforms shapes + c->set_output(2, c->MakeShape({ batch_size, 6 })); + c->set_output(3, c->MakeShape({ batch_size, 6 })); + + return Status::OK(); + }); + +REGISTER_OP("FlowAugmentation") +.Input("flows: float32") +.Input("transforms_from_a: float32") +.Input("transforms_from_b: float32") +.Attr("crop: list(int) >= 2") +.Output("transformed_flows: float32") +.SetShapeFn(SetOutputToSizedImage); +} // namespace tensorflow diff --git a/Codes/flownet2/src/training_schedules.py b/Codes/flownet2/src/training_schedules.py new file mode 100644 index 0000000..4db5aab --- /dev/null +++ b/Codes/flownet2/src/training_schedules.py @@ -0,0 +1,12 @@ +LONG_SCHEDULE = { + 'step_values': [400000, 600000, 800000, 1000000], + 'learning_rates': [0.0001, 0.00005, 0.000025, 0.0000125, 0.00000625], + 'momentum': 0.9, + 'momentum2': 0.999, + 'weight_decay': 0.0004, + 'max_iter': 1200000, +} + +FINETUNE_SCHEDULE = { + # TODO: Finetune schedule +} diff --git a/Codes/flownet2/src/utils.py b/Codes/flownet2/src/utils.py new file mode 100644 index 0000000..f6abe18 --- /dev/null +++ b/Codes/flownet2/src/utils.py @@ -0,0 +1,46 @@ +import tensorflow as tf + + +# Thanks, https://github.com/tensorflow/tensorflow/issues/4079 +def LeakyReLU(x, leak=0.1, name="lrelu"): + with tf.variable_scope(name): + f1 = 0.5 * (1.0 + leak) + f2 = 0.5 * (1.0 - leak) + return f1 * x + f2 * abs(x) + + +def average_endpoint_error(labels, predictions): + """ + Given labels and predictions of size (N, H, W, 2), calculates average endpoint error: + sqrt[sum_across_channels{(X - Y)^2}] + """ + num_samples = predictions.shape.as_list()[0] + with tf.name_scope(None, "average_endpoint_error", (predictions, labels)) as scope: + predictions = tf.to_float(predictions) + labels = tf.to_float(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + squared_difference = tf.square(tf.subtract(predictions, labels)) + # sum across channels: sum[(X - Y)^2] -> N, H, W, 1 + loss = tf.reduce_sum(squared_difference, 3, keep_dims=True) + loss = tf.sqrt(loss) + return tf.reduce_sum(loss) / num_samples + + 
+def pad(tensor, num=1): + """ + Pads the given tensor along the height and width dimensions with `num` zeros on each side. + """ + return tf.pad(tensor, [[0, 0], [num, num], [num, num], [0, 0]], "CONSTANT") + + +def antipad(tensor, num=1): + """ + Performs a crop: the "padding" of a deconvolutional layer (conv2d transpose) is removed + from the output rather than added to the input. + """ + batch, h, w, c = tensor.get_shape().as_list() + # Equivalent to tf.slice(tensor, begin=[0, num, num, 0], size=[batch, h - 2 * num, w - 2 * num, c]) + return tensor[:, num: h - num, num: w - num, :] |
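LONG_SCHEDULE in training_schedules.py only declares hyper-parameters; the trainer that consumes it is not part of this diff. A minimal sketch of one plausible wiring, assuming the schedule feeds a piecewise-constant learning rate into an Adam-style optimizer (beta1/beta2 taken from 'momentum'/'momentum2'; weight_decay is left unused here, and build_optimizer is an illustrative helper, not a function from this repository):

import tensorflow as tf


def build_optimizer(schedule):
    # The global step drives the piecewise-constant learning-rate decay.
    global_step = tf.train.get_or_create_global_step()

    # 4 boundaries + 5 learning rates: the rate is halved at each boundary.
    learning_rate = tf.train.piecewise_constant(
        global_step,
        boundaries=[tf.cast(v, tf.int64) for v in schedule['step_values']],
        values=schedule['learning_rates'])

    # 'momentum' / 'momentum2' are mapped onto Adam's beta1 / beta2 in this sketch.
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=schedule['momentum'],
        beta2=schedule['momentum2'])
    return optimizer, learning_rate, global_step

# Usage (assuming LONG_SCHEDULE is imported from training_schedules):
# optimizer, lr, step = build_optimizer(LONG_SCHEDULE)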

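For reference, the coordinate chain that both FillFlowAugmentation kernels (CPU and GPU) implement, written out in NumPy for a single image. This is a sketch only: coordinates are clamped per axis here, whereas the kernels clamp the flat index into the flow buffer, so edge behaviour can differ slightly.

import numpy as np


def flow_augmentation_reference(flows, trans_a, inv_trans_b, crop):
    """NumPy sketch of the FlowAugmentation op for one image (illustrative only).

    flows:       (src_h, src_w, 2) flow from image A to image B at source resolution
    trans_a:     (6,) affine mapping output coords -> source coords of image A
    inv_trans_b: (6,) inverse affine mapping source coords of image B -> output coords
    crop:        (out_h, out_w) cropped output size
    """
    out_h, out_w = crop
    src_h, src_w, _ = flows.shape
    out = np.zeros((out_h, out_w, 2), dtype=np.float32)

    for y in range(out_h):
        for x in range(out_w):
            # Position in image A's source frame.
            x1 = x * trans_a[0] + y * trans_a[1] + trans_a[2]
            y1 = x * trans_a[3] + y * trans_a[4] + trans_a[5]

            # Nearest-neighbour lookup of the original flow, clamped to the image.
            sx = min(max(int(x1 + 0.5), 0), src_w - 1)
            sy = min(max(int(y1 + 0.5), 0), src_h - 1)

            # Corresponding position in image B's source frame.
            x2 = x1 + flows[sy, sx, 0]
            y2 = y1 + flows[sy, sx, 1]

            # Map into image B's augmented (output) frame.
            x3 = x2 * inv_trans_b[0] + y2 * inv_trans_b[1] + inv_trans_b[2]
            y3 = x2 * inv_trans_b[3] + y2 * inv_trans_b[4] + inv_trans_b[5]

            # The augmented flow is the displacement from the output pixel.
            out[y, x, 0] = x3 - x
            out[y, x, 1] = y3 - y
    return out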