diff options
Diffstat (limited to 'neural_style.py')
| -rw-r--r-- | neural_style.py | 809 |
1 file changed, 809 insertions, 0 deletions
diff --git a/neural_style.py b/neural_style.py
new file mode 100644
index 0000000..994e2dc
--- /dev/null
+++ b/neural_style.py
@@ -0,0 +1,809 @@
import tensorflow as tf
import numpy as np
import scipy.io
import argparse
import struct
import time
import cv2
import os

'''
  parsing and configuration
'''
def parse_args():

    desc = "TensorFlow implementation of 'A Neural Algorithm of Artistic Style'"
    parser = argparse.ArgumentParser(description=desc)

    # options for single image
    parser.add_argument('--img_name', type=str,
                        default='result',
                        help='Basename of the output file.')

    parser.add_argument('--style_imgs', nargs='+', type=str,
                        help='Filenames of the style images (example: starry-night.jpg)',
                        required=True)

    parser.add_argument('--style_imgs_weights', nargs='+', type=float,
                        default=[1.0],
                        help='Interpolation weights of each of the style images. (example: 0.5 0.5)')

    parser.add_argument('--content_img', type=str,
                        help='Filename of the content image (example: lion.jpg)')

    parser.add_argument('--style_imgs_dir', type=str,
                        default='./input/styles',
                        help='Directory path to the style images. (default: %(default)s)')

    parser.add_argument('--content_img_dir', type=str,
                        default='./input/content',
                        help='Directory path to the content image. (default: %(default)s)')

    parser.add_argument('--init_img_type', type=str,
                        default='content',
                        choices=['random', 'content', 'style'],
                        help='Image used to initialize the network. (default: %(default)s)')

    parser.add_argument('--max_size', type=int,
                        default=512,
                        help='Maximum width or height of the input images. (default: %(default)s)')

    parser.add_argument('--content_weight', type=float,
                        default=5e0,
                        help='Weight for the content loss function. (default: %(default)s)')

    parser.add_argument('--style_weight', type=float,
                        default=1e3,
                        help='Weight for the style loss function. (default: %(default)s)')

    parser.add_argument('--tv_weight', type=float,
                        default=0,
                        help='Weight for the total variation loss function. Set small (e.g. 1e-3). (default: %(default)s)')

    parser.add_argument('--temporal_weight', type=float,
                        default=2e2,
                        help='Weight for the temporal loss function. (default: %(default)s)')

    parser.add_argument('--content_loss_function', type=int,
                        default=1,
                        choices=[1, 2, 3],
                        help='Constant used in the content layer loss function; see content_layer_loss(). (default: %(default)s)')

    parser.add_argument('--content_layers', nargs='+', type=str,
                        default=['conv4_2'],
                        help='VGG19 layers used for the content image. (default: %(default)s)')

    parser.add_argument('--style_layers', nargs='+', type=str,
                        default=['relu1_1', 'relu2_1', 'relu3_1', 'relu4_1', 'relu5_1'],
                        help='VGG19 layers used for the style image. (default: %(default)s)')

    parser.add_argument('--content_layer_weights', nargs='+', type=float,
                        default=[1.0],
                        help='Contributions (weights) of each content layer to the loss. (default: %(default)s)')

    parser.add_argument('--style_layer_weights', nargs='+', type=float,
                        default=[0.2, 0.2, 0.2, 0.2, 0.2],
                        help='Contributions (weights) of each style layer to the loss. (default: %(default)s)')

    parser.add_argument('--style_scale', type=float, default=1.0)
    parser.add_argument('--is_original_colors', action='store_true',
                        help='Transfer the style but not the colors. (default: %(default)s)')

    parser.add_argument('--has_style_mask', action='store_true',
                        help='Transfer the style to masked regions.')

    parser.add_argument('--style_mask_imgs', nargs='+', type=str,
                        default=None,
                        help='Filenames of the style mask images (example: face_mask.png)')

    parser.add_argument('--noise_ratio', type=float, default=1.0)

    parser.add_argument('--seed', type=int,
                        default=0,
                        help='Seed for the random number generator. (default: %(default)s)')

    parser.add_argument('--model_weights', type=str,
                        default='imagenet-vgg-verydeep-19.mat')

    parser.add_argument('--pooling_type', type=str,
                        default='avg',
                        choices=['avg', 'max'],
                        help='Type of pooling in the convolutional neural network. (default: %(default)s)')

    parser.add_argument('--device', type=str,
                        default='/gpu:0',
                        choices=['/gpu:0', '/cpu:0'],
                        help='GPU or CPU mode. GPU mode requires NVIDIA CUDA. (default|recommended: %(default)s)')

    parser.add_argument('--image_output_dir', type=str,
                        default='./image_output',
                        help='Relative or absolute directory path to the output image and data.')

    # optimizations
    parser.add_argument('--optimizer', type=str,
                        default='lbfgs',
                        choices=['lbfgs', 'adam'],
                        help='Loss minimization optimizer. L-BFGS gives better results; Adam uses less memory. (default|recommended: %(default)s)')

    parser.add_argument('--learning_rate', type=float,
                        default=1e1,
                        help='Learning rate parameter for the Adam optimizer. (default: %(default)s)')

    parser.add_argument('--max_iterations', type=int,
                        default=1000,
                        help='Max number of iterations for the Adam or L-BFGS optimizer. (default: %(default)s)')

    parser.add_argument('--verbose', action='store_true',
                        help='Print progress statements during execution.')
    # options for video frames
    parser.add_argument('--is_video', action='store_true',
                        help='Generate a sequence of video frames instead of a single image. (default: %(default)s)')

    parser.add_argument('--start_frame', type=int, default=1,
                        help='First frame number.')

    parser.add_argument('--end_frame', type=int, default=1,
                        help='Last frame number.')

    parser.add_argument('--first_frame_type', type=str,
                        choices=['random', 'content', 'style'],
                        default='content',
                        help='Image used to initialize the network during the rendering of the first frame.')

    parser.add_argument('--init_frame_type', type=str,
                        choices=['prev_warped', 'prev', 'random', 'content', 'style'],
                        default='prev_warped',
                        help='Image used to initialize the network during every rendering after the first frame.')

    parser.add_argument('--video_input_dir', type=str,
                        default='./video_input',
                        help='Relative or absolute directory path to the input frames.')

    parser.add_argument('--video_output_dir', type=str,
                        default='./video_output',
                        help='Relative or absolute directory path to the output frames.')

    parser.add_argument('--content_frame_frmt', type=str,
                        default='frame_{}.ppm',
                        help='Filename format of the input content frames.')

    parser.add_argument('--backward_optical_flow_frmt', type=str,
                        default='backward_{}_{}.flo',
                        help='Filename format of the backward optical flow files.')

    parser.add_argument('--forward_optical_flow_frmt', type=str,
                        default='forward_{}_{}.flo',
                        help='Filename format of the forward optical flow files.')

    parser.add_argument('--content_weights_frmt', type=str,
                        default='reliable_{}_{}.txt',
                        help='Filename format of the optical flow consistency files.')

    parser.add_argument('--prev_frame_indices', nargs='+', type=int,
                        default=[1],
                        help='Previous frames to consider for long-term temporal consistency.')

    args = parser.parse_args()

    # create directories for output
    if args.is_video:
        maybe_make_directory(args.video_output_dir)
    else:
        maybe_make_directory(args.image_output_dir)

    return args
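# Typical invocations, assuming the default directory layout above and that
# the input images exist (the filenames are the placeholder examples from
# the help strings):
#
#   python neural_style.py --content_img lion.jpg --style_imgs starry-night.jpg
#
#   python neural_style.py --is_video --start_frame 1 --end_frame 50 \
#       --style_imgs starry-night.jpg --first_frame_type content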
'''
  pre-trained vgg19 convolutional neural network

  remark: layers are manually initialized for clarity.
'''
vgg19_mean = np.array([123.68, 116.779, 103.939]).reshape((1,1,1,3))

def build_vgg19(input_img):
    if args.verbose: print('\nBUILDING VGG-19 NETWORK')
    net = {}
    _, h, w, d = input_img.shape

    if args.verbose: print('loading model weights...')
    vgg_rawnet = scipy.io.loadmat(args.model_weights)
    vgg_layers = vgg_rawnet['layers'][0]
    if args.verbose: print('constructing layers...')
    net['input'] = tf.Variable(np.zeros((1, h, w, d), dtype=np.float32))

    if args.verbose: print('LAYER GROUP 1')
    net['conv1_1'] = conv_layer('conv1_1', net['input'], W=get_weights(vgg_layers, 0))
    net['relu1_1'] = relu_layer('relu1_1', net['conv1_1'], b=get_bias(vgg_layers, 0))

    net['conv1_2'] = conv_layer('conv1_2', net['relu1_1'], W=get_weights(vgg_layers, 2))
    net['relu1_2'] = relu_layer('relu1_2', net['conv1_2'], b=get_bias(vgg_layers, 2))

    net['pool1'] = pool_layer('pool1', net['relu1_2'])

    if args.verbose: print('LAYER GROUP 2')
    net['conv2_1'] = conv_layer('conv2_1', net['pool1'], W=get_weights(vgg_layers, 5))
    net['relu2_1'] = relu_layer('relu2_1', net['conv2_1'], b=get_bias(vgg_layers, 5))

    net['conv2_2'] = conv_layer('conv2_2', net['relu2_1'], W=get_weights(vgg_layers, 7))
    net['relu2_2'] = relu_layer('relu2_2', net['conv2_2'], b=get_bias(vgg_layers, 7))

    net['pool2'] = pool_layer('pool2', net['relu2_2'])

    if args.verbose: print('LAYER GROUP 3')
    net['conv3_1'] = conv_layer('conv3_1', net['pool2'], W=get_weights(vgg_layers, 10))
    net['relu3_1'] = relu_layer('relu3_1', net['conv3_1'], b=get_bias(vgg_layers, 10))

    net['conv3_2'] = conv_layer('conv3_2', net['relu3_1'], W=get_weights(vgg_layers, 12))
    net['relu3_2'] = relu_layer('relu3_2', net['conv3_2'], b=get_bias(vgg_layers, 12))

    net['conv3_3'] = conv_layer('conv3_3', net['relu3_2'], W=get_weights(vgg_layers, 14))
    net['relu3_3'] = relu_layer('relu3_3', net['conv3_3'], b=get_bias(vgg_layers, 14))

    net['conv3_4'] = conv_layer('conv3_4', net['relu3_3'], W=get_weights(vgg_layers, 16))
    net['relu3_4'] = relu_layer('relu3_4', net['conv3_4'], b=get_bias(vgg_layers, 16))

    net['pool3'] = pool_layer('pool3', net['relu3_4'])

    if args.verbose: print('LAYER GROUP 4')
    net['conv4_1'] = conv_layer('conv4_1', net['pool3'], W=get_weights(vgg_layers, 19))
    net['relu4_1'] = relu_layer('relu4_1', net['conv4_1'], b=get_bias(vgg_layers, 19))

    net['conv4_2'] = conv_layer('conv4_2', net['relu4_1'], W=get_weights(vgg_layers, 21))
    net['relu4_2'] = relu_layer('relu4_2', net['conv4_2'], b=get_bias(vgg_layers, 21))

    net['conv4_3'] = conv_layer('conv4_3', net['relu4_2'], W=get_weights(vgg_layers, 23))
    net['relu4_3'] = relu_layer('relu4_3', net['conv4_3'], b=get_bias(vgg_layers, 23))

    net['conv4_4'] = conv_layer('conv4_4', net['relu4_3'], W=get_weights(vgg_layers, 25))
    net['relu4_4'] = relu_layer('relu4_4', net['conv4_4'], b=get_bias(vgg_layers, 25))

    net['pool4'] = pool_layer('pool4', net['relu4_4'])

    if args.verbose: print('LAYER GROUP 5')
    net['conv5_1'] = conv_layer('conv5_1', net['pool4'], W=get_weights(vgg_layers, 28))
    net['relu5_1'] = relu_layer('relu5_1', net['conv5_1'], b=get_bias(vgg_layers, 28))

    net['conv5_2'] = conv_layer('conv5_2', net['relu5_1'], W=get_weights(vgg_layers, 30))
    net['relu5_2'] = relu_layer('relu5_2', net['conv5_2'], b=get_bias(vgg_layers, 30))

    net['conv5_3'] = conv_layer('conv5_3', net['relu5_2'], W=get_weights(vgg_layers, 32))
    net['relu5_3'] = relu_layer('relu5_3', net['conv5_3'], b=get_bias(vgg_layers, 32))

    net['conv5_4'] = conv_layer('conv5_4', net['relu5_3'], W=get_weights(vgg_layers, 34))
    net['relu5_4'] = relu_layer('relu5_4', net['conv5_4'], b=get_bias(vgg_layers, 34))

    net['pool5'] = pool_layer('pool5', net['relu5_4'])

    return net

def conv_layer(layer_name, layer_input, W):
    conv = tf.nn.conv2d(layer_input, W, strides=[1, 1, 1, 1], padding='SAME')
    if args.verbose:
        print('--{} | shape={} | weights_shape={}'.format(layer_name,
            conv.get_shape(), W.get_shape()))
    return conv

def relu_layer(layer_name, layer_input, b):
    relu = tf.nn.relu(layer_input + b)
    if args.verbose:
        print('--{} | shape={} | bias_shape={}'.format(layer_name,
            relu.get_shape(), b.get_shape()))
    return relu

def pool_layer(layer_name, layer_input):
    if args.pooling_type == 'avg':
        pool = tf.nn.avg_pool(layer_input, ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1], padding='SAME')
    elif args.pooling_type == 'max':
        pool = tf.nn.max_pool(layer_input, ksize=[1, 2, 2, 1],
                              strides=[1, 2, 2, 1], padding='SAME')
    if args.verbose:
        print('--{} | shape={}'.format(layer_name, pool.get_shape()))
    return pool

def get_weights(vgg_layers, i):
    weights = vgg_layers[i][0][0][2][0][0]
    W = tf.constant(weights)
    return W

def get_bias(vgg_layers, i):
    bias = vgg_layers[i][0][0][2][0][1]
    b = tf.constant(np.reshape(bias, (bias.size)))
    return b
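# The nested indexing in get_weights()/get_bias() follows the MatConvNet
# layout of imagenet-vgg-verydeep-19.mat: the (W, b) pair of layer i lives
# at layers[i][0][0][2][0]. A quick way to inspect this interactively (the
# exact nesting can vary between releases of the .mat file, so verify it
# before relying on the indices):
#
#   vgg = scipy.io.loadmat('imagenet-vgg-verydeep-19.mat')
#   layers = vgg['layers'][0]
#   W, b = layers[0][0][0][2][0]   # first conv layer
#   print(W.shape, b.shape)        # conv1_1 kernel is (3, 3, 3, 64); bias has 64 entries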
'''
  'a neural algorithm of artistic style' loss functions
'''
def content_layer_loss(p, x):
    _, h, w, d = p.get_shape()
    M = h.value * w.value
    N = d.value
    # constant selected by the --content_loss_function option
    if args.content_loss_function == 1:
        K = 1. / (2. * N**0.5 * M**0.5)
    elif args.content_loss_function == 2:
        K = 1. / (N * M)
    elif args.content_loss_function == 3:
        K = 1. / 2.
    loss = K * tf.reduce_sum(tf.pow((x - p), 2))
    return loss

def gram_matrix(x, area, depth):
    F = tf.reshape(x[0], (area, depth))
    G = tf.matmul(tf.transpose(F), F)
    return G

def style_layer_loss(a, x):
    _, h, w, d = a.get_shape()
    M = h.value * w.value
    N = d.value
    A = gram_matrix(a, M, N)
    G = gram_matrix(x, M, N)
    loss = (1./(4 * N**2 * M**2)) * tf.reduce_sum(tf.pow((G - A), 2))
    return loss
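# Shape sketch of gram_matrix() above, in plain NumPy: for a layer
# activation of shape (1, h, w, d), F has shape (h*w, d) and the Gram
# matrix G = F^T F has shape (d, d) -- the filter-response correlations
# that the style loss matches. Toy sizes for illustration:
#
#   h, w, d = 4, 4, 8
#   x = np.random.rand(1, h, w, d).astype(np.float32)
#   F = x[0].reshape(h * w, d)   # area x depth
#   G = F.T.dot(F)               # (d, d) Gram matrix
#   print(G.shape)               # (8, 8)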
def mask_style_layer(a, x, mask_img):
    _, h, w, d = a.get_shape()
    mask = get_mask_image(mask_img, w.value, h.value)
    mask = tf.convert_to_tensor(mask)
    # tile the 2-D mask across the depth dimension and add a batch dimension
    # (tf.pack/tf.mul are the pre-1.0 names for tf.stack/tf.multiply)
    tensors = []
    for _ in range(d.value):
        tensors.append(mask)
    mask = tf.pack(tensors, axis=2)
    mask = tf.pack(mask, axis=0)
    mask = tf.expand_dims(mask, 0)
    a = tf.mul(a, mask)
    x = tf.mul(x, mask)
    return a, x

def sum_masked_style_losses(sess, net, style_imgs):
    total_style_loss = 0.
    weights = args.style_imgs_weights
    masks = args.style_mask_imgs
    for img, img_weight, img_mask in zip(style_imgs, weights, masks):
        # assign the style image and capture its activations as constants
        sess.run(net['input'].assign(img))
        style_loss = 0.
        for layer, weight in zip(args.style_layers, args.style_layer_weights):
            a = sess.run(net[layer])
            x = net[layer]
            a = tf.convert_to_tensor(a)
            a, x = mask_style_layer(a, x, img_mask)
            style_loss += style_layer_loss(a, x) * weight
        style_loss /= float(len(args.style_layers))
        total_style_loss += (style_loss * img_weight)
    total_style_loss /= float(len(style_imgs))
    return total_style_loss

def sum_style_losses(sess, net, style_imgs):
    total_style_loss = 0.
    weights = args.style_imgs_weights
    for img, img_weight in zip(style_imgs, weights):
        sess.run(net['input'].assign(img))
        style_loss = 0.
        for layer, weight in zip(args.style_layers, args.style_layer_weights):
            a = sess.run(net[layer])
            x = net[layer]
            a = tf.convert_to_tensor(a)
            style_loss += style_layer_loss(a, x) * weight
        style_loss /= float(len(args.style_layers))
        total_style_loss += (style_loss * img_weight)
    total_style_loss /= float(len(style_imgs))
    return total_style_loss

def sum_content_losses(sess, net, content_img):
    sess.run(net['input'].assign(content_img))
    content_loss = 0.
    for layer, weight in zip(args.content_layers, args.content_layer_weights):
        p = sess.run(net[layer])
        x = net[layer]
        p = tf.convert_to_tensor(p)
        x = tf.convert_to_tensor(x)
        content_loss += content_layer_loss(p, x) * weight
    content_loss /= float(len(args.content_layers))
    return content_loss
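# Note on the pattern used in the loss sums above: sess.run(net[layer])
# evaluates the layer for the currently assigned input and returns a NumPy
# array, so the style/content activations enter the losses as fixed targets;
# net[layer] itself stays a symbolic tensor that depends on the trainable
# net['input'] variable, which is what the optimizer updates. In miniature:
#
#   a = sess.run(net['relu1_1'])   # frozen activations of the assigned image
#   x = net['relu1_1']             # live tensor, a function of net['input']
#   loss = style_layer_loss(tf.convert_to_tensor(a), x)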
'''
  'artistic style transfer for videos' loss functions
'''
def temporal_loss(x, w, c):
    c = c[np.newaxis,:,:,:]
    D = float(x.size)
    loss = (1. / D) * tf.reduce_sum(c * tf.nn.l2_loss(x - w))
    loss = tf.cast(loss, tf.float32)
    return loss

def get_longterm_weights(i, j):
    c_sum = 0.
    for k in args.prev_frame_indices:
        if i - k > i - j:
            c_sum += get_content_weights(i, i - k)
    c = get_content_weights(i, i - j)
    c_max = tf.maximum(c - c_sum, 0.)
    return c_max

def sum_longterm_temporal_losses(frame, x):
    loss = 0.
    for j in args.prev_frame_indices:
        prev_frame = frame - j
        w = get_prev_warped_frame(frame)
        c = get_longterm_weights(frame, prev_frame)
        loss += temporal_loss(x, w, c)
    return loss

def sum_shortterm_temporal_losses(frame, x):
    prev_frame = frame - 1
    w = get_prev_warped_frame(frame)
    c = get_content_weights(frame, prev_frame)
    loss = temporal_loss(x, w, c)
    return loss

'''
  denoising loss function

  remark: not convinced this does anything significant.
'''
def sum_total_variation_losses(x):
    # x is the (1, h, w, d) image tensor being optimized
    b, h, w, d = x.get_shape()
    b, h, w, d = b.value, h.value, w.value, d.value
    tv_y_size = b * (h-1) * w * d
    tv_x_size = b * h * (w-1) * d
    loss_y = tf.nn.l2_loss(x[:,1:,:,:] - x[:,:h-1,:,:])
    loss_y /= tv_y_size
    loss_x = tf.nn.l2_loss(x[:,:,1:,:] - x[:,:,:w-1,:])
    loss_x /= tv_x_size
    loss = 2 * (loss_y + loss_x)
    loss = tf.cast(loss, tf.float32)
    return loss

'''
  utilities and i/o
'''
def read_image(path):
    # BGR image
    img = cv2.imread(path, cv2.IMREAD_COLOR).astype('float')
    img = preprocess(img, vgg19_mean)
    return img

def write_image(path, img):
    img = postprocess(img, vgg19_mean)
    cv2.imwrite(path, img)

def preprocess(img, mean):
    # BGR to RGB
    img = img[...,::-1]
    # shape (H, W, D) to (1, H, W, D)
    img = img[np.newaxis,:,:,:]
    # subtract mean
    img -= mean
    return img

def postprocess(img, mean):
    # add mean
    img += mean
    # shape (1, H, W, D) to (H, W, D)
    img = img[0]
    img = np.clip(img, 0, 255).astype('uint8')
    # RGB to BGR
    img = img[...,::-1]
    return img
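# Round-trip sketch for the two helpers above: preprocess() turns an OpenCV
# BGR image of shape (H, W, 3) into the (1, H, W, 3) RGB, mean-centered
# float array that VGG-19 expects, and postprocess() inverts the transform.
# The filename is only a placeholder; note that postprocess() modifies its
# argument in place (img += mean), hence the copy:
#
#   bgr = cv2.imread('./input/content/lion.jpg', cv2.IMREAD_COLOR).astype('float')
#   x = preprocess(bgr, vgg19_mean)                   # (1, H, W, 3), RGB, zero-mean
#   bgr_again = postprocess(np.copy(x), vgg19_mean)   # uint8 BGR, ready for imwrite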
def read_flow_file(path):
    with open(path, 'rb') as f:
        # 4 bytes header (magic tag)
        header = struct.unpack('4s', f.read(4))[0]
        # 4 bytes width, 4 bytes height
        w = struct.unpack('i', f.read(4))[0]
        h = struct.unpack('i', f.read(4))[0]
        flow = np.ndarray((2, h, w), dtype=np.float32)
        for y in range(h):
            for x in range(w):
                flow[1,y,x] = struct.unpack('f', f.read(4))[0]
                flow[0,y,x] = struct.unpack('f', f.read(4))[0]
    return flow

def read_weights_file(path):
    lines = open(path).read().splitlines()
    header = lines[0].split(' ')
    w = int(header[0])
    h = int(header[1])
    vals = np.zeros((h, w), dtype=np.float32)
    for i in range(1, len(lines)):
        line = lines[i].rstrip().split(' ')
        vals[i-1] = np.array(list(map(np.float32, line)))
        # threshold the consistency values to binary weights
        vals[i-1] = list(map(lambda x: 0. if x < 255. else 1., vals[i-1]))
    # expand to 3 channels
    weights = np.dstack([vals.astype(np.float32)] * 3)
    return weights

def maybe_make_directory(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
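# The loops in read_flow_file() follow the Middlebury .flo layout: a 4-byte
# magic tag, int32 width and height, then row-major interleaved float32
# pairs per pixel. A vectorized reader producing the same (2, h, w) array is
# sketched below (assuming well-formed input; the channel order mirrors the
# loop above, which stores the first value of each pair in flow[1]):
#
#   def read_flow_file_fast(path):
#       with open(path, 'rb') as f:
#           f.read(4)                                   # magic tag
#           w = struct.unpack('i', f.read(4))[0]
#           h = struct.unpack('i', f.read(4))[0]
#           data = np.fromfile(f, np.float32, 2 * h * w).reshape(h, w, 2)
#       return np.stack((data[:, :, 1], data[:, :, 0]))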
'''
  rendering -- where the magic happens
'''
def stylize(content_img, style_imgs, init_img, frame=None):
    with tf.device(args.device), tf.Session() as sess:
        # setup network
        net = build_vgg19(content_img)

        # style loss
        if args.has_style_mask:
            L_style = sum_masked_style_losses(sess, net, style_imgs)
        else:
            L_style = sum_style_losses(sess, net, style_imgs)

        # content loss
        L_content = sum_content_losses(sess, net, content_img)

        # denoising loss, computed on the image being optimized
        L_tv = sum_total_variation_losses(net['input'])

        # loss weights
        alpha = args.content_weight
        beta = args.style_weight
        theta = args.tv_weight

        # total loss
        L_total = alpha * L_content
        L_total += beta * L_style
        L_total += theta * L_tv

        if args.is_video and frame > 1:
            gamma = args.temporal_weight
            L_temporal = sum_shortterm_temporal_losses(frame, init_img)
            L_total += gamma * L_temporal

        # optimization algorithm
        optimizer = get_optimizer(L_total)

        if args.optimizer == 'adam':
            minimize_with_adam(sess, net, optimizer, init_img, L_total)
        elif args.optimizer == 'lbfgs':
            minimize_with_lbfgs(sess, net, optimizer, init_img)

        output_img = sess.run(net['input'])

        if args.is_original_colors:
            output_img = convert_to_original_colors(np.copy(content_img), np.copy(output_img))

        if args.is_video:
            write_video_output(frame, output_img)
        else:
            write_image_output(output_img, content_img, style_imgs, init_img)

def minimize_with_lbfgs(sess, net, optimizer, init_img):
    if args.verbose: print('MINIMIZING LOSS USING: L-BFGS OPTIMIZER')
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    sess.run(net['input'].assign(init_img))
    optimizer.minimize(sess)

def minimize_with_adam(sess, net, optimizer, init_img, loss):
    if args.verbose: print('MINIMIZING LOSS USING: ADAM OPTIMIZER')
    train_op = optimizer.minimize(loss)
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    sess.run(net['input'].assign(init_img))
    iterations = 0
    while iterations < args.max_iterations:
        sess.run(train_op)
        iterations += 1

def get_optimizer(loss):
    if args.optimizer == 'lbfgs':
        optimizer = tf.contrib.opt.ScipyOptimizerInterface(
            loss,
            method='L-BFGS-B',
            options={'maxiter': args.max_iterations})
    elif args.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(args.learning_rate, epsilon=1.0)
    return optimizer

def write_video_output(frame, output_img):
    output_frame_fn = args.content_frame_frmt.format(str(frame).zfill(4))
    output_frame_path = os.path.join(args.video_output_dir, output_frame_fn)
    write_image(output_frame_path, output_img)

def write_image_output(output_img, content_img, style_imgs, init_img):
    out_dir = os.path.join(args.image_output_dir, args.img_name)
    maybe_make_directory(out_dir)
    img_path = os.path.join(out_dir, 'output.png')
    content_path = os.path.join(out_dir, 'content.png')
    init_path = os.path.join(out_dir, 'init.png')

    write_image(img_path, output_img)
    write_image(content_path, content_img)
    write_image(init_path, init_img)
    index = 0
    for style_img in style_imgs:
        path = os.path.join(out_dir, str(index)+'_style.png')
        write_image(path, style_img)
        index += 1

    # save the configuration settings
    out_file = os.path.join(out_dir, 'meta_data.txt')
    f = open(out_file, 'w')
    f.write('image name: {}\n'.format(args.img_name))
    f.write('content: {}\n'.format(args.content_img))
    index = 0
    for style_img, weight in zip(args.style_imgs, args.style_imgs_weights):
        f.write('styles ['+str(index)+']: {} * {}\n'.format(weight, style_img))
        index += 1
    index = 0
    if args.style_mask_imgs is not None:
        for mask in args.style_mask_imgs:
            f.write('style masks ['+str(index)+']: {}\n'.format(mask))
            index += 1
    f.write('init_type: {}\n'.format(args.init_img_type))
    f.write('content_weight: {}\n'.format(args.content_weight))
    f.write('style_weight: {}\n'.format(args.style_weight))
    f.write('tv_weight: {}\n'.format(args.tv_weight))
    f.write('content_layers: {}\n'.format(args.content_layers))
    f.write('style_layers: {}\n'.format(args.style_layers))
    f.write('optimizer_type: {}\n'.format(args.optimizer))
    f.write('max_iterations: {}\n'.format(args.max_iterations))
    f.write('max_image_size: {}\n'.format(args.max_size))
    f.close()

'''
  image loading and processing
'''
def get_init_image(init_type, content_img, style_imgs, frame=None):
    if init_type == 'content':
        return content_img
    elif init_type == 'style':
        return style_imgs[0]
    elif init_type == 'random':
        init_img = get_noise_image(args.noise_ratio, content_img)
        return init_img
    # only for video frames
    elif init_type == 'prev':
        init_img = get_prev_frame(frame)
        return init_img
    elif init_type == 'prev_warped':
        init_img = get_prev_warped_frame(frame)
        return init_img

def get_content_frame(frame):
    content_fn = args.content_frame_frmt.format(str(frame).zfill(4))
    content_path = os.path.join(args.video_input_dir, content_fn)
    img = read_image(content_path)
    return img

def get_content_image(content_img):
    # BGR image
    path = os.path.join(args.content_img_dir, content_img)
    img = cv2.imread(path, cv2.IMREAD_COLOR).astype('float')
    h, w, d = img.shape
    mx = args.max_size
    # resize if > max size
    if h > w and h > mx:
        w = (float(mx) / float(h)) * w
        img = cv2.resize(img, dsize=(int(w), mx), interpolation=cv2.INTER_CUBIC)
    if w > mx:
        h = (float(mx) / float(w)) * h
        img = cv2.resize(img, dsize=(mx, int(h)), interpolation=cv2.INTER_CUBIC)
    img = preprocess(img, vgg19_mean)
    return img

def get_style_images(content_img, scale):
    style_imgs = []
    _, h, w, d = content_img.shape
    for style_fn in args.style_imgs:
        path = os.path.join(args.style_imgs_dir, style_fn)
        # BGR image, resized relative to the content image
        img = cv2.imread(path, cv2.IMREAD_COLOR).astype(np.float32)
        img = cv2.resize(img, dsize=(int(w*scale), int(h*scale)))
        img = preprocess(img, vgg19_mean)
        style_imgs.append(img)
    return style_imgs

def get_noise_image(noise_ratio, content_img):
    np.random.seed(args.seed)
    noise_img = np.random.uniform(-20., 20., content_img.shape).astype(np.float32)
    img = noise_ratio * noise_img + (1.-noise_ratio) * content_img
    return img

def get_mask_image(mask_img, width, height):
    path = os.path.join(args.content_img_dir, mask_img)
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, dsize=(width, height)).astype(np.float32)
    mx = np.amax(img)
    img /= mx
    return img

def get_prev_frame(frame):
    # previously stylized frame
    prev_frame = frame - 1
    prev_frame_fn = args.content_frame_frmt.format(str(prev_frame).zfill(4))
    prev_frame_path = os.path.join(args.video_output_dir, prev_frame_fn)
    img = cv2.imread(prev_frame_path, cv2.IMREAD_COLOR)
    return img

def get_prev_warped_frame(frame):
    prev_img = get_prev_frame(frame)
    prev_frame = frame - 1
    # backwards flow: current frame -> previous frame
    flow_fn = args.backward_optical_flow_frmt.format(str(frame), str(prev_frame))
    flow_path = os.path.join(args.video_input_dir, flow_fn)
    flow = read_flow_file(flow_path)
    warped_img = warp_image(prev_img, flow).astype('float32')
    img = preprocess(warped_img, vgg19_mean)
    return img

def get_content_weights(frame, prev_frame):
    forward_fn = args.content_weights_frmt.format(str(prev_frame), str(frame))
    backward_fn = args.content_weights_frmt.format(str(frame), str(prev_frame))
    forward_path = os.path.join(args.video_input_dir, forward_fn)
    backward_path = os.path.join(args.video_input_dir, backward_fn)
    forward_weights = read_weights_file(forward_path)
    backward_weights = read_weights_file(backward_path)
    forward_weights = np.clip(forward_weights, 0, 255).astype('uint8')
    backward_weights = np.clip(backward_weights, 0, 255).astype('uint8')
    return forward_weights #, backward_weights

def warp_image(src, flow):
    _, h, w = flow.shape
    flow_map = np.zeros(flow.shape, dtype=np.float32)
    for y in range(h):
        flow_map[1,y,:] = float(y) + flow[1,y,:]
    for x in range(w):
        flow_map[0,:,x] = float(x) + flow[0,:,x]
    # remap pixels to optical flow
    dst = cv2.remap(
        src, flow_map[0], flow_map[1],
        interpolation=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_TRANSPARENT)
    return dst
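# The per-row/per-column loops in warp_image() build an absolute sampling
# grid for cv2.remap. A vectorized equivalent with np.meshgrid, using the
# same convention (flow[0] offsets x coordinates, flow[1] offsets y):
#
#   def make_flow_map(flow):
#       _, h, w = flow.shape
#       xs, ys = np.meshgrid(np.arange(w, dtype=np.float32),
#                            np.arange(h, dtype=np.float32))
#       return np.stack((xs + flow[0], ys + flow[1]))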
def convert_to_original_colors(content_img, stylized_img):
    content_img = postprocess(content_img, vgg19_mean)
    stylized_img = postprocess(stylized_img, vgg19_mean)
    content_yuv = cv2.cvtColor(content_img, cv2.COLOR_BGR2YUV)
    stylized_yuv = cv2.cvtColor(stylized_img, cv2.COLOR_BGR2YUV)
    # keep the stylized luminance, take the chrominance from the content image
    y, _, _ = cv2.split(stylized_yuv)
    _, u, v = cv2.split(content_yuv)
    merged = cv2.merge((y, u, v))
    dst = cv2.cvtColor(merged, cv2.COLOR_YUV2BGR).astype('float')
    dst = preprocess(dst, vgg19_mean)
    return dst

def render_single_image():
    content_img = get_content_image(args.content_img)
    style_imgs = get_style_images(content_img, args.style_scale)
    with tf.Graph().as_default():
        print('\n---- RENDERING SINGLE IMAGE ----\n')
        init_img = get_init_image(args.init_img_type, content_img, style_imgs)
        tick = time.time()
        stylize(content_img, style_imgs, init_img)
        tock = time.time()
        print('Single image elapsed time: {}'.format(tock - tick))

def render_video():
    for frame in range(args.start_frame, args.end_frame+1):
        with tf.Graph().as_default():
            print('\n---- RENDERING VIDEO FRAME: {}/{} ----\n'.format(frame, args.end_frame))
            # the first frame has no previous output to warp from
            init_type = args.first_frame_type if frame == 1 else args.init_frame_type
            content_frame = get_content_frame(frame)
            style_imgs = get_style_images(content_frame, args.style_scale)
            init_img = get_init_image(init_type, content_frame, style_imgs, frame)
            tick = time.time()
            stylize(content_frame, style_imgs, init_img, frame)
            tock = time.time()
            print('Frame {} elapsed time: {}'.format(frame, tock - tick))

def main():
    global args
    args = parse_args()
    if args.is_video: render_video()
    else: render_single_image()

if __name__ == '__main__':
    main()
\ No newline at end of file
