# coding: utf-8
"""Neural audio style transfer on log-magnitude spectrograms.

Usage: python nsatf.py content.wav style.wav output.wav [alpha]

Both inputs are turned into log-magnitude STFT spectrograms, passed through a
single random 1-D convolution + ReLU layer, and a new spectrogram is
optimized (L-BFGS-B) so that its features match the content features and its
Gram matrix matches the style Gram matrix.  The phase is then re-estimated
iteratively and the result is written as a wav file.
"""

import os
import sys
from sys import stderr

import librosa
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib

# from IPython.display import Audio, display
# import matplotlib.pyplot as plt
# get_ipython().magic(u'matplotlib inline')


# ### Load style and content

# In[5]:

if len(sys.argv) < 4:
    print("python nsatf.py content.wav style.wav output.wav alpha")
    # A usage error is a failure: report it with a non-zero exit status.
    sys.exit(1)

CONTENT_FILENAME = sys.argv[1]
STYLE_FILENAME = sys.argv[2]
OUTPUT_FILENAME = sys.argv[3]

# Weight of the content loss relative to the style loss.  The fourth argument
# is optional; an empty string falls back to the default as well.
if len(sys.argv) == 5:
    ALPHA = float(sys.argv[4] or "1e-3")
else:
    ALPHA = 1e-3

# Prefer the first GPU when TensorFlow reports one, otherwise use the CPU.
device_ids = [device.name for device in device_lib.list_local_devices()]
if '/device:GPU:0' in device_ids:
    DEVICE = '/device:GPU:0'
else:
    DEVICE = '/device:CPU:0'
print(DEVICE)


# In[6]:

# display(Audio(CONTENT_FILENAME))
# display(Audio(STYLE_FILENAME))


# In[7]:

N_FFT = 2048          # STFT window size
SAMPLE_RATE = 44100   # every input is resampled to this rate on load
MAX_FRAMES = 1020     # only the first MAX_FRAMES STFT frames are used
FILTER_WIDTH = 10     # temporal width of the random convolution kernel
PHASE_ITERATIONS = 500  # number of phase re-estimation passes


def read_audio_spectum(filename, max_frames=MAX_FRAMES):
    """Load an audio file and return its log-magnitude spectrogram.

    Fourier phases are ignored.

    Args:
        filename: path of the audio file to load.
        max_frames: number of leading STFT frames to keep.

    Returns:
        A tuple ``(S, fs)`` where ``S`` is ``log1p(|STFT|)`` restricted to
        the first ``max_frames`` frames and ``fs`` is the sample rate
        returned by ``librosa.load``.
    """
    print('load ' + filename)
    samples, fs = librosa.load(filename, sr=SAMPLE_RATE)
    spectrum = librosa.stft(samples, n_fft=N_FFT)
    S = np.log1p(np.abs(spectrum[:, :max_frames]))
    return S, fs


def _pad_to_match(first, second, axis):
    """Zero-pad two 2-D arrays at the end of `axis` to a common length."""
    target = max(first.shape[axis], second.shape[axis])

    def _pad(arr):
        widths = [(0, 0), (0, 0)]
        widths[axis] = (0, target - arr.shape[axis])
        return np.pad(arr, widths, 'constant', constant_values=0)

    return _pad(first), _pad(second)


def _build_net(x, kernel):
    """Build the conv + ReLU feature extractor on `x` in the default graph."""
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")
    return tf.nn.relu(conv)


# In[8]:

a_content, fs = read_audio_spectum(CONTENT_FILENAME)
a_style, fs = read_audio_spectum(STYLE_FILENAME)

# Bring both spectrograms to the same number of frames (axis 1) ...
a_content, a_style = _pad_to_match(a_content, a_style, axis=1)
print(a_content.shape)
print(a_style.shape)

# ... and to the same number of frequency bins (axis 0).
a_content, a_style = _pad_to_match(a_content, a_style, axis=0)
print(a_content.shape)
print(a_style.shape)

N_SAMPLES = a_style.shape[1]
N_CHANNELS = a_style.shape[0]


# ### Visualize spectrograms for content and style tracks

# In[9]:

"""
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.title('Content')
plt.imshow(a_content[:400,:])
plt.subplot(1, 2, 2)
plt.title('Style')
plt.imshow(a_style[:400,:])
plt.show()
"""


# ### Compute content and style feats

# In[10]:

N_FILTERS = 4096

# Reshape (channels, samples) -> (1, 1, samples, channels) for conv2d.
a_content_tf = np.ascontiguousarray(a_content.T[None, None, :, :])
a_style_tf = np.ascontiguousarray(a_style.T[None, None, :, :])

# filter shape is "[filter_height, filter_width, in_channels, out_channels]"
std = np.sqrt(2) * np.sqrt(2.0 / ((N_CHANNELS + N_FILTERS) * FILTER_WIDTH))
kernel = np.random.randn(1, FILTER_WIDTH, N_CHANNELS, N_FILTERS) * std

g = tf.Graph()
with g.as_default(), g.device(DEVICE), tf.Session() as sess:
    # data shape is "[batch, in_height, in_width, in_channels]",
    x = tf.placeholder('float32', [1, 1, N_SAMPLES, N_CHANNELS], name="x")
    net = _build_net(x, kernel)

    content_features = net.eval(feed_dict={x: a_content_tf})
    style_features = net.eval(feed_dict={x: a_style_tf})

    # Gram matrix of the style features, normalized by the frame count.
    features = np.reshape(style_features, (-1, N_FILTERS))
    style_gram = np.matmul(features.T, features) / N_SAMPLES


# ### Optimize

# In[14]:

iterations = 100

result = None
with tf.Graph().as_default():
    # Build graph with variable input
    # x = tf.Variable(np.zeros([1,1,N_SAMPLES,N_CHANNELS], dtype=np.float32), name="x")
    initial = np.random.randn(1, 1, N_SAMPLES, N_CHANNELS).astype(np.float32)
    x = tf.Variable(initial * 1e-3, name="x")
    net = _build_net(x, kernel)

    content_loss = ALPHA * 2 * tf.nn.l2_loss(net - content_features)

    # Only the last static dimension (number of filters) is needed.
    _, _, _, number = [dim.value for dim in net.get_shape()]
    feats = tf.reshape(net, (-1, number))
    gram = tf.matmul(tf.transpose(feats), feats) / N_SAMPLES
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    # Overall loss
    loss = content_loss + style_loss

    opt = tf.contrib.opt.ScipyOptimizerInterface(
        loss, method='L-BFGS-B', options={'maxiter': iterations})

    # Optimization
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        print('Started optimization.')
        opt.minimize(sess)

        print('Final loss: %s' % loss.eval())
        result = x.eval()


# ### Invert spectrogram and save the result

# In[15]:

a = np.zeros_like(a_content)
# expm1 undoes the log1p applied in read_audio_spectum.
a[:N_CHANNELS, :] = np.expm1(result[0, 0].T)

# Phase reconstruction: start from random phase, then repeatedly invert the
# spectrogram with the fixed magnitude `a`, re-analyze the audio and keep
# only the phase of the new STFT.
phase = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for _ in range(PHASE_ITERATIONS):
    S = a * np.exp(1j * phase)
    audio = librosa.istft(S)
    phase = np.angle(librosa.stft(audio, n_fft=N_FFT))

librosa.output.write_wav(OUTPUT_FILENAME, audio, fs)


# In[16]:

# print OUTPUT_FILENAME
# display(Audio(OUTPUT_FILENAME))


# ### Visualize spectrograms

# In[17]:

"""
plt.figure(figsize=(15,5))
plt.subplot(1,3,1)
plt.title('Content')
plt.imshow(a_content[:400,:])
plt.subplot(1,3,2)
plt.title('Style')
plt.imshow(a_style[:400,:])
plt.subplot(1,3,3)
plt.title('Result')
plt.imshow(a[:400,:])
plt.show()
"""


# In[ ]: