summaryrefslogtreecommitdiff
path: root/scripts/process.py
blob: 422c8c66bcac079808cfaf1580495aef8b9ca2d6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import argparse
import glob
import multiprocessing
import re
from functools import partial
from pathlib import Path

import librosa
import numpy

from become_yukarin import AcousticConverter
from become_yukarin import SuperResolution
from become_yukarin import VoiceChanger
from become_yukarin.config.config import create_from_json as create_conv_config
from become_yukarin.config.sr_config import create_from_json as create_sr_config
from become_yukarin.dataset.dataset import AcousticFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess

# --- Command-line interface and model-config loading ------------------------
# NOTE(review): this all runs at import time (no __main__ guard), so importing
# this module parses sys.argv and loads both model configs as side effects.
parser = argparse.ArgumentParser()
# Directory holding the acoustic-conversion model (config.json + predictor_*.npz).
parser.add_argument('-cmd', '--conv_model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
# Directory holding the super-resolution model (config.json + predictor*.npz).
parser.add_argument('-srmd', '--sr_model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
# Input wavs are read from here (see run(): glob '*.wav'), results written there.
parser.add_argument('-iwd', '--input_directory', type=Path, default=Path('./input'))
parser.add_argument('-owd', '--output_directory', type=Path, default=Path('./output'))
# Specific checkpoint iteration to load; when omitted, the checkpoint with the
# highest trailing number is used (see load_acoustic_converter).
parser.add_argument('-it', '--iteration', type=int)
# GPU id; when omitted (None), run() fans work out over CPU processes instead.
parser.add_argument('-g', '--gpu', type=int)
args = parser.parse_args()

conv_model = args.conv_model_directory  # type: Path
sr_model = args.sr_model_directory  # type: Path
input_directory = args.input_directory
output_directory = args.output_directory
it = args.iteration
gpu = args.gpu

# Each model directory must contain a 'config.json' describing that model.
conv_config = create_conv_config(conv_model / 'config.json')
sr_config = create_sr_config(sr_model / 'config.json')

# Feature-extraction parameters are taken from the SR model's dataset config.
param = sr_config.dataset.param
# NOTE(review): acoustic_feature_process is never referenced again in this
# script — confirm whether it is needed before removing.
acoustic_feature_process = AcousticFeatureProcess(
    frame_period=param.acoustic_feature_param.frame_period,
    order=param.acoustic_feature_param.order,
    alpha=param.acoustic_feature_param.alpha,
    f0_estimating_method=param.acoustic_feature_param.f0_estimating_method,
)

def extract_number(f):
    """Return the last integer embedded in str(f), or -1 when there is none.

    Used as a sort key so checkpoint files order numerically
    ('predictor_100.npz' after 'predictor_20.npz') rather than lexically.
    """
    # Raw string fixes the invalid escape sequence "\d" in the original
    # (DeprecationWarning today, a SyntaxError in future Python versions).
    matches = re.findall(r"\d+", str(f))
    return int(matches[-1]) if matches else -1

def load_acoustic_converter():
    """Locate the conversion-model checkpoint and build an AcousticConverter.

    When --iteration was given, load exactly that checkpoint; otherwise pick
    the checkpoint with the largest trailing number in its filename.
    """
    if it is None:
        checkpoints = sorted(conv_model.glob('predictor_*.npz'), key=extract_number)
        conv_model_path = checkpoints[-1]
    else:
        conv_model_path = conv_model / 'predictor_{}.npz'.format(it)
    print(conv_model_path)
    return AcousticConverter(conv_config, conv_model_path, gpu=gpu)
def load_super_resolution():
    """Build a SuperResolution model from the newest checkpoint in sr_model."""
    checkpoints = list(sr_model.glob('predictor*.npz'))
    checkpoints.sort(key=extract_number)
    newest = checkpoints[-1]
    print(newest)
    return SuperResolution(sr_config, newest, gpu=gpu)

def process(p: Path, acoustic_converter: AcousticConverter, super_resolution: SuperResolution, output: Path):
    """Convert one input wav at `p` and write the result wav into `output`.

    Any failure is logged with a traceback instead of raised, so one bad file
    does not abort the rest of the batch (this runs under Pool.map).
    """
    try:
        print(str(p))

        # NOTE(review): the original also ran `acoustic_converter(p)` here and
        # discarded the result — a redundant extra conversion pass (it also
        # shadowed the `input` builtin). Removed; confirm it had no needed
        # side effect.
        input_wave = acoustic_converter._wave_process(str(p), test=True)
        input_feature = acoustic_converter._feature_process(input_wave, test=True)
        converted_feature = acoustic_converter.convert_to_feature(
            input=input_feature,
            out_sampling_rate=param.voice_param.sample_rate,
        )

        wave = super_resolution(
            converted_feature.spectrogram.astype(numpy.float64),
            acoustic_feature=converted_feature.astype(numpy.float64),
            sampling_rate=param.voice_param.sample_rate,
        )

        librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate and the worker can be interrupted.
        import traceback
        print('error!', str(p))
        print(traceback.format_exc())

def run():
    """Convert every *.wav in input_directory, writing results to output_directory.

    Uses a multiprocessing pool on CPU; runs serially when a GPU id is given
    (GPU-backed models are not safely shareable across worker processes).
    """
    input_paths = list(input_directory.glob('*.wav'))

    acoustic_converter = load_acoustic_converter()
    super_resolution = load_super_resolution()

    output = output_directory.absolute()
    output.mkdir(exist_ok=True)

    process_partial = partial(process, acoustic_converter=acoustic_converter, super_resolution=super_resolution, output=output)
    if gpu is None:
        # Context manager closes and joins the pool; the original created the
        # Pool without ever closing it, leaking worker processes.
        with multiprocessing.Pool() as pool:
            pool.map(process_partial, input_paths)
    else:
        for input_path in input_paths:
            process_partial(input_path)

run()