1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
import argparse
import glob
import multiprocessing
import re
from functools import partial
from pathlib import Path
import librosa
import numpy
from become_yukarin import AcousticConverter
from become_yukarin import SuperResolution
from become_yukarin import VoiceChanger
from become_yukarin.data_struct import AcousticFeature
from become_yukarin.config.config import create_from_json as create_conv_config
from become_yukarin.config.sr_config import create_from_json as create_sr_config
from become_yukarin.dataset.dataset import AcousticFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess
# Command-line interface: model locations, I/O directories, and conversion options.
parser = argparse.ArgumentParser()
# Directory holding the acoustic-conversion model ('config.json' + 'predictor_*.npz').
parser.add_argument('-cmd', '--conv_model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
# Directory holding the super-resolution model ('config.json' + 'predictor*.npz').
parser.add_argument('-srmd', '--sr_model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
parser.add_argument('-iwd', '--input_directory', type=Path, default=Path('./input'))
parser.add_argument('-owd', '--output_directory', type=Path, default=Path('./output'))
# Checkpoint iteration to load; when omitted, the newest checkpoint is used.
parser.add_argument('-it', '--iteration', type=int)
# GPU id; when omitted, conversion runs on CPU across a process pool.
parser.add_argument('-g', '--gpu', type=int)
# When set, use the converted f0 instead of the input speaker's f0.
parser.add_argument('-f0', '--convert_f0', action='store_true')
args = parser.parse_args()
conv_model = args.conv_model_directory # type: Path
sr_model = args.sr_model_directory # type: Path
input_directory = args.input_directory
output_directory = args.output_directory
it = args.iteration
gpu = args.gpu
convert_f0 = args.convert_f0
# Load both model configs; feature-extraction parameters come from the SR config.
conv_config = create_conv_config(conv_model / 'config.json')
sr_config = create_sr_config(sr_model / 'config.json')
param = sr_config.dataset.param
acoustic_feature_process = AcousticFeatureProcess(
frame_period=param.acoustic_feature_param.frame_period,
order=param.acoustic_feature_param.order,
alpha=param.acoustic_feature_param.alpha,
f0_estimating_method=param.acoustic_feature_param.f0_estimating_method,
)
def extract_number(f):
    """Return the last integer embedded in str(f), or -1 if there is none.

    Used as a sort key to pick the newest 'predictor_<iteration>.npz'
    checkpoint from a model directory.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons).
    digits = re.findall(r"\d+", str(f))
    return int(digits[-1]) if digits else -1
def load_acoustic_converter():
    """Build the AcousticConverter from the selected predictor checkpoint.

    Uses 'predictor_<it>.npz' when an iteration was given on the command
    line, otherwise the checkpoint with the highest iteration number.
    """
    if it is None:
        candidates = sorted(conv_model.glob('predictor_*.npz'), key=extract_number)
        conv_model_path = candidates[-1]
    else:
        conv_model_path = conv_model / 'predictor_{}.npz'.format(it)
    print(conv_model_path)
    return AcousticConverter(conv_config, conv_model_path, gpu=gpu)
def load_super_resolution():
    """Build the SuperResolution model from the newest predictor checkpoint."""
    newest = sorted(sr_model.glob('predictor*.npz'), key=extract_number)[-1]
    print(newest)
    return SuperResolution(sr_config, newest, gpu=gpu)
def process(p: Path, acoustic_converter: AcousticConverter, super_resolution: SuperResolution, output: Path):
    """Convert one input wave file and write the result under `output`.

    Pipeline: load wave -> extract acoustic features -> convert features ->
    (optionally keep the input speaker's f0) -> super-resolve the spectrogram
    -> write '<output>/<stem>.wav'. Errors are reported per-file so a batch
    run keeps going.
    """
    try:
        print(str(p))
        # NOTE(review): the original also called `acoustic_converter(p)` here and
        # discarded the result — a full redundant conversion; removed.
        input_wave = acoustic_converter._wave_process(str(p), test=True)
        input_feature = acoustic_converter._feature_process(input_wave, test=True)
        converted_feature = acoustic_converter.convert_to_feature(
            input=input_feature,
            out_sampling_rate=param.voice_param.sample_rate,
        )
        merged_feature = AcousticFeature(
            # Keep the source speaker's f0 unless --convert_f0 was requested.
            f0=converted_feature.f0 if convert_f0 else input_feature.f0,
            spectrogram=converted_feature.spectrogram,
            aperiodicity=converted_feature.aperiodicity,
            mfcc=converted_feature.mfcc,
            voiced=converted_feature.voiced,
        )
        wave = super_resolution(merged_feature.spectrogram.astype(numpy.float32), acoustic_feature=merged_feature, sampling_rate=param.voice_param.sample_rate)
        librosa.output.write_wav(str(output / p.stem) + '.wav', wave.wave, wave.sampling_rate, norm=True)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still abort the batch; log and continue with the next file.
        import traceback
        print('error!', str(p))
        print(traceback.format_exc())
def run():
    """Convert every '*.wav' in input_directory and write results to output_directory.

    On CPU (no --gpu) the per-file work is fanned out over a process pool;
    with a GPU the files are processed serially in this process, since the
    GPU-backed models cannot be shared across forked workers.
    """
    input_paths = list(input_directory.glob('*.wav'))
    acoustic_converter = load_acoustic_converter()
    super_resolution = load_super_resolution()
    output = output_directory.absolute()
    output.mkdir(exist_ok=True)
    process_partial = partial(process, acoustic_converter=acoustic_converter, super_resolution=super_resolution, output=output)
    if gpu is None:
        # `with` closes and joins the pool even on error (the original
        # leaked the pool: no close()/join()).
        with multiprocessing.Pool() as pool:
            pool.map(process_partial, input_paths)
    else:
        list(map(process_partial, input_paths))

if __name__ == '__main__':
    # Guard is required for multiprocessing: under the 'spawn' start method
    # workers re-import this module, and an unguarded run() would recurse.
    run()
|