author     Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>  2018-01-31 05:09:07 +0900
committer  Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>  2018-01-31 05:09:07 +0900
commit     48addd22a87f248bb8041bca47e9c209a16175a4 (patch)
tree       3c2386adafdea434483106a646f33c6f6a7e10cb
parent     b432502ccc924bb10bee0cf8fe11afd0a5f4757d (diff)
Add code for trying out RealtimeVocoder
-rw-r--r--   become_yukarin/vocoder.py                    4
-rw-r--r--   scripts/extract_spectrogram_pair.py          2
-rw-r--r--   scripts/super_resolution_test.py            12
-rw-r--r--   tests/test-deep-learning-yuduki-yukari.wav  bin 0 -> 401534 bytes
-rw-r--r--   tests/test_voice_changer.py                 89
5 files changed, 103 insertions, 4 deletions
diff --git a/become_yukarin/vocoder.py b/become_yukarin/vocoder.py
index a49e8f2..f1a9f03 100644
--- a/become_yukarin/vocoder.py
+++ b/become_yukarin/vocoder.py
@@ -20,6 +20,7 @@ class Vocoder(object):
frame_period=acoustic_feature_param.frame_period,
order=acoustic_feature_param.order,
alpha=acoustic_feature_param.alpha,
+ f0_estimating_method=acoustic_feature_param.f0_estimating_method,
)
def encode(self, wave: Wave):
@@ -102,4 +103,5 @@ class RealtimeVocoder(Vocoder):
self.decode(f)
def __del__(self):
- apidefinitions._DestroySynthesizer(self._synthesizer)
+ if hasattr(self, '_synthesizer'):
+ apidefinitions._DestroySynthesizer(self._synthesizer)
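
The `__del__` fix above guards against a half-constructed object: Python runs `__del__` even when `__init__` raised before `self._synthesizer` was assigned, so the old destructor would throw a second, confusing `AttributeError` during garbage collection. A minimal sketch of the failure mode, with `Handle` standing in for `RealtimeVocoder` (not the project's actual API):

```python
# Why the hasattr guard matters: __del__ runs even after a failed __init__.
class Handle:
    def __init__(self, fail: bool = False):
        if fail:
            raise RuntimeError('failed before the synthesizer was created')
        self._synthesizer = object()  # stand-in for the WORLD synthesizer struct

    def __del__(self):
        if hasattr(self, '_synthesizer'):
            pass  # release here, e.g. apidefinitions._DestroySynthesizer(...)

try:
    Handle(fail=True)
except RuntimeError:
    pass  # the half-built object is now collected without a spurious error
```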
diff --git a/scripts/extract_spectrogram_pair.py b/scripts/extract_spectrogram_pair.py
index be21459..f2b96b4 100644
--- a/scripts/extract_spectrogram_pair.py
+++ b/scripts/extract_spectrogram_pair.py
@@ -29,6 +29,7 @@ parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_sec
parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period)
parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order)
parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha)
+parser.add_argument('--f0_estimating_method', default=base_acoustic_feature_param.f0_estimating_method)
parser.add_argument('--enable_overwrite', action='store_true')
arguments = parser.parse_args()
@@ -53,6 +54,7 @@ def generate_file(path):
frame_period=arguments.frame_period,
order=arguments.order,
alpha=arguments.alpha,
+ f0_estimating_method=arguments.f0_estimating_method,
)
feature = acoustic_feature_process(wave, test=True).astype_only_float(numpy.float32)
high_spectrogram = feature.spectrogram
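
This hunk simply threads the new `--f0_estimating_method` flag from argparse into the feature process. A hedged sketch of how the flag could be tightened with `choices`; `'dio'` and `'harvest'` are assumptions based on the two F0 estimators WORLD ships (the `harvest-innoise03` model name later in this commit suggests `harvest` is one valid value), not values confirmed by the codebase:

```python
# Hypothetical hardening of the new flag: restrict it to known estimators
# instead of accepting any string. 'dio' and 'harvest' are assumed values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--f0_estimating_method',
    choices=('dio', 'harvest'),  # assumption: WORLD's two F0 estimators
    default='dio',               # stand-in for base_acoustic_feature_param's default
)
args = parser.parse_args(['--f0_estimating_method', 'harvest'])
print(args.f0_estimating_method)  # -> harvest
```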
diff --git a/scripts/super_resolution_test.py b/scripts/super_resolution_test.py
index 8b04ce0..4f34632 100644
--- a/scripts/super_resolution_test.py
+++ b/scripts/super_resolution_test.py
@@ -18,10 +18,12 @@ parser.add_argument('model_names', nargs='+')
parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
parser.add_argument('-iwd', '--input_wave_directory', type=Path,
default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/yukari-wave/yukari-news/'))
+parser.add_argument('-g', '--gpu', type=int)
args = parser.parse_args()
model_directory = args.model_directory # type: Path
input_wave_directory = args.input_wave_directory # type: Path
+gpu = args.gpu
paths_test = list(Path('./test_data_sr/').glob('*.wav'))
@@ -41,6 +43,7 @@ def process(p: Path, super_resolution: SuperResolution):
frame_period=param.acoustic_feature_param.frame_period,
order=param.acoustic_feature_param.order,
alpha=param.acoustic_feature_param.alpha,
+ f0_estimating_method=param.acoustic_feature_param.f0_estimating_method,
)
try:
@@ -68,7 +71,7 @@ for model_name in args.model_names:
model_paths = base_model.glob('predictor*.npz')
model_path = list(sorted(model_paths, key=extract_number))[-1]
print(model_path)
- super_resolution = SuperResolution(config, model_path)
+ super_resolution = SuperResolution(config, model_path, gpu=gpu)
output = Path('./output').absolute() / base_model.name
output.mkdir(exist_ok=True)
@@ -76,5 +79,8 @@ for model_name in args.model_names:
paths = [path_train, path_test] + paths_test
process_partial = partial(process, super_resolution=super_resolution)
- pool = multiprocessing.Pool()
- pool.map(process_partial, paths)
+ if gpu is None:
+ pool = multiprocessing.Pool()
+ pool.map(process_partial, paths)
+ else:
+ list(map(process_partial, paths))
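
The dispatch change at the end of super_resolution_test.py is the interesting part: a forked `multiprocessing.Pool` worker cannot safely reuse the parent process's CUDA context, so when `--gpu` is given the files are converted serially in a single process, and the pool is kept only for the CPU path. A self-contained sketch of the same pattern (`process_one` is a placeholder, not the script's `process`):

```python
# CPU: fan work out across forked workers. GPU: stay in one process so a
# single CUDA context is used. process_one stands in for the real work.
import multiprocessing
from functools import partial
from typing import Optional


def process_one(path: str, gpu: Optional[int] = None) -> str:
    device = 'cpu' if gpu is None else f'gpu{gpu}'
    return f'{path} converted on {device}'


def run_all(paths, gpu=None):
    work = partial(process_one, gpu=gpu)
    if gpu is None:
        with multiprocessing.Pool() as pool:  # parallel CPU path
            return pool.map(work, paths)
    return list(map(work, paths))  # serial: no forked CUDA contexts


if __name__ == '__main__':
    print(run_all(['a.wav', 'b.wav']))         # parallel on CPU
    print(run_all(['a.wav', 'b.wav'], gpu=0))  # serial on one GPU
```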
diff --git a/tests/test-deep-learning-yuduki-yukari.wav b/tests/test-deep-learning-yuduki-yukari.wav
new file mode 100644
index 0000000..31d306e
--- /dev/null
+++ b/tests/test-deep-learning-yuduki-yukari.wav
Binary files differ
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
new file mode 100644
index 0000000..2a42f88
--- /dev/null
+++ b/tests/test_voice_changer.py
@@ -0,0 +1,89 @@
+from pathlib import Path
+from typing import NamedTuple
+
+import librosa
+import numpy
+
+from become_yukarin import AcousticConverter
+from become_yukarin import RealtimeVocoder
+from become_yukarin import SuperResolution
+from become_yukarin import VoiceChanger
+from become_yukarin.config.config import create_from_json as create_config
+from become_yukarin.config.sr_config import create_from_json as create_sr_config
+from become_yukarin.data_struct import Wave
+from become_yukarin.voice_changer import VoiceChangerStream
+from become_yukarin.voice_changer import VoiceChangerStreamWrapper
+
+
+class AudioConfig(NamedTuple):
+ rate: int
+ chunk: int
+ vocoder_buffer_size: int
+ out_norm: float
+
+
+model_base_path = Path('~/trained/')
+test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
+test_output_path = Path('tests/output.wav')
+
+print('model loading...', flush=True)
+
+model_path = model_base_path / Path('harvest-innoise03/predictor_1340000.npz')
+config_path = model_base_path / Path('harvest-innoise03/config.json')
+config = create_config(config_path)
+acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+print('model 1 loaded!', flush=True)
+
+model_path = model_base_path / Path('sr-noise3/predictor_165000.npz')
+config_path = model_base_path / Path('sr-noise3/config.json')
+sr_config = create_sr_config(config_path)
+super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+print('model 2 loaded!', flush=True)
+
+audio_config = AudioConfig(
+ rate=config.dataset.param.voice_param.sample_rate,
+ chunk=config.dataset.param.voice_param.sample_rate // 4,
+ vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
+ out_norm=4.5,
+)
+
+vocoder = RealtimeVocoder(
+ acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+ out_sampling_rate=audio_config.rate,
+ buffer_size=audio_config.vocoder_buffer_size,
+ number_of_pointers=16,
+)
+# vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+
+voice_changer = VoiceChanger(
+ super_resolution=super_resolution,
+ acoustic_converter=acoustic_converter,
+ vocoder=vocoder,
+)
+
+voice_changer_stream = VoiceChangerStream(
+ voice_changer=voice_changer,
+ sampling_rate=audio_config.rate,
+ in_dtype=numpy.float32,
+)
+
+wrapper = VoiceChangerStreamWrapper(
+ voice_changer_stream=voice_changer_stream,
+ extra_time=0.2,
+)
+
+raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
+wave_out_list = []
+
+start_time = 0
+for i in range(0, len(raw_wave), audio_config.chunk):
+ wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
+ start_time += len(wave_in.wave) / wave_in.sampling_rate
+
+ wave_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+ wave_out_list.append(wave_out)
+ wrapper.remove_previous_wave()
+
+out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
+librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)
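
Two environment caveats for anyone replaying this test script: pathlib does not expand `~`, so `Path('~/trained/')` resolves only after an explicit `.expanduser()`, and `librosa.output.write_wav` exists only in librosa releases before 0.8. A hedged sketch of the equivalent write on a newer stack, using `soundfile` (an extra dependency this commit does not declare):

```python
# Assumed modern replacement for librosa.output.write_wav (removed in
# librosa 0.8); soundfile writes float32 PCM directly.
from pathlib import Path

import numpy
import soundfile

model_base_path = Path('~/trained/').expanduser()   # pathlib keeps '~' literal otherwise
out_wave = numpy.zeros(16000, dtype=numpy.float32)  # stand-in for the converted audio
soundfile.write('tests/output.wav', out_wave, samplerate=16000)
```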