-rw-r--r--  become_yukarin/vocoder.py                   |   4
-rw-r--r--  scripts/extract_spectrogram_pair.py         |   2
-rw-r--r--  scripts/super_resolution_test.py            |  12
-rw-r--r--  tests/test-deep-learning-yuduki-yukari.wav  | bin 0 -> 401534 bytes
-rw-r--r--  tests/test_voice_changer.py                 |  89
5 files changed, 103 insertions, 4 deletions
diff --git a/become_yukarin/vocoder.py b/become_yukarin/vocoder.py
index a49e8f2..f1a9f03 100644
--- a/become_yukarin/vocoder.py
+++ b/become_yukarin/vocoder.py
@@ -20,6 +20,7 @@ class Vocoder(object):
             frame_period=acoustic_feature_param.frame_period,
             order=acoustic_feature_param.order,
             alpha=acoustic_feature_param.alpha,
+            f0_estimating_method=acoustic_feature_param.f0_estimating_method,
         )
 
     def encode(self, wave: Wave):
@@ -102,4 +103,5 @@ class RealtimeVocoder(Vocoder):
             self.decode(f)
 
     def __del__(self):
-        apidefinitions._DestroySynthesizer(self._synthesizer)
+        if hasattr(self, '_synthesizer'):
+            apidefinitions._DestroySynthesizer(self._synthesizer)
diff --git a/scripts/extract_spectrogram_pair.py b/scripts/extract_spectrogram_pair.py
index be21459..f2b96b4 100644
--- a/scripts/extract_spectrogram_pair.py
+++ b/scripts/extract_spectrogram_pair.py
@@ -29,6 +29,7 @@ parser.add_argument('--pad_second', type=float, default=base_voice_param.pad_sec
 parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period)
 parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order)
 parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha)
+parser.add_argument('--f0_estimating_method', default=base_acoustic_feature_param.f0_estimating_method)
 parser.add_argument('--enable_overwrite', action='store_true')
 arguments = parser.parse_args()
 
@@ -53,6 +54,7 @@ def generate_file(path):
         frame_period=arguments.frame_period,
         order=arguments.order,
         alpha=arguments.alpha,
+        f0_estimating_method=arguments.f0_estimating_method,
     )
     feature = acoustic_feature_process(wave, test=True).astype_only_float(numpy.float32)
     high_spectrogram = feature.spectrogram
diff --git a/scripts/super_resolution_test.py b/scripts/super_resolution_test.py
index 8b04ce0..4f34632 100644
--- a/scripts/super_resolution_test.py
+++ b/scripts/super_resolution_test.py
@@ -18,10 +18,12 @@ parser.add_argument('model_names', nargs='+')
 parser.add_argument('-md', '--model_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/'))
 parser.add_argument('-iwd', '--input_wave_directory', type=Path, default=Path('/mnt/dwango/hiroshiba/become-yukarin/dataset/yukari-wave/yukari-news/'))
+parser.add_argument('-g', '--gpu', type=int)
 args = parser.parse_args()
 
 model_directory = args.model_directory  # type: Path
 input_wave_directory = args.input_wave_directory  # type: Path
+gpu = args.gpu
 
 paths_test = list(Path('./test_data_sr/').glob('*.wav'))
@@ -41,6 +43,7 @@ def process(p: Path, super_resolution: SuperResolution):
         frame_period=param.acoustic_feature_param.frame_period,
         order=param.acoustic_feature_param.order,
         alpha=param.acoustic_feature_param.alpha,
+        f0_estimating_method=param.acoustic_feature_param.f0_estimating_method,
     )
 
     try:
@@ -68,7 +71,7 @@ for model_name in args.model_names:
     model_paths = base_model.glob('predictor*.npz')
     model_path = list(sorted(model_paths, key=extract_number))[-1]
     print(model_path)
-    super_resolution = SuperResolution(config, model_path)
+    super_resolution = SuperResolution(config, model_path, gpu=gpu)
 
     output = Path('./output').absolute() / base_model.name
     output.mkdir(exist_ok=True)
@@ -76,5 +79,8 @@ for model_name in args.model_names:
     paths = [path_train, path_test] + paths_test
     process_partial = partial(process, super_resolution=super_resolution)
-    pool = multiprocessing.Pool()
-    pool.map(process_partial, paths)
+    if gpu is None:
+        pool = multiprocessing.Pool()
+        pool.map(process_partial, paths)
+    else:
+        list(map(process_partial, paths))
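
Note on the Pool change in scripts/super_resolution_test.py: the serial fallback when --gpu is given is presumably to avoid sharing a CUDA context across fork()ed Pool workers, which generally does not work. A minimal, self-contained sketch of the same dispatch pattern; the names below are illustrative, not from the repository:

import multiprocessing
from functools import partial


def process(path, gpu=None):
    # stand-in for the real per-file conversion work
    print(path, gpu)


def run_all(paths, gpu=None):
    process_partial = partial(process, gpu=gpu)
    if gpu is None:
        # CPU-only: fan the files out across worker processes
        with multiprocessing.Pool() as pool:
            pool.map(process_partial, paths)
    else:
        # GPU: run in-process and in order; list() forces the lazy map
        list(map(process_partial, paths))


if __name__ == '__main__':
    run_all(['a.wav', 'b.wav'])
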
diff --git a/tests/test-deep-learning-yuduki-yukari.wav b/tests/test-deep-learning-yuduki-yukari.wav
new file mode 100644
index 0000000..31d306e
Binary files /dev/null and b/tests/test-deep-learning-yuduki-yukari.wav differ
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
new file mode 100644
index 0000000..2a42f88
--- /dev/null
+++ b/tests/test_voice_changer.py
@@ -0,0 +1,89 @@
+from pathlib import Path
+from typing import NamedTuple
+
+import librosa
+import numpy
+
+from become_yukarin import AcousticConverter
+from become_yukarin import RealtimeVocoder
+from become_yukarin import SuperResolution
+from become_yukarin import VoiceChanger
+from become_yukarin.config.config import create_from_json as create_config
+from become_yukarin.config.sr_config import create_from_json as create_sr_config
+from become_yukarin.data_struct import Wave
+from become_yukarin.voice_changer import VoiceChangerStream
+from become_yukarin.voice_changer import VoiceChangerStreamWrapper
+
+
+class AudioConfig(NamedTuple):
+    rate: int
+    chunk: int
+    vocoder_buffer_size: int
+    out_norm: float
+
+
+model_base_path = Path('~/trained/')
+test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
+test_output_path = Path('tests/output.wav')
+
+print('model loading...', flush=True)
+
+model_path = model_base_path / Path('harvest-innoise03/predictor_1340000.npz')
+config_path = model_base_path / Path('harvest-innoise03/config.json')
+config = create_config(config_path)
+acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+print('model 1 loaded!', flush=True)
+
+model_path = model_base_path / Path('sr-noise3/predictor_165000.npz')
+config_path = model_base_path / Path('sr-noise3/config.json')
+sr_config = create_sr_config(config_path)
+super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+print('model 2 loaded!', flush=True)
+
+audio_config = AudioConfig(
+    rate=config.dataset.param.voice_param.sample_rate,
+    chunk=config.dataset.param.voice_param.sample_rate // 4,
+    vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
+    out_norm=4.5,
+)
+
+vocoder = RealtimeVocoder(
+    acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+    out_sampling_rate=audio_config.rate,
+    buffer_size=audio_config.vocoder_buffer_size,
+    number_of_pointers=16,
+)
+# vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+
+voice_changer = VoiceChanger(
+    super_resolution=super_resolution,
+    acoustic_converter=acoustic_converter,
+    vocoder=vocoder,
+)
+
+voice_changer_stream = VoiceChangerStream(
+    voice_changer=voice_changer,
+    sampling_rate=audio_config.rate,
+    in_dtype=numpy.float32,
+)
+
+wrapper = VoiceChangerStreamWrapper(
+    voice_changer_stream=voice_changer_stream,
+    extra_time=0.2,
+)
+
+raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
+wave_out_list = []
+
+start_time = 0
+for i in range(0, len(raw_wave), audio_config.chunk):
+    wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
+    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
+    start_time += len(wave_in.wave) / wave_in.sampling_rate
+
+    wave_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+    wave_out_list.append(wave_out)
+    wrapper.remove_previous_wave()
+
+out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
+librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)
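
Note on the chunked loop in tests/test_voice_changer.py: the input wave is pushed into the stream in fixed-size chunks, start_time tracks how much audio has been fed in, and convert_next() pulls one chunk's worth of converted audio back out before the consumed input is dropped. A minimal sketch of the same push/convert/trim pattern against a stand-in stream; none of these names are the become_yukarin API:

import numpy


class EchoStream:
    """Stand-in stream that just buffers samples and returns them unchanged."""

    def __init__(self, rate):
        self.rate = rate
        self.buffer = numpy.zeros(0, dtype=numpy.float32)

    def add_wave(self, wave):
        self.buffer = numpy.concatenate([self.buffer, wave])

    def convert_next(self, time_length):
        # hand back the oldest time_length seconds and trim them from the buffer
        n = int(time_length * self.rate)
        out, self.buffer = self.buffer[:n], self.buffer[n:]
        return out


rate, chunk = 16000, 16000 // 4
raw_wave = numpy.random.randn(rate).astype(numpy.float32)  # 1 second of noise
stream = EchoStream(rate)

out_list = []
for i in range(0, len(raw_wave), chunk):
    stream.add_wave(raw_wave[i:i + chunk])              # push one chunk in
    out_list.append(stream.convert_next(chunk / rate))  # pull one chunk out

out_wave = numpy.concatenate(out_list)
assert len(out_wave) == len(raw_wave)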

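Note on the __del__ guard in become_yukarin/vocoder.py: __del__ runs even when __init__ raised before the attribute was assigned, so unconditionally destroying the native synthesizer handle can raise AttributeError during garbage collection and mask the original error. A minimal sketch of the failure mode and the fix; the class below is hypothetical, standing in for RealtimeVocoder:

class Synthesizer:
    def __init__(self, fail=False):
        if fail:
            # _synthesizer is never assigned on this path
            raise RuntimeError('init failed')
        self._synthesizer = object()  # stands in for a native handle

    def __del__(self):
        # Without the hasattr() check, a failed __init__ would surface as an
        # AttributeError from the finalizer instead of the original error.
        if hasattr(self, '_synthesizer'):
            pass  # release the handle here, as the real code does


try:
    Synthesizer(fail=True)
except RuntimeError:
    pass  # __del__ still fires later, but the guard keeps it quiet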