| author | Hiroshiba <Hiroshiba@users.noreply.github.com> | 2018-03-09 03:00:11 +0900 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2018-03-09 03:00:11 +0900 |
| commit | ef2be3c389412e69d33d7da0066a4831500689d2 | |
| tree | c2a1973c7f720c5b8af2cad3f8844cdf4903825b /scripts | |
| parent | 7b268e80d1c27be0db48854eb6cc918f8b61635d | |
| parent | f279994afdba8e08fc5e042a25f50db548ddbae3 | |
Merge pull request #2 from Hiroshiba/harvest-realtime
Harvest realtime
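
This merge restructures the realtime voice changer from a single conversion worker into a three-stage pipeline: an encoder that extracts acoustic features from incoming audio, a converter that maps them toward the target voice, and a decoder that synthesizes audio with the vocoder, each in its own process and linked by multiprocessing queues. Below is a minimal, self-contained sketch of that pipeline shape; the worker bodies are pass-through stand-ins, not become_yukarin's actual feature extraction, conversion, or synthesis code.

```python
# Sketch of the three-stage pipeline introduced here: encode -> convert ->
# decode, each stage a separate process, chained with Queues. The queue names
# mirror the script; the stage bodies just forward data.
from multiprocessing import Process, Queue


def encode_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        wave = queue_input.get()      # raw audio chunk from the input stream
        queue_output.put(wave)        # stand-in for acoustic feature extraction


def convert_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        feature = queue_input.get()   # input-side acoustic features
        queue_output.put(feature)     # stand-in for the learned voice conversion


def decode_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        feature = queue_input.get()   # converted acoustic features
        queue_output.put(feature)     # stand-in for vocoder synthesis


if __name__ == '__main__':
    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    workers = [
        Process(target=encode_worker, args=(queue_input_wave, queue_input_feature), daemon=True),
        Process(target=convert_worker, args=(queue_input_feature, queue_output_feature), daemon=True),
        Process(target=decode_worker, args=(queue_output_feature, queue_output_wave), daemon=True),
    ]
    for worker in workers:
        worker.start()

    queue_input_wave.put([0.0] * 1024)   # feed one dummy chunk through
    print(len(queue_output_wave.get()))  # -> 1024
```

Splitting the stages lets feature extraction, conversion, and synthesis overlap in time instead of running serially in one worker, and the queue depths expose where the pipeline backs up, which is what the new qsize() debug prints in main() report.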
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/realtime_voice_changer.py | 153 |
|---|---|---|

1 file changed, 112 insertions, 41 deletions
```diff
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index a5d1a21..e96ce4e 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -14,92 +14,128 @@
 import numpy
 import pyaudio
 from become_yukarin import AcousticConverter
+from become_yukarin import Vocoder
 from become_yukarin import RealtimeVocoder
 from become_yukarin import SuperResolution
 from become_yukarin import VoiceChanger
+from become_yukarin.config.config import Config
 from become_yukarin.config.config import create_from_json as create_config
 from become_yukarin.config.sr_config import create_from_json as create_sr_config
 from become_yukarin.data_struct import Wave
+from become_yukarin.data_struct import AcousticFeature
 from become_yukarin.voice_changer import VoiceChangerStream
 from become_yukarin.voice_changer import VoiceChangerStreamWrapper
 
 
 class AudioConfig(NamedTuple):
     rate: int
+    frame_period: float
     audio_chunk: int
     convert_chunk: int
     vocoder_buffer_size: int
     out_norm: float
 
 
-def convert_worker(
-        config,
-        acoustic_converter,
-        super_resolution,
+def encode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
         audio_config: AudioConfig,
-        queue_input_wave,
-        queue_output_wave,
+        queue_input: Queue,
+        queue_output: Queue,
 ):
-    vocoder = RealtimeVocoder(
+    wrapper.voice_changer_stream.vocoder = Vocoder(
         acoustic_feature_param=config.dataset.param.acoustic_feature_param,
         out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
     )
-    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
 
-    voice_changer = VoiceChanger(
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+
+    while True:
+        wave = queue_input.get()
+
+        w = Wave(wave=wave, sampling_rate=audio_config.rate)
+        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        start_time += time_length
+
+        feature = wrapper.pre_convert_next(time_length=time_length)
+        queue_output.put(feature)
+
+
+def convert_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        acoustic_converter: AcousticConverter,
+        super_resolution: SuperResolution,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
         super_resolution=super_resolution,
         acoustic_converter=acoustic_converter,
-        vocoder=vocoder,
     )
 
-    voice_changer_stream = VoiceChangerStream(
-        voice_changer=voice_changer,
-        sampling_rate=audio_config.rate,
-        in_dtype=numpy.float32,
-    )
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+    while True:
+        in_feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_in_feature(
+            start_time=start_time,
+            feature=in_feature,
+            frame_period=audio_config.frame_period,
+        )
+        start_time += time_length
 
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time=0.1,
+        out_feature = wrapper.convert_next(time_length=time_length)
+        queue_output.put(out_feature)
+
+
+def decode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
+        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+        out_sampling_rate=audio_config.rate,
+        buffer_size=audio_config.vocoder_buffer_size,
+        number_of_pointers=16,
     )
+    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
 
     start_time = 0
-    wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
-    wave = Wave(wave=wave, sampling_rate=audio_config.rate)
-    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
-    start_time += len(wave.wave) / wave.sampling_rate
-    wave = wrapper.convert_next(time_length=1)
-
     time_length = audio_config.convert_chunk / audio_config.rate
     wave_fragment = numpy.empty(0)
     while True:
-        wave = queue_input_wave.get()
-        w = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_out_feature(
+            start_time=start_time,
+            feature=feature,
+            frame_period=audio_config.frame_period,
+        )
         start_time += time_length
 
-        b = time.time()
-        wave = wrapper.convert_next(time_length=time_length).wave
-        print('time', time.time()-b, flush=True)
-        wrapper.remove_previous_wave()
-        print('converted wave', len(wave), flush=True)
+        wave = wrapper.post_convert_next(time_length=time_length).wave
         wave_fragment = numpy.concatenate([wave_fragment, wave])
         if len(wave_fragment) >= audio_config.audio_chunk:
             wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
-            queue_output_wave.put(wave)
+            queue_output.put(wave)
 
 
 def main():
     print('model loading...', flush=True)
 
     queue_input_wave = Queue()
+    queue_input_feature = Queue()
+    queue_output_feature = Queue()
     queue_output_wave = Queue()
 
-    model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
-    config_path = Path('./trained/harvest-innoise03/config.json')
+    model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+    config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
     config = create_config(config_path)
     acoustic_converter = AcousticConverter(config, model_path, gpu=0)
     print('model 1 loaded!', flush=True)
@@ -113,23 +149,53 @@ def main():
     audio_instance = pyaudio.PyAudio()
     audio_config = AudioConfig(
         rate=config.dataset.param.voice_param.sample_rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
         audio_chunk=config.dataset.param.voice_param.sample_rate,
         convert_chunk=config.dataset.param.voice_param.sample_rate,
         vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
         out_norm=2.5,
     )
 
-    process_converter = Process(target=convert_worker, kwargs=dict(
+    voice_changer_stream = VoiceChangerStream(
+        sampling_rate=audio_config.rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+        in_dtype=numpy.float32,
+    )
+
+    wrapper = VoiceChangerStreamWrapper(
+        voice_changer_stream=voice_changer_stream,
+        extra_time_pre=0.2,
+        extra_time=0.1,
+    )
+
+    process_encoder = Process(target=encode_worker, kwargs=dict(
         config=config,
+        wrapper=wrapper,
         audio_config=audio_config,
+        queue_input=queue_input_wave,
+        queue_output=queue_input_feature,
+    ))
+    process_encoder.start()
+
+    process_converter = Process(target=convert_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
         acoustic_converter=acoustic_converter,
         super_resolution=super_resolution,
-        queue_input_wave=queue_input_wave,
-        queue_output_wave=queue_output_wave,
+        audio_config=audio_config,
+        queue_input=queue_input_feature,
+        queue_output=queue_output_feature,
     ))
     process_converter.start()
 
-    signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
+    process_decoder = Process(target=decode_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
+        audio_config=audio_config,
+        queue_input=queue_output_feature,
+        queue_output=queue_output_wave,
+    ))
+    process_decoder.start()
 
     audio_stream = audio_instance.open(
         format=pyaudio.paFloat32,
@@ -149,6 +215,11 @@ def main():
         print('input', len(wave), flush=True)
         queue_input_wave.put(wave)
 
+        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
+        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
+        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
+        print('queue_output_wave', queue_output_wave.qsize(), flush=True)
+
         # output
         try:
             wave = queue_output_wave.get_nowait()
```
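
Note how decode_worker emits audio in fixed-size blocks: whatever length of wave post_convert_next returns is appended to wave_fragment, and a block is popped only once at least audio_chunk samples have accumulated. Here is that rebuffering logic in isolation; ChunkBuffer is an illustrative name, not part of the codebase.

```python
# Standalone version of decode_worker's rebuffering: accumulate
# variable-length converted fragments, emit fixed-size chunks.
import numpy


class ChunkBuffer:
    def __init__(self, chunk_size: int) -> None:
        self.chunk_size = chunk_size
        self.fragment = numpy.empty(0)

    def push(self, wave):
        # Same pattern as the script: concatenate, then split off one chunk
        # once enough samples have accumulated.
        self.fragment = numpy.concatenate([self.fragment, wave])
        if len(self.fragment) >= self.chunk_size:
            chunk, self.fragment = (
                self.fragment[:self.chunk_size],
                self.fragment[self.chunk_size:],
            )
            return chunk
        return None


buf = ChunkBuffer(chunk_size=4)
print(buf.push(numpy.arange(3.0)))  # None: only 3 samples buffered so far
print(buf.push(numpy.arange(3.0)))  # [0. 1. 2. 0.]: first full chunk emitted
```

Rebuffering this way keeps every write to the output queue a predictable audio_chunk samples long, regardless of how many samples each conversion step happens to produce.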
