| author | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-02-04 09:39:39 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-02-04 09:39:39 +0900 |
| commit | 29f0994ea34f3df7dd5eea0b330d429b2e492211 | |
| tree | 4c6827c3983ad7463f582c619db937e9f7e9f561 | |
| parent | 48addd22a87f248bb8041bca47e9c209a16175a4 | |
リアルタイム変換が可能 (real-time conversion is now possible)
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | become_yukarin/dataset/dataset.py | 15 |
| -rw-r--r-- | become_yukarin/vocoder.py | 6 |
| -rw-r--r-- | become_yukarin/voice_changer.py | 12 |
| -rw-r--r-- | scripts/realtime_voice_changer.py | 176 |
| -rw-r--r-- | tests/test_voice_changer.py | 14 |
5 files changed, 134 insertions, 89 deletions
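In short, the commit replaces the callback-driven PyAudio setup with a separate conversion process fed through `multiprocessing` queues, while the main process reads and writes the audio stream in a blocking loop. A minimal sketch of that producer/consumer pattern follows before the full diff; it is an illustration only, with a placeholder gain step standing in for the real vocoder and acoustic-model pipeline, and the chunk size of 1024 chosen arbitrarily:

```python
# Sketch of the queue-based conversion worker pattern this commit introduces.
# The 0.5 gain is a placeholder; the actual script runs the WORLD vocoder and
# the trained acoustic model inside the worker process.
from multiprocessing import Process, Queue

import numpy


def convert_worker(queue_input_wave: Queue, queue_output_wave: Queue):
    while True:
        wave = queue_input_wave.get()                    # block until the main loop sends a chunk
        converted = (wave * 0.5).astype(numpy.float32)   # placeholder for the heavy conversion step
        queue_output_wave.put(converted)


if __name__ == '__main__':
    queue_input_wave = Queue()
    queue_output_wave = Queue()
    worker = Process(target=convert_worker, args=(queue_input_wave, queue_output_wave), daemon=True)
    worker.start()

    # main loop: feed input chunks, pull converted chunks when they are ready
    for _ in range(3):
        queue_input_wave.put(numpy.random.rand(1024).astype(numpy.float32))
        print(len(queue_output_wave.get()))
```

Keeping the heavy conversion in its own process means a slow model step only delays output, instead of stalling the audio callbacks themselves.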
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 178844a..7e3acbf 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -87,8 +87,16 @@ class WaveFileLoadProcess(BaseDataProcess):
 
 
 class AcousticFeatureProcess(BaseDataProcess):
-    def __init__(self, frame_period, order, alpha, f0_estimating_method, f0_floor=71, f0_ceil=800,
-                 dtype=numpy.float32) -> None:
+    def __init__(
+            self,
+            frame_period,
+            order,
+            alpha,
+            f0_estimating_method,
+            f0_floor=71,
+            f0_ceil=800,
+            dtype=numpy.float32,
+    ) -> None:
         self._frame_period = frame_period
         self._order = order
         self._alpha = alpha
@@ -110,7 +118,8 @@ class AcousticFeatureProcess(BaseDataProcess):
                 f0_ceil=self._f0_ceil,
             )
         else:
-            _f0, t = pyworld.harvest(
+            from world4py.np import apis
+            _f0, t = apis.harvest(
                 x,
                 fs,
                 frame_period=self._frame_period,
diff --git a/become_yukarin/vocoder.py b/become_yukarin/vocoder.py
index f1a9f03..0674e0f 100644
--- a/become_yukarin/vocoder.py
+++ b/become_yukarin/vocoder.py
@@ -65,7 +65,7 @@ class RealtimeVocoder(Vocoder):
             number_of_pointers,  # number of pointers
             self._synthesizer,
         )
-        self._before_buffer = None  # for holding memory
+        self._before_buffer = []  # for holding memory
 
     def decode(
             self,
@@ -93,7 +93,9 @@ class RealtimeVocoder(Vocoder):
             sampling_rate=self.out_sampling_rate,
         )
 
-        self._before_buffer = (f0_buffer, sp_buffer, ap_buffer)  # for holding memory
+        self._before_buffer.append((f0_buffer, sp_buffer, ap_buffer))  # for holding memory
+        if len(self._before_buffer) > 16:
+            self._before_buffer.pop(0)
         return out_wave
 
     def warm_up(self, time_length: float):
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 05f5a96..5e0eac0 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -83,7 +83,7 @@ class VoiceChangerStream(object):
     def remove_wave(self, end_time: float):
         self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
 
-    def convert(self, start_time: float, time_length: float):
+    def convert_to_feature(self, start_time: float, time_length: float):
         end_time = start_time + time_length
         buffer_list = []
         stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
@@ -121,12 +121,18 @@ class VoiceChangerStream(object):
             buffer_list.append(pad)
 
         buffer = numpy.concatenate(buffer_list)
-        print('buffer', len(buffer), flush=True)
         in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
         in_feature = self.vocoder.encode(in_wave)
         out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
         return out_feature
 
+    def convert(self, start_time: float, time_length: float):
+        feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
+        out_wave = self.vocoder.decode(
+            acoustic_feature=feature,
+        )
+        return out_wave
+
     def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
         """
        :param extra_time: 音声変換時に余分に使うデータの時間長。ゼロパディングを防ぐ。
@@ -136,7 +142,7 @@ class VoiceChangerStream(object):
         start_time -= extra_time
         time_length += extra_time * 2
 
-        extra_feature = self.convert(start_time=start_time, time_length=time_length)
+        extra_feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
 
         pad = int(extra_time / (frame_period / 1000))
         feature = AcousticFeature(
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index 4c50963..a5d1a21 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -1,7 +1,14 @@
-import queue
+import world4py
+
+world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
+
 from functools import partial
 from pathlib import Path
+import signal
+import time
 from typing import NamedTuple
+from multiprocessing import Queue
+from multiprocessing import Process
 
 import numpy
 import pyaudio
@@ -19,66 +26,85 @@ from become_yukarin.voice_changer import VoiceChangerStreamWrapper
 
 
 class AudioConfig(NamedTuple):
     rate: int
-    chunk: int
+    audio_chunk: int
+    convert_chunk: int
     vocoder_buffer_size: int
     out_norm: float
 
 
-queue_input_wave = queue.Queue()
-queue_output_wave = queue.Queue()
-queue_output_fragment_wave = queue.Queue(maxsize=1)
+def convert_worker(
+        config,
+        acoustic_converter,
+        super_resolution,
+        audio_config: AudioConfig,
+        queue_input_wave,
+        queue_output_wave,
+):
+    vocoder = RealtimeVocoder(
+        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+        out_sampling_rate=audio_config.rate,
+        buffer_size=audio_config.vocoder_buffer_size,
+        number_of_pointers=16,
+    )
+    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+    voice_changer = VoiceChanger(
+        super_resolution=super_resolution,
+        acoustic_converter=acoustic_converter,
+        vocoder=vocoder,
+    )
+
+    voice_changer_stream = VoiceChangerStream(
+        voice_changer=voice_changer,
+        sampling_rate=audio_config.rate,
+        in_dtype=numpy.float32,
+    )
+
+    wrapper = VoiceChangerStreamWrapper(
+        voice_changer_stream=voice_changer_stream,
+        extra_time=0.1,
+    )
 
-def convert_worker(audio_config: AudioConfig, wrapper: VoiceChangerStreamWrapper):
     start_time = 0
-    time_length = audio_config.chunk / audio_config.rate
+    wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
+    wave = Wave(wave=wave, sampling_rate=audio_config.rate)
+    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
+    start_time += len(wave.wave) / wave.sampling_rate
+    wave = wrapper.convert_next(time_length=1)
+
+    time_length = audio_config.convert_chunk / audio_config.rate
+    wave_fragment = numpy.empty(0)
     while True:
         wave = queue_input_wave.get()
-        wave = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
-        start_time += len(wave.wave) / wave.sampling_rate
+        w = Wave(wave=wave, sampling_rate=audio_config.rate)
+        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        start_time += time_length
 
-        wave = wrapper.convert_next(time_length=time_length)
-        queue_output_wave.put(wave.wave)
+        b = time.time()
+        wave = wrapper.convert_next(time_length=time_length).wave
+        print('time', time.time()-b, flush=True)
         wrapper.remove_previous_wave()
+        print('converted wave', len(wave), flush=True)
 
-
-def input_callback(in_data, frame_count, time_info, status_flags, audio_config: AudioConfig):
-    print('input', status_flags, flush=True)
-    wave = numpy.fromstring(in_data, dtype=numpy.float32)
-    queue_input_wave.put(wave)
-    return None, pyaudio.paContinue
-
-
-def output_callback(_, frame_count, time_info, status_flags, audio_config: AudioConfig):
-    print('output', status_flags, flush=True)
-    try:
-        wave = queue_output_fragment_wave.get_nowait()
-    except:
-        wave = numpy.empty(0)
-
-    while len(wave) < audio_config.chunk:
-        wave_next = queue_output_wave.get()
-        wave = numpy.concatenate([wave, wave_next])
-
-    wave, wave_fragment = wave[:audio_config.chunk], wave[audio_config.chunk:]
-    queue_output_fragment_wave.put(wave_fragment)
-
-    wave *= audio_config.out_norm
-    b = wave.astype(numpy.float32).tobytes()
-    return b, pyaudio.paContinue
+        wave_fragment = numpy.concatenate([wave_fragment, wave])
+        if len(wave_fragment) >= audio_config.audio_chunk:
+            wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
+            queue_output_wave.put(wave)
 
 
 def main():
     print('model loading...', flush=True)
-    model_path = Path('./trained/mfcc8-preconvert-innoise03/predictor_350000.npz')
-    config_path = Path('./trained/mfcc8-preconvert-innoise03/config.json')
+    queue_input_wave = Queue()
+    queue_output_wave = Queue()
+
+    model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
+    config_path = Path('./trained/harvest-innoise03/config.json')
     config = create_config(config_path)
     acoustic_converter = AcousticConverter(config, model_path, gpu=0)
     print('model 1 loaded!', flush=True)
 
-    model_path = Path('./trained/sr-noise3/predictor_70000.npz')
+    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
     config_path = Path('./trained/sr-noise3/config.json')
     sr_config = create_sr_config(config_path)
     super_resolution = SuperResolution(sr_config, model_path, gpu=0)
@@ -87,55 +113,53 @@ def main():
     audio_instance = pyaudio.PyAudio()
     audio_config = AudioConfig(
         rate=config.dataset.param.voice_param.sample_rate,
-        chunk=config.dataset.param.voice_param.sample_rate,
+        audio_chunk=config.dataset.param.voice_param.sample_rate,
+        convert_chunk=config.dataset.param.voice_param.sample_rate,
         vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
-        out_norm=4.5,
+        out_norm=2.5,
     )
 
-    vocoder = RealtimeVocoder(
-        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-        out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
-    )
-    vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
-
-    voice_changer = VoiceChanger(
-        super_resolution=super_resolution,
+    process_converter = Process(target=convert_worker, kwargs=dict(
+        config=config,
+        audio_config=audio_config,
         acoustic_converter=acoustic_converter,
-        vocoder=vocoder,
-    )
-
-    voice_changer_stream = VoiceChangerStream(
-        voice_changer=voice_changer,
-        sampling_rate=audio_config.rate,
-        in_dtype=numpy.float32,
-    )
+        super_resolution=super_resolution,
+        queue_input_wave=queue_input_wave,
+        queue_output_wave=queue_output_wave,
+    ))
+    process_converter.start()
 
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time=0.2,
-    )
+    signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
 
-    input_audio_stream = audio_instance.open(
+    audio_stream = audio_instance.open(
         format=pyaudio.paFloat32,
         channels=1,
         rate=audio_config.rate,
-        frames_per_buffer=audio_config.chunk,
+        frames_per_buffer=audio_config.audio_chunk,
         input=True,
-        stream_callback=partial(input_callback, audio_config=audio_config)
-    )
-
-    output_audio_stream = audio_instance.open(
-        format=pyaudio.paFloat32,
-        channels=1,
-        rate=audio_config.rate,
-        frames_per_buffer=audio_config.chunk,
         output=True,
-        stream_callback=partial(output_callback, audio_config=audio_config)
     )
 
-    convert_worker(audio_config, wrapper)
+    # process_converter.join()
+
+    while True:
+        # input audio
+        in_data = audio_stream.read(audio_config.audio_chunk)
+        wave = numpy.fromstring(in_data, dtype=numpy.float32)
+        print('input', len(wave), flush=True)
+        queue_input_wave.put(wave)
+
+        # output
+        try:
+            wave = queue_output_wave.get_nowait()
+        except:
+            wave = None
+
+        if wave is not None:
+            print('output', len(wave), flush=True)
+            wave *= audio_config.out_norm
+            b = wave.astype(numpy.float32).tobytes()
+            audio_stream.write(b)
 
 
 if __name__ == '__main__':
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
index 2a42f88..ceddf9c 100644
--- a/tests/test_voice_changer.py
+++ b/tests/test_voice_changer.py
@@ -1,3 +1,7 @@
+import world4py
+world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
+
+
 from pathlib import Path
 from typing import NamedTuple
 
@@ -22,19 +26,19 @@ class AudioConfig(NamedTuple):
     out_norm: float
 
 
-model_base_path = Path('~/trained/')
+model_base_path = Path('~/Github/become-yukarin/trained/').expanduser()
 test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
-test_output_path = Path('tests/output.wav')
+test_output_path = Path('output.wav')
 
 print('model loading...', flush=True)
 
-model_path = model_base_path / Path('harvest-innoise03/predictor_1340000.npz')
+model_path = model_base_path / Path('harvest-innoise03/predictor_1390000.npz')
 config_path = model_base_path / Path('harvest-innoise03/config.json')
 config = create_config(config_path)
 acoustic_converter = AcousticConverter(config, model_path, gpu=0)
 print('model 1 loaded!', flush=True)
 
-model_path = model_base_path / Path('sr-noise3/predictor_165000.npz')
+model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
 config_path = model_base_path / Path('sr-noise3/config.json')
 sr_config = create_sr_config(config_path)
 super_resolution = SuperResolution(sr_config, model_path, gpu=0)
@@ -42,7 +46,7 @@ print('model 2 loaded!', flush=True)
 
 audio_config = AudioConfig(
     rate=config.dataset.param.voice_param.sample_rate,
-    chunk=config.dataset.param.voice_param.sample_rate // 4,
+    chunk=config.dataset.param.voice_param.sample_rate,
     vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
     out_norm=4.5,
 )
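For reference, the worker's new output buffering, which accumulates converted audio in `wave_fragment` and only emits fixed `audio_chunk`-sized blocks, boils down to the following standalone sketch. It is an illustrative rewrite, not code from the repository, and it uses a `while` loop where the script's worker uses a single `if` per iteration:

```python
import numpy


def emit_fixed_chunks(converted_chunks, audio_chunk):
    """Accumulate variable-length arrays and yield fixed blocks of audio_chunk samples."""
    fragment = numpy.empty(0, dtype=numpy.float32)
    for chunk in converted_chunks:
        fragment = numpy.concatenate([fragment, chunk])
        while len(fragment) >= audio_chunk:
            block, fragment = fragment[:audio_chunk], fragment[audio_chunk:]
            yield block


# example: three 700-sample chunks become two 1024-sample blocks (52 samples stay buffered)
chunks = [numpy.ones(700, dtype=numpy.float32) for _ in range(3)]
print([len(b) for b in emit_fixed_chunks(chunks, audio_chunk=1024)])
```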

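One of the smaller changes, in `become_yukarin/vocoder.py`, keeps the last 16 `(f0, sp, ap)` buffers referenced so the native WORLD synthesizer never reads memory that Python has already freed. The append-then-pop bookkeeping in the diff is equivalent to a bounded deque; a tiny sketch with dummy arrays standing in for the real feature buffers:

```python
from collections import deque

import numpy

# A deque with maxlen=16 drops the oldest entry automatically, matching the
# diff's "append, then pop(0) when len > 16" bookkeeping.
before_buffer = deque(maxlen=16)
for i in range(20):
    f0, sp, ap = (numpy.full(3, float(i)) for _ in range(3))
    before_buffer.append((f0, sp, ap))
print(len(before_buffer))  # 16: only the most recent buffers stay referenced
```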