| author | Hiroshiba <Hiroshiba@users.noreply.github.com> | 2018-03-09 03:00:11 +0900 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2018-03-09 03:00:11 +0900 |
| commit | ef2be3c389412e69d33d7da0066a4831500689d2 | |
| tree | c2a1973c7f720c5b8af2cad3f8844cdf4903825b /scripts | |
| parent | 7b268e80d1c27be0db48854eb6cc918f8b61635d | |
| parent | f279994afdba8e08fc5e042a25f50db548ddbae3 | |
Merge pull request #2 from Hiroshiba/harvest-realtime
Harvest realtime
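
This merge restructures the realtime voice changer from a single conversion worker into a three-stage pipeline: an encoder that extracts acoustic features from incoming audio, a converter that maps them toward the target voice, and a decoder that synthesizes audio with the vocoder, each in its own process and linked by multiprocessing queues. Below is a minimal, self-contained sketch of that pipeline shape; the worker bodies are pass-through stand-ins, not become_yukarin's actual feature extraction, conversion, or synthesis code.

```python
# Sketch of the three-stage pipeline introduced here: encode -> convert ->
# decode, each stage a separate process, chained with Queues. The queue names
# mirror the script; the stage bodies just forward data.
from multiprocessing import Process, Queue


def encode_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        wave = queue_input.get()      # raw audio chunk from the input stream
        queue_output.put(wave)        # stand-in for acoustic feature extraction


def convert_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        feature = queue_input.get()   # input-side acoustic features
        queue_output.put(feature)     # stand-in for the learned voice conversion


def decode_worker(queue_input: Queue, queue_output: Queue) -> None:
    while True:
        feature = queue_input.get()   # converted acoustic features
        queue_output.put(feature)     # stand-in for vocoder synthesis


if __name__ == '__main__':
    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    workers = [
        Process(target=encode_worker, args=(queue_input_wave, queue_input_feature), daemon=True),
        Process(target=convert_worker, args=(queue_input_feature, queue_output_feature), daemon=True),
        Process(target=decode_worker, args=(queue_output_feature, queue_output_wave), daemon=True),
    ]
    for worker in workers:
        worker.start()

    queue_input_wave.put([0.0] * 1024)   # feed one dummy chunk through
    print(len(queue_output_wave.get()))  # -> 1024
```

Splitting the stages lets feature extraction, conversion, and synthesis overlap in time instead of running serially in one worker, and the queue depths expose where the pipeline backs up, which is what the new qsize() debug prints in main() report.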
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/realtime_voice_changer.py | 153 |
|---|---|---|

1 file changed, 112 insertions, 41 deletions
```diff
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index a5d1a21..e96ce4e 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -14,92 +14,128 @@
 import numpy
 import pyaudio
 from become_yukarin import AcousticConverter
+from become_yukarin import Vocoder
 from become_yukarin import RealtimeVocoder
 from become_yukarin import SuperResolution
 from become_yukarin import VoiceChanger
+from become_yukarin.config.config import Config
 from become_yukarin.config.config import create_from_json as create_config
 from become_yukarin.config.sr_config import create_from_json as create_sr_config
 from become_yukarin.data_struct import Wave
+from become_yukarin.data_struct import AcousticFeature
 from become_yukarin.voice_changer import VoiceChangerStream
 from become_yukarin.voice_changer import VoiceChangerStreamWrapper
 
 
 class AudioConfig(NamedTuple):
     rate: int
+    frame_period: float
     audio_chunk: int
     convert_chunk: int
     vocoder_buffer_size: int
     out_norm: float
 
 
-def convert_worker(
-        config,
-        acoustic_converter,
-        super_resolution,
+def encode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
         audio_config: AudioConfig,
-        queue_input_wave,
-        queue_output_wave,
+        queue_input: Queue,
+        queue_output: Queue,
 ):
-    vocoder = RealtimeVocoder(
+    wrapper.voice_changer_stream.vocoder = Vocoder(
         acoustic_feature_param=config.dataset.param.acoustic_feature_param,
         out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
     )
-    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
 
-    voice_changer = VoiceChanger(
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+
+    while True:
+        wave = queue_input.get()
+
+        w = Wave(wave=wave, sampling_rate=audio_config.rate)
+        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        start_time += time_length
+
+        feature = wrapper.pre_convert_next(time_length=time_length)
+        queue_output.put(feature)
+
+
+def convert_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        acoustic_converter: AcousticConverter,
+        super_resolution: SuperResolution,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
         super_resolution=super_resolution,
         acoustic_converter=acoustic_converter,
-        vocoder=vocoder,
     )
 
-    voice_changer_stream = VoiceChangerStream(
-        voice_changer=voice_changer,
-        sampling_rate=audio_config.rate,
-        in_dtype=numpy.float32,
-    )
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+    while True:
+        in_feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_in_feature(
+            start_time=start_time,
+            feature=in_feature,
+            frame_period=audio_config.frame_period,
+        )
+        start_time += time_length
 
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time=0.1,
+        out_feature = wrapper.convert_next(time_length=time_length)
+        queue_output.put(out_feature)
+
+
+def decode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
+        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+        out_sampling_rate=audio_config.rate,
+        buffer_size=audio_config.vocoder_buffer_size,
+        number_of_pointers=16,
     )
+    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
 
     start_time = 0
-    wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
-    wave = Wave(wave=wave, sampling_rate=audio_config.rate)
-    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
-    start_time += len(wave.wave) / wave.sampling_rate
-    wave = wrapper.convert_next(time_length=1)
-
     time_length = audio_config.convert_chunk / audio_config.rate
     wave_fragment = numpy.empty(0)
     while True:
-        wave = queue_input_wave.get()
-        w = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_out_feature(
+            start_time=start_time,
+            feature=feature,
+            frame_period=audio_config.frame_period,
+        )
         start_time += time_length
 
-        b = time.time()
-        wave = wrapper.convert_next(time_length=time_length).wave
-        print('time', time.time()-b, flush=True)
-        wrapper.remove_previous_wave()
-        print('converted wave', len(wave), flush=True)
+        wave = wrapper.post_convert_next(time_length=time_length).wave
         wave_fragment = numpy.concatenate([wave_fragment, wave])
         if len(wave_fragment) >= audio_config.audio_chunk:
             wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
-            queue_output_wave.put(wave)
+            queue_output.put(wave)
 
 
 def main():
     print('model loading...', flush=True)
 
     queue_input_wave = Queue()
+    queue_input_feature = Queue()
+    queue_output_feature = Queue()
     queue_output_wave = Queue()
 
-    model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
-    config_path = Path('./trained/harvest-innoise03/config.json')
+    model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+    config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
     config = create_config(config_path)
     acoustic_converter = AcousticConverter(config, model_path, gpu=0)
     print('model 1 loaded!', flush=True)
@@ -113,23 +149,53 @@ def main():
     audio_instance = pyaudio.PyAudio()
     audio_config = AudioConfig(
         rate=config.dataset.param.voice_param.sample_rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
         audio_chunk=config.dataset.param.voice_param.sample_rate,
         convert_chunk=config.dataset.param.voice_param.sample_rate,
         vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
         out_norm=2.5,
     )
 
-    process_converter = Process(target=convert_worker, kwargs=dict(
+    voice_changer_stream = VoiceChangerStream(
+        sampling_rate=audio_config.rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+        in_dtype=numpy.float32,
+    )
+
+    wrapper = VoiceChangerStreamWrapper(
+        voice_changer_stream=voice_changer_stream,
+        extra_time_pre=0.2,
+        extra_time=0.1,
+    )
+
+    process_encoder = Process(target=encode_worker, kwargs=dict(
         config=config,
+        wrapper=wrapper,
         audio_config=audio_config,
+        queue_input=queue_input_wave,
+        queue_output=queue_input_feature,
+    ))
+    process_encoder.start()
+
+    process_converter = Process(target=convert_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
         acoustic_converter=acoustic_converter,
         super_resolution=super_resolution,
-        queue_input_wave=queue_input_wave,
-        queue_output_wave=queue_output_wave,
+        audio_config=audio_config,
+        queue_input=queue_input_feature,
+        queue_output=queue_output_feature,
     ))
     process_converter.start()
 
-    signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
+    process_decoder = Process(target=decode_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
+        audio_config=audio_config,
+        queue_input=queue_output_feature,
+        queue_output=queue_output_wave,
+    ))
+    process_decoder.start()
 
     audio_stream = audio_instance.open(
         format=pyaudio.paFloat32,
@@ -149,6 +215,11 @@ def main():
         print('input', len(wave), flush=True)
         queue_input_wave.put(wave)
 
+        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
+        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
+        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
+        print('queue_output_wave', queue_output_wave.qsize(), flush=True)
+
         # output
         try:
             wave = queue_output_wave.get_nowait()
```
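
Note how decode_worker emits audio in fixed-size blocks: whatever length of wave post_convert_next returns is appended to wave_fragment, and a block is popped only once at least audio_chunk samples have accumulated. Here is that rebuffering logic in isolation; ChunkBuffer is an illustrative name, not part of the codebase.

```python
# Standalone version of decode_worker's rebuffering: accumulate
# variable-length converted fragments, emit fixed-size chunks.
import numpy


class ChunkBuffer:
    def __init__(self, chunk_size: int) -> None:
        self.chunk_size = chunk_size
        self.fragment = numpy.empty(0)

    def push(self, wave):
        # Same pattern as the script: concatenate, then split off one chunk
        # once enough samples have accumulated.
        self.fragment = numpy.concatenate([self.fragment, wave])
        if len(self.fragment) >= self.chunk_size:
            chunk, self.fragment = (
                self.fragment[:self.chunk_size],
                self.fragment[self.chunk_size:],
            )
            return chunk
        return None


buf = ChunkBuffer(chunk_size=4)
print(buf.push(numpy.arange(3.0)))  # None: only 3 samples buffered so far
print(buf.push(numpy.arange(3.0)))  # [0. 1. 2. 0.]: first full chunk emitted
```

Rebuffering this way keeps every write to the output queue a predictable audio_chunk samples long, regardless of how many samples each conversion step happens to produce.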
