author     Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2018-02-04 09:39:39 +0900
committer  Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2018-02-04 09:39:39 +0900
commit     29f0994ea34f3df7dd5eea0b330d429b2e492211 (patch)
tree       4c6827c3983ad7463f582c619db937e9f7e9f561 /scripts
parent     48addd22a87f248bb8041bca47e9c209a16175a4 (diff)
Real-time conversion is now possible
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/realtime_voice_changer.py | 176
1 file changed, 100 insertions(+), 76 deletions(-)
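
This commit replaces the callback-driven queue.Queue pipeline with a dedicated conversion process: audio capture and playback stay in the main process, while the WORLD vocoder and the two models run inside convert_worker, connected by a pair of multiprocessing queues. A minimal, simplified sketch of that layout (the placeholder body stands in for the real conversion pipeline shown in the diff below):

    from multiprocessing import Process, Queue

    def convert_worker(queue_input_wave, queue_output_wave):
        # Heavy model inference runs here, isolated from the audio I/O loop.
        while True:
            wave = queue_input_wave.get()      # block until the recorder sends a chunk
            converted = wave                   # placeholder for the actual conversion
            queue_output_wave.put(converted)   # hand the result back for playback

    def main():
        queue_input_wave = Queue()
        queue_output_wave = Queue()
        worker = Process(target=convert_worker,
                         args=(queue_input_wave, queue_output_wave))
        worker.start()
        # The main process then pushes microphone chunks into queue_input_wave
        # and drains queue_output_wave to the speakers (see the loop in the diff).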
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index 4c50963..a5d1a21 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -1,7 +1,14 @@
-import queue
+import world4py
+
+world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
+
from functools import partial
from pathlib import Path
+import signal
+import time
from typing import NamedTuple
+from multiprocessing import Queue
+from multiprocessing import Process
import numpy
import pyaudio
@@ -19,66 +26,85 @@ from become_yukarin.voice_changer import VoiceChangerStreamWrapper
class AudioConfig(NamedTuple):
rate: int
- chunk: int
+ audio_chunk: int
+ convert_chunk: int
vocoder_buffer_size: int
out_norm: float
-queue_input_wave = queue.Queue()
-queue_output_wave = queue.Queue()
-queue_output_fragment_wave = queue.Queue(maxsize=1)
+def convert_worker(
+ config,
+ acoustic_converter,
+ super_resolution,
+ audio_config: AudioConfig,
+ queue_input_wave,
+ queue_output_wave,
+):
+ vocoder = RealtimeVocoder(
+ acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+ out_sampling_rate=audio_config.rate,
+ buffer_size=audio_config.vocoder_buffer_size,
+ number_of_pointers=16,
+ )
+ # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+ voice_changer = VoiceChanger(
+ super_resolution=super_resolution,
+ acoustic_converter=acoustic_converter,
+ vocoder=vocoder,
+ )
+
+ voice_changer_stream = VoiceChangerStream(
+ voice_changer=voice_changer,
+ sampling_rate=audio_config.rate,
+ in_dtype=numpy.float32,
+ )
+
+ wrapper = VoiceChangerStreamWrapper(
+ voice_changer_stream=voice_changer_stream,
+ extra_time=0.1,
+ )
-def convert_worker(audio_config: AudioConfig, wrapper: VoiceChangerStreamWrapper):
start_time = 0
- time_length = audio_config.chunk / audio_config.rate
+ wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
+ wave = Wave(wave=wave, sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
+ start_time += len(wave.wave) / wave.sampling_rate
+ wave = wrapper.convert_next(time_length=1)
+
+ time_length = audio_config.convert_chunk / audio_config.rate
+ wave_fragment = numpy.empty(0)
while True:
wave = queue_input_wave.get()
- wave = Wave(wave=wave, sampling_rate=audio_config.rate)
- wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
- start_time += len(wave.wave) / wave.sampling_rate
+ w = Wave(wave=wave, sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+ start_time += time_length
- wave = wrapper.convert_next(time_length=time_length)
- queue_output_wave.put(wave.wave)
+ b = time.time()
+ wave = wrapper.convert_next(time_length=time_length).wave
+ print('time', time.time()-b, flush=True)
wrapper.remove_previous_wave()
+ print('converted wave', len(wave), flush=True)
-
-def input_callback(in_data, frame_count, time_info, status_flags, audio_config: AudioConfig):
- print('input', status_flags, flush=True)
- wave = numpy.fromstring(in_data, dtype=numpy.float32)
- queue_input_wave.put(wave)
- return None, pyaudio.paContinue
-
-
-def output_callback(_, frame_count, time_info, status_flags, audio_config: AudioConfig):
- print('output', status_flags, flush=True)
- try:
- wave = queue_output_fragment_wave.get_nowait()
- except:
- wave = numpy.empty(0)
-
- while len(wave) < audio_config.chunk:
- wave_next = queue_output_wave.get()
- wave = numpy.concatenate([wave, wave_next])
-
- wave, wave_fragment = wave[:audio_config.chunk], wave[audio_config.chunk:]
- queue_output_fragment_wave.put(wave_fragment)
-
- wave *= audio_config.out_norm
- b = wave.astype(numpy.float32).tobytes()
- return b, pyaudio.paContinue
+ wave_fragment = numpy.concatenate([wave_fragment, wave])
+ if len(wave_fragment) >= audio_config.audio_chunk:
+ wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
+ queue_output_wave.put(wave)
def main():
print('model loading...', flush=True)
- model_path = Path('./trained/mfcc8-preconvert-innoise03/predictor_350000.npz')
- config_path = Path('./trained/mfcc8-preconvert-innoise03/config.json')
+ queue_input_wave = Queue()
+ queue_output_wave = Queue()
+
+ model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
+ config_path = Path('./trained/harvest-innoise03/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, gpu=0)
print('model 1 loaded!', flush=True)
- model_path = Path('./trained/sr-noise3/predictor_70000.npz')
+ model_path = Path('./trained/sr-noise3/predictor_180000.npz')
config_path = Path('./trained/sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path, gpu=0)
@@ -87,55 +113,53 @@ def main():
audio_instance = pyaudio.PyAudio()
audio_config = AudioConfig(
rate=config.dataset.param.voice_param.sample_rate,
- chunk=config.dataset.param.voice_param.sample_rate,
+ audio_chunk=config.dataset.param.voice_param.sample_rate,
+ convert_chunk=config.dataset.param.voice_param.sample_rate,
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
- out_norm=4.5,
+ out_norm=2.5,
)
- vocoder = RealtimeVocoder(
- acoustic_feature_param=config.dataset.param.acoustic_feature_param,
- out_sampling_rate=audio_config.rate,
- buffer_size=audio_config.vocoder_buffer_size,
- number_of_pointers=16,
- )
- vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
-
- voice_changer = VoiceChanger(
- super_resolution=super_resolution,
+ process_converter = Process(target=convert_worker, kwargs=dict(
+ config=config,
+ audio_config=audio_config,
acoustic_converter=acoustic_converter,
- vocoder=vocoder,
- )
-
- voice_changer_stream = VoiceChangerStream(
- voice_changer=voice_changer,
- sampling_rate=audio_config.rate,
- in_dtype=numpy.float32,
- )
+ super_resolution=super_resolution,
+ queue_input_wave=queue_input_wave,
+ queue_output_wave=queue_output_wave,
+ ))
+ process_converter.start()
- wrapper = VoiceChangerStreamWrapper(
- voice_changer_stream=voice_changer_stream,
- extra_time=0.2,
- )
+ signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
- input_audio_stream = audio_instance.open(
+ audio_stream = audio_instance.open(
format=pyaudio.paFloat32,
channels=1,
rate=audio_config.rate,
- frames_per_buffer=audio_config.chunk,
+ frames_per_buffer=audio_config.audio_chunk,
input=True,
- stream_callback=partial(input_callback, audio_config=audio_config)
- )
-
- output_audio_stream = audio_instance.open(
- format=pyaudio.paFloat32,
- channels=1,
- rate=audio_config.rate,
- frames_per_buffer=audio_config.chunk,
output=True,
- stream_callback=partial(output_callback, audio_config=audio_config)
)
- convert_worker(audio_config, wrapper)
+ # process_converter.join()
+
+ while True:
+ # input audio
+ in_data = audio_stream.read(audio_config.audio_chunk)
+ wave = numpy.fromstring(in_data, dtype=numpy.float32)
+ print('input', len(wave), flush=True)
+ queue_input_wave.put(wave)
+
+ # output
+ try:
+ wave = queue_output_wave.get_nowait()
+ except:
+ wave = None
+
+ if wave is not None:
+ print('output', len(wave), flush=True)
+ wave *= audio_config.out_norm
+ b = wave.astype(numpy.float32).tobytes()
+ audio_stream.write(b)
if __name__ == '__main__':
    main()
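
Two details in the new main loop are worth flagging if you adapt this code today: numpy.fromstring on raw bytes has since been deprecated in favor of numpy.frombuffer, and the bare except around get_nowait swallows every error, not just an empty queue. A sketch of the same loop with those adjustments (names follow the diff; this is a suggested variant, not the committed code):

    import queue  # only for the queue.Empty exception type

    import numpy

    def io_loop(audio_stream, audio_config, queue_input_wave, queue_output_wave):
        while True:
            # Capture one chunk from the microphone and hand it to the worker.
            in_data = audio_stream.read(audio_config.audio_chunk)
            wave = numpy.frombuffer(in_data, dtype=numpy.float32)  # fromstring is deprecated
            queue_input_wave.put(wave)

            # Play back a converted chunk if the worker has finished one.
            try:
                out = queue_output_wave.get_nowait()
            except queue.Empty:  # narrower than a bare except
                continue
            out = out * audio_config.out_norm  # scale to output volume, not in place
            audio_stream.write(out.astype(numpy.float32).tobytes())

Note that the stream is opened with both input=True and output=True, and the loop alternates a blocking read with a blocking write, so the audio_chunk size (one second of samples at the model's sample rate) sets the floor on end-to-end latency.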