author    Hiroshiba <Hiroshiba@users.noreply.github.com>  2018-03-09 03:00:11 +0900
committer GitHub <noreply@github.com>  2018-03-09 03:00:11 +0900
commit    ef2be3c389412e69d33d7da0066a4831500689d2 (patch)
tree      c2a1973c7f720c5b8af2cad3f8844cdf4903825b /scripts/realtime_voice_changer.py
parent    7b268e80d1c27be0db48854eb6cc918f8b61635d (diff)
parent    f279994afdba8e08fc5e042a25f50db548ddbae3 (diff)
Merge pull request #2 from Hiroshiba/harvest-realtime
Harvest realtime
Diffstat (limited to 'scripts/realtime_voice_changer.py')
-rw-r--r--  scripts/realtime_voice_changer.py | 153
1 file changed, 112 insertions(+), 41 deletions(-)
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index a5d1a21..e96ce4e 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -14,92 +14,128 @@ import numpy
 import pyaudio
 
 from become_yukarin import AcousticConverter
+from become_yukarin import Vocoder
 from become_yukarin import RealtimeVocoder
 from become_yukarin import SuperResolution
 from become_yukarin import VoiceChanger
+from become_yukarin.config.config import Config
 from become_yukarin.config.config import create_from_json as create_config
 from become_yukarin.config.sr_config import create_from_json as create_sr_config
 from become_yukarin.data_struct import Wave
+from become_yukarin.data_struct import AcousticFeature
 from become_yukarin.voice_changer import VoiceChangerStream
 from become_yukarin.voice_changer import VoiceChangerStreamWrapper
 
 
 class AudioConfig(NamedTuple):
     rate: int
+    frame_period: float
     audio_chunk: int
     convert_chunk: int
     vocoder_buffer_size: int
     out_norm: float
 
 
-def convert_worker(
-        config,
-        acoustic_converter,
-        super_resolution,
+def encode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
         audio_config: AudioConfig,
-        queue_input_wave,
-        queue_output_wave,
+        queue_input: Queue,
+        queue_output: Queue,
 ):
-    vocoder = RealtimeVocoder(
+    wrapper.voice_changer_stream.vocoder = Vocoder(
         acoustic_feature_param=config.dataset.param.acoustic_feature_param,
         out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
     )
-    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
-    voice_changer = VoiceChanger(
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+
+    while True:
+        wave = queue_input.get()
+
+        w = Wave(wave=wave, sampling_rate=audio_config.rate)
+        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        start_time += time_length
+
+        feature = wrapper.pre_convert_next(time_length=time_length)
+        queue_output.put(feature)
+
+
+def convert_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        acoustic_converter: AcousticConverter,
+        super_resolution: SuperResolution,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
         super_resolution=super_resolution,
         acoustic_converter=acoustic_converter,
-        vocoder=vocoder,
     )
-    voice_changer_stream = VoiceChangerStream(
-        voice_changer=voice_changer,
-        sampling_rate=audio_config.rate,
-        in_dtype=numpy.float32,
-    )
+    start_time = 0
+    time_length = audio_config.convert_chunk / audio_config.rate
+    while True:
+        in_feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_in_feature(
+            start_time=start_time,
+            feature=in_feature,
+            frame_period=audio_config.frame_period,
+        )
+        start_time += time_length
 
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time=0.1,
+        out_feature = wrapper.convert_next(time_length=time_length)
+        queue_output.put(out_feature)
+
+
+def decode_worker(
+        config: Config,
+        wrapper: VoiceChangerStreamWrapper,
+        audio_config: AudioConfig,
+        queue_input: Queue,
+        queue_output: Queue,
+):
+    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
+        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+        out_sampling_rate=audio_config.rate,
+        buffer_size=audio_config.vocoder_buffer_size,
+        number_of_pointers=16,
     )
+    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
 
     start_time = 0
-    wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
-    wave = Wave(wave=wave, sampling_rate=audio_config.rate)
-    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
-    start_time += len(wave.wave) / wave.sampling_rate
-    wave = wrapper.convert_next(time_length=1)
-
     time_length = audio_config.convert_chunk / audio_config.rate
     wave_fragment = numpy.empty(0)
     while True:
-        wave = queue_input_wave.get()
-        w = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+        feature: AcousticFeature = queue_input.get()
+        wrapper.voice_changer_stream.add_out_feature(
+            start_time=start_time,
+            feature=feature,
+            frame_period=audio_config.frame_period,
+        )
         start_time += time_length
 
-        b = time.time()
-        wave = wrapper.convert_next(time_length=time_length).wave
-        print('time', time.time()-b, flush=True)
-        wrapper.remove_previous_wave()
-        print('converted wave', len(wave), flush=True)
+        wave = wrapper.post_convert_next(time_length=time_length).wave
         wave_fragment = numpy.concatenate([wave_fragment, wave])
         if len(wave_fragment) >= audio_config.audio_chunk:
             wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
-            queue_output_wave.put(wave)
+            queue_output.put(wave)
 
 
 def main():
     print('model loading...', flush=True)
 
     queue_input_wave = Queue()
+    queue_input_feature = Queue()
+    queue_output_feature = Queue()
     queue_output_wave = Queue()
 
-    model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
-    config_path = Path('./trained/harvest-innoise03/config.json')
+    model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+    config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
     config = create_config(config_path)
     acoustic_converter = AcousticConverter(config, model_path, gpu=0)
     print('model 1 loaded!', flush=True)
@@ -113,23 +149,53 @@ def main():
     audio_instance = pyaudio.PyAudio()
     audio_config = AudioConfig(
         rate=config.dataset.param.voice_param.sample_rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
        audio_chunk=config.dataset.param.voice_param.sample_rate,
         convert_chunk=config.dataset.param.voice_param.sample_rate,
         vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
         out_norm=2.5,
     )
 
-    process_converter = Process(target=convert_worker, kwargs=dict(
+    voice_changer_stream = VoiceChangerStream(
+        sampling_rate=audio_config.rate,
+        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+        in_dtype=numpy.float32,
+    )
+
+    wrapper = VoiceChangerStreamWrapper(
+        voice_changer_stream=voice_changer_stream,
+        extra_time_pre=0.2,
+        extra_time=0.1,
+    )
+
+    process_encoder = Process(target=encode_worker, kwargs=dict(
         config=config,
+        wrapper=wrapper,
         audio_config=audio_config,
+        queue_input=queue_input_wave,
+        queue_output=queue_input_feature,
+    ))
+    process_encoder.start()
+
+    process_converter = Process(target=convert_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
         acoustic_converter=acoustic_converter,
         super_resolution=super_resolution,
-        queue_input_wave=queue_input_wave,
-        queue_output_wave=queue_output_wave,
+        audio_config=audio_config,
+        queue_input=queue_input_feature,
+        queue_output=queue_output_feature,
     ))
     process_converter.start()
 
-    signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
+    process_decoder = Process(target=decode_worker, kwargs=dict(
+        config=config,
+        wrapper=wrapper,
+        audio_config=audio_config,
+        queue_input=queue_output_feature,
+        queue_output=queue_output_wave,
+    ))
+    process_decoder.start()
 
     audio_stream = audio_instance.open(
         format=pyaudio.paFloat32,
@@ -149,6 +215,11 @@ def main():
         print('input', len(wave), flush=True)
         queue_input_wave.put(wave)
 
+        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
+        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
+        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
+        print('queue_output_wave', queue_output_wave.qsize(), flush=True)
+
         # output
         try:
             wave = queue_output_wave.get_nowait()
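
The merge above replaces the single convert_worker with a three-stage pipeline: encode_worker (wave to input acoustic feature), convert_worker (input feature to target-voice feature), and decode_worker (target feature to wave), each running in its own process and chained head-to-tail by multiprocessing queues. Below is a minimal, self-contained sketch of that pattern; the queue names mirror the ones in the script, while stage_worker and passthrough are hypothetical placeholders standing in for the become_yukarin calls (pre_convert_next / convert_next / post_convert_next), not the library's API.

    from multiprocessing import Process, Queue


    def passthrough(item):
        # Placeholder transform; each real stage would run its
        # become_yukarin conversion step here instead.
        return item


    def stage_worker(queue_input: Queue, queue_output: Queue):
        # Pull an item, transform it, push it downstream, forever --
        # the same loop shape as the encode/convert/decode workers above.
        while True:
            queue_output.put(passthrough(queue_input.get()))


    def main():
        queue_input_wave = Queue()      # raw microphone chunks
        queue_input_feature = Queue()   # acoustic features of the input voice
        queue_output_feature = Queue()  # converted (target-voice) features
        queue_output_wave = Queue()     # synthesized output chunks

        # One process per stage, chained by the queues.
        links = [
            (queue_input_wave, queue_input_feature),      # encode
            (queue_input_feature, queue_output_feature),  # convert
            (queue_output_feature, queue_output_wave),    # decode
        ]
        for queue_in, queue_out in links:
            Process(target=stage_worker, args=(queue_in, queue_out), daemon=True).start()

        queue_input_wave.put('chunk')
        print(queue_output_wave.get())  # 'chunk', after passing through all three stages


    if __name__ == '__main__':
        main()

One detail from decode_worker worth noting: the vocoder emits fragments whose length need not match the audio device's block size, so the worker accumulates them in wave_fragment and only emits fixed audio_chunk-sized blocks. A sketch of just that re-blocking step (reblock is a hypothetical name; the logic mirrors the wave_fragment lines in the diff):

    import numpy


    def reblock(fragment: numpy.ndarray, new_wave: numpy.ndarray, chunk: int):
        # Append freshly synthesized samples; pop one fixed-size block
        # once enough have accumulated, otherwise keep buffering.
        fragment = numpy.concatenate([fragment, new_wave])
        if len(fragment) >= chunk:
            return fragment[:chunk], fragment[chunk:]
        return None, fragment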