summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- become_yukarin/voice_changer.py 272
-rw-r--r-- scripts/realtime_voice_changer.py 153
-rw-r--r-- tests/test_voice_changer.py 38
3 files changed, 368 insertions, 95 deletions
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 5e0eac0..3b75fb1 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -15,7 +15,6 @@ class VoiceChanger(object):
self,
acoustic_converter: AcousticConverter,
super_resolution: SuperResolution,
- vocoder: Vocoder,
output_sampling_rate: int = None,
) -> None:
if output_sampling_rate is None:
@@ -23,18 +22,17 @@ class VoiceChanger(object):
self.acoustic_converter = acoustic_converter
self.super_resolution = super_resolution
- self.vocoder = vocoder
self.output_sampling_rate = output_sampling_rate
- def convert_from_wave_path(self, wave_path: str):
- w_in = self.acoustic_converter._wave_process(wave_path)
- return self.convert_from_wave(w_in)
-
- def convert_from_wave(self, wave: Wave):
- f_in = self.acoustic_converter._feature_process(wave)
- f_high = self.convert_from_acoustic_feature(f_in)
- wave = self.vocoder.decode(f_high)
- return wave
+ # def convert_from_wave_path(self, wave_path: str):
+ # w_in = self.acoustic_converter._wave_process(wave_path)
+ # return self.convert_from_wave(w_in)
+ #
+ # def convert_from_wave(self, wave: Wave):
+ # f_in = self.acoustic_converter._feature_process(wave)
+ # f_high = self.convert_from_acoustic_feature(f_in)
+ # wave = self.vocoder.decode(f_high)
+ # return wave
def convert_from_acoustic_feature(self, f_in: AcousticFeature):
f_low = self.acoustic_converter.convert_to_feature(f_in)
@@ -43,6 +41,20 @@ class VoiceChanger(object):
return f_high
+class FeatureSegment(NamedTuple):
+ """A chunk of acoustic features placed on the stream timeline."""
+ # Position of the first frame on the stream timeline.
+ # NOTE(review): presumably seconds, given the "/ 1000" in time_length - confirm.
+ start_time: float
+ # Features of this chunk; time_length uses len(feature.f0) as the frame count.
+ feature: AcousticFeature
+ # Duration of one frame. NOTE(review): presumably milliseconds, given the "/ 1000" below - confirm.
+ frame_period: float
+
+ @property
+ def time_length(self):
+ """Return the span of the chunk: number of f0 frames times frame_period, divided by 1000."""
+ return len(self.feature.f0) * self.frame_period / 1000
+
+ @property
+ def end_time(self):
+ """Return start_time plus time_length."""
+ return self.time_length + self.start_time
+
+
class Segment(NamedTuple):
start_time: float
wave: Wave
@@ -59,18 +71,19 @@ class Segment(NamedTuple):
class VoiceChangerStream(object):
def __init__(
self,
- voice_changer: VoiceChanger,
sampling_rate: int,
+ frame_period: float,
in_dtype=numpy.float32,
):
+ """
+ Set up empty buffers for the wave, input-feature and output-feature streams.
+
+ :param sampling_rate: stored as-is; used when building Wave objects and sizing zero padding.
+ :param frame_period: stored as-is; add_in_feature / add_out_feature assert that callers pass the same value.
+ :param in_dtype: dtype that add_in_feature asserts on feature.f0; also used for zero padding.
+ """
- self.voice_changer = voice_changer
self.sampling_rate = sampling_rate
+ self.frame_period = frame_period
self.in_dtype = in_dtype
- self._data_stream = [] # type: List[Segment]
- @property
- def vocoder(self):
- return self.voice_changer.vocoder
+ # No longer constructor arguments: both start as None and must be assigned
+ # by the caller before the methods that use them are called.
+ self.voice_changer: VoiceChanger = None
+ self.vocoder: Vocoder = None
+ # One buffer per pipeline stage: raw waves, features before conversion, features after conversion.
+ self._data_stream = [] # type: List[Segment]
+ self._in_feature_stream = [] # type: List[FeatureSegment]
+ self._out_feature_stream = [] # type: List[FeatureSegment]
def add_wave(self, start_time: float, wave: Wave):
# validation
@@ -80,10 +93,30 @@ class VoiceChangerStream(object):
segment = Segment(start_time=start_time, wave=wave)
self._data_stream.append(segment)
- def remove_wave(self, end_time: float):
+ def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+ """
+ Append a feature chunk to the buffer that convert() reads from.
+
+ :param start_time: position of the chunk on the stream timeline.
+ :param feature: chunk to store; the dtype of its f0 must equal in_dtype.
+ :param frame_period: must equal the frame_period of the stream.
+ :raises AssertionError: when one of the checks below fails (asserts are skipped under python -O).
+ """
+ # validation
+ assert frame_period == self.frame_period
+ assert feature.f0.dtype == self.in_dtype
+
+ # The stream's own frame_period is stored; it equals the argument after the assert above.
+ segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+ self._in_feature_stream.append(segment)
+
+ def add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+ """
+ Append a converted feature chunk to the buffer that post_convert() reads from.
+
+ Unlike add_in_feature, only frame_period is checked; the dtype of the feature is not.
+
+ :param start_time: position of the chunk on the stream timeline.
+ :param feature: chunk to store.
+ :param frame_period: must equal the frame_period of the stream.
+ :raises AssertionError: when frame_period differs (asserts are skipped under python -O).
+ """
+ # validation
+ assert frame_period == self.frame_period
+
+ segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+ self._out_feature_stream.append(segment)
+
+ def remove(self, end_time: float):
+ """
+ Drop buffered segments that end at or before end_time, in all three buffers.
+
+ :param end_time: cut-off on the stream timeline; only segments whose end_time is greater are kept.
+ """
self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
+ self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream))
+ self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream))
+
+ def pre_convert(self, start_time: float, time_length: float, extra_time: float):
+ start_time -= extra_time
+ time_length += extra_time * 2
- def convert_to_feature(self, start_time: float, time_length: float):
end_time = start_time + time_length
buffer_list = []
stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
@@ -123,38 +156,161 @@ class VoiceChangerStream(object):
buffer = numpy.concatenate(buffer_list)
in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
in_feature = self.vocoder.encode(in_wave)
- out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
- return out_feature
- def convert(self, start_time: float, time_length: float):
- feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
- out_wave = self.vocoder.decode(
- acoustic_feature=feature,
+ pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000))
+ in_feature = AcousticFeature(
+ f0=in_feature.f0[pad:-pad],
+ spectrogram=in_feature.spectrogram[pad:-pad],
+ aperiodicity=in_feature.aperiodicity[pad:-pad],
+ mfcc=in_feature.mfcc[pad:-pad],
+ voiced=in_feature.voiced[pad:-pad],
)
- return out_wave
-
- def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
- """
- :param extra_time: 音声変換時に余分に使うデータの時間長。ゼロパディングを防ぐ。
- """
- frame_period = self.vocoder.acoustic_feature_param.frame_period
+ return in_feature
+ def convert(self, start_time: float, time_length: float, extra_time: float):
+ """
+ Convert the buffered input features of one time window.
+
+ The window is widened by extra_time on both sides before conversion and the
+ widened part is cut off again from the result. Stretches of the window that
+ no buffered segment covers are filled with zeros.
+
+ :param start_time: start of the requested window on the stream timeline.
+ :param time_length: length of the requested window.
+ :param extra_time: margin added before and after the window for the conversion only; may be 0.
+ :return: the converted AcousticFeature without the margin frames.
+ """
start_time -= extra_time
time_length += extra_time * 2
- extra_feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
+ order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
+
+ end_time = start_time + time_length
+ f0_buffer_list = []
+ mfcc_buffer_list = []
+ ap_buffer_list = []
+ voiced_buffer_list = []
+ # Keep only the segments that overlap (or touch) the widened window.
+ stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._in_feature_stream)
+
+ # start_time_buffer is the point up to which the window has been filled so far.
+ start_time_buffer = start_time
+ remaining_time = time_length
+ for segment in stream:
+ # padding
+ if segment.start_time > start_time_buffer:
+ pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period)
+ dims = AcousticFeature.get_sizes(self.sampling_rate, order)
+
+ f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+ mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype))
+ ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+ # Built-in bool: the numpy.bool alias was removed in NumPy 1.24.
+ voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=bool))
+
+ start_time_buffer = segment.start_time
+ if remaining_time > segment.end_time - start_time_buffer:
+ one_time_length = segment.end_time - start_time_buffer
+ else:
+ one_time_length = remaining_time
+
+ # Frame indices inside this segment that cover one_time_length from start_time_buffer on.
+ first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period)
+ last_index = int(first_index + one_time_length * 1000 / self.frame_period)
+
+ f0_buffer_list.append(segment.feature.f0[first_index:last_index])
+ mfcc_buffer_list.append(segment.feature.mfcc[first_index:last_index])
+ ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index])
+ voiced_buffer_list.append(segment.feature.voiced[first_index:last_index])
+
+ start_time_buffer += one_time_length
+ remaining_time -= one_time_length
+
+ if start_time_buffer >= end_time:
+ break
+ else:
+ # last padding
+ pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period)
+ dims = AcousticFeature.get_sizes(self.sampling_rate, order)
+
+ f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+ mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype))
+ ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+ voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=bool))
+
+ f0 = numpy.concatenate(f0_buffer_list)
+ mfcc = numpy.concatenate(mfcc_buffer_list)
+ aperiodicity = numpy.concatenate(ap_buffer_list)
+ voiced = numpy.concatenate(voiced_buffer_list)
+ # The spectrogram is not collected at this stage, so a NaN placeholder is passed.
+ in_feature = AcousticFeature(
+ f0=f0,
+ spectrogram=numpy.nan,
+ aperiodicity=aperiodicity,
+ mfcc=mfcc,
+ voiced=voiced,
+ )
+
+ out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
+
+ pad = int(extra_time * 1000 / self.frame_period)
+ # x[pad:-pad] is empty when pad == 0 (extra_time == 0), so a negative stop
+ # is only used when there really is a margin to cut off.
+ trim = slice(pad, -pad if pad > 0 else None)
+ out_feature = AcousticFeature(
+ f0=out_feature.f0[trim],
+ spectrogram=out_feature.spectrogram[trim],
+ aperiodicity=out_feature.aperiodicity[trim],
+ mfcc=out_feature.mfcc[trim],
+ voiced=out_feature.voiced[trim],
+ )
+ return out_feature
+
+ def post_convert(self, start_time: float, time_length: float):
+ """
+ Assemble the buffered output features of one time window and decode them.
+
+ Stretches of the window that no buffered segment covers are filled with zeros.
+
+ :param start_time: start of the window on the stream timeline.
+ :param time_length: length of the window.
+ :return: the result of self.vocoder.decode for the assembled features.
+ """
+ end_time = start_time + time_length
+ f0_buffer_list = []
+ sp_buffer_list = []
+ ap_buffer_list = []
+ voiced_buffer_list = []
+ # Keep only the segments that overlap (or touch) the window.
+ stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._out_feature_stream)
+
+ # start_time_buffer is the point up to which the window has been filled so far.
+ start_time_buffer = start_time
+ remaining_time = time_length
+ for segment in stream:
+ # padding
+ if segment.start_time > start_time_buffer:
+ pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period)
+ dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order)
+
+ f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+ sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype))
+ ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+ # Built-in bool: the numpy.bool alias was removed in NumPy 1.24.
+ voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=bool))
+
+ start_time_buffer = segment.start_time
+
+ if remaining_time > segment.end_time - start_time_buffer:
+ one_time_length = segment.end_time - start_time_buffer
+ else:
+ one_time_length = remaining_time
+
+ # Frame indices inside this segment that cover one_time_length from start_time_buffer on.
+ first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period)
+ last_index = int(first_index + one_time_length * 1000 / self.frame_period)
+
+ f0_buffer_list.append(segment.feature.f0[first_index:last_index])
+ sp_buffer_list.append(segment.feature.spectrogram[first_index:last_index])
+ ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index])
+ voiced_buffer_list.append(segment.feature.voiced[first_index:last_index])
+
+ start_time_buffer += one_time_length
+ remaining_time -= one_time_length
+
+ if start_time_buffer >= end_time:
+ break
+ else:
+ # last padding
+ pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period)
+ dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order)
+
+ f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+ sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype))
+ ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+ # bool, same as the in-loop voiced padding; this line used in_dtype before,
+ # which mixed dtypes in the concatenate below.
+ voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=bool))
- pad = int(extra_time / (frame_period / 1000))
- feature = AcousticFeature(
- f0=extra_feature.f0[pad:-pad],
- spectrogram=extra_feature.spectrogram[pad:-pad],
- aperiodicity=extra_feature.aperiodicity[pad:-pad],
- mfcc=extra_feature.mfcc[pad:-pad],
- voiced=extra_feature.voiced[pad:-pad],
+ f0 = numpy.concatenate(f0_buffer_list)
+ spectrogram = numpy.concatenate(sp_buffer_list)
+ aperiodicity = numpy.concatenate(ap_buffer_list)
+ voiced = numpy.concatenate(voiced_buffer_list)
+ # mfcc is not collected at this stage, so a NaN placeholder is passed.
+ out_feature = AcousticFeature(
+ f0=f0,
+ spectrogram=spectrogram,
+ aperiodicity=aperiodicity,
+ mfcc=numpy.nan,
+ voiced=voiced,
)
out_wave = self.vocoder.decode(
- acoustic_feature=feature,
+ acoustic_feature=out_feature,
)
return out_wave
@@ -163,20 +319,46 @@ class VoiceChangerStreamWrapper(object):
def __init__(
self,
voice_changer_stream: VoiceChangerStream,
- extra_time: float = 0.0
+ extra_time_pre: float = 0.0,
+ extra_time: float = 0.0,
):
+ """
+ :param voice_changer_stream: stream whose pre_convert / convert / post_convert this wrapper drives.
+ :param extra_time_pre: value passed as extra_time to pre_convert.
+ :param extra_time: value passed as extra_time to convert.
+ """
self.voice_changer_stream = voice_changer_stream
+ self.extra_time_pre = extra_time_pre
self.extra_time = extra_time
+ # One read cursor per stage; each *_next method advances its own cursor by time_length.
+ self._current_time_pre = 0
self._current_time = 0
+ self._current_time_post = 0
+
+ def pre_convert_next(self, time_length: float):
+ """
+ Run pre_convert on the next time_length of the stream and advance the pre-convert cursor.
+
+ :param time_length: length of the window to process.
+ :return: the feature returned by VoiceChangerStream.pre_convert.
+ """
+ in_feature = self.voice_changer_stream.pre_convert(
+ start_time=self._current_time_pre,
+ time_length=time_length,
+ extra_time=self.extra_time_pre,
+ )
+ self._current_time_pre += time_length
+ return in_feature
def convert_next(self, time_length: float):
+ """
+ Run convert on the next time_length of the stream and advance the convert cursor.
+
+ :param time_length: length of the window to process.
+ :return: the feature returned by VoiceChangerStream.convert (no longer a wave).
+ """
- out_wave = self.voice_changer_stream.convert_with_extra_time(
+ out_feature = self.voice_changer_stream.convert(
start_time=self._current_time,
time_length=time_length,
extra_time=self.extra_time,
)
self._current_time += time_length
+ return out_feature
+
+ def post_convert_next(self, time_length: float):
+ """
+ Run post_convert on the next time_length of the stream and advance the post-convert cursor.
+
+ :param time_length: length of the window to process.
+ :return: the wave returned by VoiceChangerStream.post_convert.
+ """
+ out_wave = self.voice_changer_stream.post_convert(
+ start_time=self._current_time_post,
+ time_length=time_length,
+ )
+ self._current_time_post += time_length
return out_wave
- def remove_previous_wave(self):
- self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time)
+ def remove_previous(self):
+ """
+ Drop buffered segments that lie before the earliest point any stage may still read.
+
+ That point is the minimum over the three cursors, with the pre-convert and convert
+ cursors moved back by their extra time; the post-convert cursor is used as-is.
+ """
+ end_time = min(
+ self._current_time_pre - self.extra_time_pre,
+ self._current_time - self.extra_time,
+ self._current_time_post,
+ )
+ self.voice_changer_stream.remove(end_time=end_time)
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index a5d1a21..e96ce4e 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -14,92 +14,128 @@ import numpy
import pyaudio
from become_yukarin import AcousticConverter
+from become_yukarin import Vocoder
from become_yukarin import RealtimeVocoder
from become_yukarin import SuperResolution
from become_yukarin import VoiceChanger
+from become_yukarin.config.config import Config
from become_yukarin.config.config import create_from_json as create_config
from become_yukarin.config.sr_config import create_from_json as create_sr_config
from become_yukarin.data_struct import Wave
+from become_yukarin.data_struct import AcousticFeature
from become_yukarin.voice_changer import VoiceChangerStream
from become_yukarin.voice_changer import VoiceChangerStreamWrapper
class AudioConfig(NamedTuple):
rate: int
+ frame_period: float
audio_chunk: int
convert_chunk: int
vocoder_buffer_size: int
out_norm: float
-def convert_worker(
- config,
- acoustic_converter,
- super_resolution,
+def encode_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
audio_config: AudioConfig,
- queue_input_wave,
- queue_output_wave,
+ queue_input: Queue,
+ queue_output: Queue,
):
+ """
+ Worker loop: turn raw wave chunks from queue_input into features on queue_output.
+
+ Installs a Vocoder on the wrapper's stream, then for every chunk buffers the wave,
+ runs pre_convert_next and forwards the resulting feature. Never returns.
+
+ :param config: supplies acoustic_feature_param for the Vocoder.
+ :param wrapper: stream wrapper used by this worker.
+ :param audio_config: supplies rate and convert_chunk.
+ :param queue_input: queue of raw wave arrays.
+ :param queue_output: queue that receives one feature per input chunk.
+ """
- vocoder = RealtimeVocoder(
+ wrapper.voice_changer_stream.vocoder = Vocoder(
acoustic_feature_param=config.dataset.param.acoustic_feature_param,
out_sampling_rate=audio_config.rate,
- buffer_size=audio_config.vocoder_buffer_size,
- number_of_pointers=16,
)
- # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
- voice_changer = VoiceChanger(
+ start_time = 0
+ # The timeline advances by this fixed amount per chunk, whatever the chunk's actual length.
+ time_length = audio_config.convert_chunk / audio_config.rate
+
+ while True:
+ wave = queue_input.get()
+
+ w = Wave(wave=wave, sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+ start_time += time_length
+
+ feature = wrapper.pre_convert_next(time_length=time_length)
+ queue_output.put(feature)
+
+ # Still inside the loop: trim waves the next pre_convert can no longer reach, otherwise
+ # the buffer grows with every chunk. remove_previous() cannot do this here, because it
+ # takes the minimum over all three cursors and this worker only advances the pre-convert one.
+ wrapper.voice_changer_stream.remove(end_time=start_time - wrapper.extra_time_pre)
+
+
+def convert_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
+ acoustic_converter: AcousticConverter,
+ super_resolution: SuperResolution,
+ audio_config: AudioConfig,
+ queue_input: Queue,
+ queue_output: Queue,
+):
+ """
+ Worker loop: convert input features from queue_input into output features on queue_output.
+
+ Installs a VoiceChanger on the wrapper's stream, then for every feature buffers it,
+ runs convert_next and forwards the result. Never returns.
+
+ :param config: not used in this function.
+ :param wrapper: stream wrapper used by this worker.
+ :param acoustic_converter: passed to VoiceChanger.
+ :param super_resolution: passed to VoiceChanger.
+ :param audio_config: supplies rate, convert_chunk and frame_period.
+ :param queue_input: queue of AcousticFeature objects to convert.
+ :param queue_output: queue that receives one converted feature per input feature.
+ """
+ wrapper.voice_changer_stream.voice_changer = VoiceChanger(
super_resolution=super_resolution,
acoustic_converter=acoustic_converter,
- vocoder=vocoder,
)
- voice_changer_stream = VoiceChangerStream(
- voice_changer=voice_changer,
- sampling_rate=audio_config.rate,
- in_dtype=numpy.float32,
- )
+ start_time = 0
+ # The timeline advances by this fixed amount per feature chunk.
+ time_length = audio_config.convert_chunk / audio_config.rate
+ while True:
+ in_feature: AcousticFeature = queue_input.get()
+ wrapper.voice_changer_stream.add_in_feature(
+ start_time=start_time,
+ feature=in_feature,
+ frame_period=audio_config.frame_period,
+ )
+ start_time += time_length
- wrapper = VoiceChangerStreamWrapper(
- voice_changer_stream=voice_changer_stream,
- extra_time=0.1,
+ out_feature = wrapper.convert_next(time_length=time_length)
+ queue_output.put(out_feature)
+
+ # Still inside the loop: trim features the next convert can no longer reach, otherwise
+ # the buffer grows with every chunk. remove_previous() cannot do this here, because it
+ # takes the minimum over all three cursors and this worker only advances the convert one.
+ wrapper.voice_changer_stream.remove(end_time=start_time - wrapper.extra_time)
+
+
+def decode_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
+ audio_config: AudioConfig,
+ queue_input: Queue,
+ queue_output: Queue,
+):
+ """
+ Worker loop: decode converted features from queue_input into wave chunks on queue_output.
+
+ Installs a RealtimeVocoder on the wrapper's stream, then for every feature buffers it,
+ runs post_convert_next and collects the samples; at most one chunk of audio_chunk
+ samples is emitted per iteration, the rest is kept for the next one. Never returns.
+
+ :param config: supplies acoustic_feature_param for the RealtimeVocoder.
+ :param wrapper: stream wrapper used by this worker.
+ :param audio_config: supplies rate, convert_chunk, audio_chunk, frame_period and vocoder_buffer_size.
+ :param queue_input: queue of converted AcousticFeature objects.
+ :param queue_output: queue that receives wave arrays of audio_chunk samples.
+ """
+ wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
+ acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+ out_sampling_rate=audio_config.rate,
+ buffer_size=audio_config.vocoder_buffer_size,
+ number_of_pointers=16,
)
+ # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
start_time = 0
- wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
- wave = Wave(wave=wave, sampling_rate=audio_config.rate)
- wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
- start_time += len(wave.wave) / wave.sampling_rate
- wave = wrapper.convert_next(time_length=1)
-
time_length = audio_config.convert_chunk / audio_config.rate
wave_fragment = numpy.empty(0)
while True:
- wave = queue_input_wave.get()
- w = Wave(wave=wave, sampling_rate=audio_config.rate)
- wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+ feature: AcousticFeature = queue_input.get()
+ wrapper.voice_changer_stream.add_out_feature(
+ start_time=start_time,
+ feature=feature,
+ frame_period=audio_config.frame_period,
+ )
start_time += time_length
- b = time.time()
- wave = wrapper.convert_next(time_length=time_length).wave
- print('time', time.time()-b, flush=True)
- wrapper.remove_previous_wave()
- print('converted wave', len(wave), flush=True)
+ wave = wrapper.post_convert_next(time_length=time_length).wave
+ # Trim features the next post_convert can no longer reach, otherwise the buffer grows
+ # with every chunk. remove_previous() cannot do this here, because it takes the minimum
+ # over all three cursors and this worker only advances the post-convert one.
+ wrapper.voice_changer_stream.remove(end_time=start_time)
wave_fragment = numpy.concatenate([wave_fragment, wave])
if len(wave_fragment) >= audio_config.audio_chunk:
wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
- queue_output_wave.put(wave)
+ queue_output.put(wave)
def main():
print('model loading...', flush=True)
queue_input_wave = Queue()
+ queue_input_feature = Queue()
+ queue_output_feature = Queue()
queue_output_wave = Queue()
- model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
- config_path = Path('./trained/harvest-innoise03/config.json')
+ model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+ config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, gpu=0)
print('model 1 loaded!', flush=True)
@@ -113,23 +149,53 @@ def main():
audio_instance = pyaudio.PyAudio()
audio_config = AudioConfig(
rate=config.dataset.param.voice_param.sample_rate,
+ frame_period=config.dataset.param.acoustic_feature_param.frame_period,
audio_chunk=config.dataset.param.voice_param.sample_rate,
convert_chunk=config.dataset.param.voice_param.sample_rate,
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
out_norm=2.5,
)
- process_converter = Process(target=convert_worker, kwargs=dict(
+ voice_changer_stream = VoiceChangerStream(
+ sampling_rate=audio_config.rate,
+ frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+ in_dtype=numpy.float32,
+ )
+
+ wrapper = VoiceChangerStreamWrapper(
+ voice_changer_stream=voice_changer_stream,
+ extra_time_pre=0.2,
+ extra_time=0.1,
+ )
+
+ process_encoder = Process(target=encode_worker, kwargs=dict(
config=config,
+ wrapper=wrapper,
audio_config=audio_config,
+ queue_input=queue_input_wave,
+ queue_output=queue_input_feature,
+ ))
+ process_encoder.start()
+
+ process_converter = Process(target=convert_worker, kwargs=dict(
+ config=config,
+ wrapper=wrapper,
acoustic_converter=acoustic_converter,
super_resolution=super_resolution,
- queue_input_wave=queue_input_wave,
- queue_output_wave=queue_output_wave,
+ audio_config=audio_config,
+ queue_input=queue_input_feature,
+ queue_output=queue_output_feature,
))
process_converter.start()
- signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
+ process_decoder = Process(target=decode_worker, kwargs=dict(
+ config=config,
+ wrapper=wrapper,
+ audio_config=audio_config,
+ queue_input=queue_output_feature,
+ queue_output=queue_output_wave,
+ ))
+ process_decoder.start()
audio_stream = audio_instance.open(
format=pyaudio.paFloat32,
@@ -149,6 +215,11 @@ def main():
print('input', len(wave), flush=True)
queue_input_wave.put(wave)
+ print('queue_input_wave', queue_input_wave.qsize(), flush=True)
+ print('queue_input_feature', queue_input_feature.qsize(), flush=True)
+ print('queue_output_feature', queue_output_feature.qsize(), flush=True)
+ print('queue_output_wave', queue_output_wave.qsize(), flush=True)
+
# output
try:
wave = queue_output_wave.get_nowait()
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
index ceddf9c..66ea003 100644
--- a/tests/test_voice_changer.py
+++ b/tests/test_voice_changer.py
@@ -32,16 +32,16 @@ test_output_path = Path('output.wav')
print('model loading...', flush=True)
-model_path = model_base_path / Path('harvest-innoise03/predictor_1390000.npz')
-config_path = model_base_path / Path('harvest-innoise03/config.json')
+model_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+config_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/config.json')
config = create_config(config_path)
-acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+acoustic_converter = AcousticConverter(config, model_path)
print('model 1 loaded!', flush=True)
model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
-super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)
audio_config = AudioConfig(
@@ -50,6 +50,7 @@ audio_config = AudioConfig(
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
out_norm=4.5,
)
+frame_period = config.dataset.param.acoustic_feature_param.frame_period
vocoder = RealtimeVocoder(
acoustic_feature_param=config.dataset.param.acoustic_feature_param,
@@ -57,22 +58,24 @@ vocoder = RealtimeVocoder(
buffer_size=audio_config.vocoder_buffer_size,
number_of_pointers=16,
)
-# vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
voice_changer = VoiceChanger(
super_resolution=super_resolution,
acoustic_converter=acoustic_converter,
- vocoder=vocoder,
)
voice_changer_stream = VoiceChangerStream(
- voice_changer=voice_changer,
sampling_rate=audio_config.rate,
+ frame_period=acoustic_converter._param.acoustic_feature_param.frame_period,
in_dtype=numpy.float32,
)
+voice_changer_stream.voice_changer = voice_changer
+voice_changer_stream.vocoder = vocoder
+
wrapper = VoiceChangerStreamWrapper(
voice_changer_stream=voice_changer_stream,
+ extra_time_pre=1,
extra_time=0.2,
)
@@ -85,9 +88,26 @@ for i in range(0, len(raw_wave), audio_config.chunk):
wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
start_time += len(wave_in.wave) / wave_in.sampling_rate
- wave_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
+ wrapper.voice_changer_stream.add_in_feature(start_time=start_time, feature=feature_in, frame_period=frame_period)
+ start_time += audio_config.chunk / audio_config.rate
+ print('pre', i, flush=True)
+
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+ wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
+ start_time += audio_config.chunk / audio_config.rate
+ print('cent', i, flush=True)
+
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.rate)
wave_out_list.append(wave_out)
- wrapper.remove_previous_wave()
+ start_time += audio_config.chunk / audio_config.rate
+ print('post', i, flush=True)
out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)