 become_yukarin/__init__.py         |   2
 become_yukarin/dataset/dataset.py  |  20
 become_yukarin/super_resolution.py |  15
 become_yukarin/vocoder.py          | 105
 become_yukarin/voice_changer.py    | 155
 scripts/realtime_voice_changer.py  | 142
 6 files changed, 432 insertions(+), 7 deletions(-)
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
index 7513f36..3c376cd 100644
--- a/become_yukarin/__init__.py
+++ b/become_yukarin/__init__.py
@@ -3,4 +3,6 @@ from . import dataset
 from . import param
 from .acoustic_converter import AcousticConverter
 from .super_resolution import SuperResolution
+from .vocoder import RealtimeVocoder
+from .vocoder import Vocoder
 from .voice_changer import VoiceChanger
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 1a1438a..178844a 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -87,7 +87,8 @@ class WaveFileLoadProcess(BaseDataProcess):
 
 
 class AcousticFeatureProcess(BaseDataProcess):
-    def __init__(self, frame_period, order, alpha, f0_estimating_method, f0_floor=71, f0_ceil=800, dtype=numpy.float32) -> None:
+    def __init__(self, frame_period, order, alpha, f0_estimating_method, f0_floor=71, f0_ceil=800,
+                 dtype=numpy.float32) -> None:
         self._frame_period = frame_period
         self._order = order
         self._alpha = alpha
@@ -101,10 +102,21 @@ class AcousticFeatureProcess(BaseDataProcess):
 
         fs = data.sampling_rate
         if self._f0_estimating_method == 'dio':
-            _f0, t = pyworld.dio(x, fs, frame_period=self._frame_period, f0_floor=self._f0_floor, f0_ceil=self._f0_ceil)
+            _f0, t = pyworld.dio(
+                x,
+                fs,
+                frame_period=self._frame_period,
+                f0_floor=self._f0_floor,
+                f0_ceil=self._f0_ceil,
+            )
         else:
-            _f0, t = pyworld.harvest(x, fs, frame_period=self._frame_period, f0_floor=self._f0_floor,
-                                     f0_ceil=self._f0_ceil)
+            _f0, t = pyworld.harvest(
+                x,
+                fs,
+                frame_period=self._frame_period,
+                f0_floor=self._f0_floor,
+                f0_ceil=self._f0_ceil,
+            )
         f0 = pyworld.stonemask(x, _f0, t, fs)
         spectrogram = pyworld.cheaptrick(x, f0, t, fs)
         aperiodicity = pyworld.d4c(x, f0, t, fs)
diff --git a/become_yukarin/super_resolution.py b/become_yukarin/super_resolution.py
index 7c53b9d..fc91488 100644
--- a/become_yukarin/super_resolution.py
+++ b/become_yukarin/super_resolution.py
@@ -60,6 +60,21 @@ class SuperResolution(object):
             out = out[:-pad]
         return out
 
+    def convert_to_feature(
+            self,
+            spectrogram: numpy.ndarray,
+            acoustic_feature: AcousticFeature,
+    ):
+        acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
+        f_out = AcousticFeature(
+            f0=acoustic_feature.f0,
+            spectrogram=spectrogram.astype(numpy.float64),
+            aperiodicity=acoustic_feature.aperiodicity,
+            mfcc=acoustic_feature.mfcc,
+            voiced=acoustic_feature.voiced,
+        )
+        return f_out
+
     def convert_to_audio(
             self,
             input: numpy.ndarray,
diff --git a/become_yukarin/vocoder.py b/become_yukarin/vocoder.py
new file mode 100644
index 0000000..a49e8f2
--- /dev/null
+++ b/become_yukarin/vocoder.py
@@ -0,0 +1,105 @@
+import numpy
+import pyworld
+from world4py.native import structures, apidefinitions, utils
+
+from become_yukarin.data_struct import AcousticFeature
+from become_yukarin.data_struct import Wave
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.param import AcousticFeatureParam
+
+
+class Vocoder(object):
+    def __init__(
+            self,
+            acoustic_feature_param: AcousticFeatureParam,
+            out_sampling_rate: int,
+    ):
+        self.acoustic_feature_param = acoustic_feature_param
+        self.out_sampling_rate = out_sampling_rate
+        self._encoder = AcousticFeatureProcess(
+            frame_period=acoustic_feature_param.frame_period,
+            order=acoustic_feature_param.order,
+            alpha=acoustic_feature_param.alpha,
+        )
+
+    def encode(self, wave: Wave):
+        return self._encoder(wave)
+
+    def decode(
+            self,
+            acoustic_feature: AcousticFeature,
+    ):
+        acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
+        out = pyworld.synthesize(
+            f0=acoustic_feature.f0.ravel(),
+            spectrogram=acoustic_feature.spectrogram,
+            aperiodicity=acoustic_feature.aperiodicity,
+            fs=self.out_sampling_rate,
+            frame_period=self.acoustic_feature_param.frame_period
+        )
+        return Wave(out, sampling_rate=self.out_sampling_rate)
+
+
+class RealtimeVocoder(Vocoder):
+    def __init__(
+            self,
+            acoustic_feature_param: AcousticFeatureParam,
+            out_sampling_rate: int,
+            buffer_size: int,
+            number_of_pointers: int,
+    ):
+        super().__init__(
+            acoustic_feature_param=acoustic_feature_param,
+            out_sampling_rate=out_sampling_rate,
+        )
+
+        self.buffer_size = buffer_size
+
+        self._synthesizer = structures.WorldSynthesizer()
+        apidefinitions._InitializeSynthesizer(
+            self.out_sampling_rate,  # sampling rate
+            self.acoustic_feature_param.frame_period,  # frame period
+            pyworld.get_cheaptrick_fft_size(out_sampling_rate),  # fft size
+            buffer_size,  # buffer size
+            number_of_pointers,  # number of pointers
+            self._synthesizer,
+        )
+        self._before_buffer = None  # for holding memory
+
+    def decode(
+            self,
+            acoustic_feature: AcousticFeature,
+    ):
+        length = len(acoustic_feature.f0)
+        f0_buffer = utils.cast_1d_list_to_1d_pointer(acoustic_feature.f0.flatten().tolist())
+        sp_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.spectrogram.tolist())
+        ap_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.aperiodicity.tolist())
+        apidefinitions._AddParameters(f0_buffer, length, sp_buffer, ap_buffer, self._synthesizer)
+
+        ys = []
+        while apidefinitions._Synthesis2(self._synthesizer) != 0:
+            y = numpy.array([self._synthesizer.buffer[i] for i in range(self.buffer_size)])
+            ys.append(y)
+
+        if len(ys) > 0:
+            out_wave = Wave(
+                wave=numpy.concatenate(ys),
+                sampling_rate=self.out_sampling_rate,
+            )
+        else:
+            out_wave = Wave(
+                wave=numpy.empty(0),
+                sampling_rate=self.out_sampling_rate,
+            )
+
+        self._before_buffer = (f0_buffer, sp_buffer, ap_buffer)  # for holding memory
+        return out_wave
+
+    def warm_up(self, time_length: float):
+        y = numpy.zeros(int(time_length * self.out_sampling_rate))
+        w = Wave(wave=y, sampling_rate=self.out_sampling_rate)
+        f = self.encode(w)
+        self.decode(f)
+
+    def __del__(self):
+        apidefinitions._DestroySynthesizer(self._synthesizer)
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 7269053..05f5a96 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,7 +1,13 @@
+from typing import List
+from typing import NamedTuple
+
 import numpy
 
 from .acoustic_converter import AcousticConverter
+from .data_struct import AcousticFeature
+from .data_struct import Wave
 from .super_resolution import SuperResolution
+from .vocoder import Vocoder
 
 
 class VoiceChanger(object):
@@ -9,6 +15,7 @@ class VoiceChanger(object):
         self,
         acoustic_converter: AcousticConverter,
         super_resolution: SuperResolution,
+        vocoder: Vocoder,
         output_sampling_rate: int = None,
     ) -> None:
         if output_sampling_rate is None:
@@ -16,12 +23,154 @@ class VoiceChanger(object):
 
         self.acoustic_converter = acoustic_converter
         self.super_resolution = super_resolution
+        self.vocoder = vocoder
         self.output_sampling_rate = output_sampling_rate
 
     def convert_from_wave_path(self, wave_path: str):
         w_in = self.acoustic_converter._wave_process(wave_path)
-        f_in = self.acoustic_converter._feature_process(w_in)
+        return self.convert_from_wave(w_in)
+
+    def convert_from_wave(self, wave: Wave):
+        f_in = self.acoustic_converter._feature_process(wave)
+        f_high = self.convert_from_acoustic_feature(f_in)
+        wave = self.vocoder.decode(f_high)
+        return wave
+
+    def convert_from_acoustic_feature(self, f_in: AcousticFeature):
         f_low = self.acoustic_converter.convert_to_feature(f_in)
         s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
-        wave = self.super_resolution(s_high, acoustic_feature=f_low, sampling_rate=self.output_sampling_rate)
-        return wave
+        f_high = self.super_resolution.convert_to_feature(s_high, f_low)
+        return f_high
+
+
+class Segment(NamedTuple):
+    start_time: float
+    wave: Wave
+
+    @property
+    def time_length(self):
+        return len(self.wave.wave) / self.wave.sampling_rate
+
+    @property
+    def end_time(self):
+        return self.time_length + self.start_time
+
+
+class VoiceChangerStream(object):
+    def __init__(
+            self,
+            voice_changer: VoiceChanger,
+            sampling_rate: int,
+            in_dtype=numpy.float32,
+    ):
+        self.voice_changer = voice_changer
+        self.sampling_rate = sampling_rate
+        self.in_dtype = in_dtype
+        self._data_stream = []  # type: List[Segment]
+
+    @property
+    def vocoder(self):
+        return self.voice_changer.vocoder
+
+    def add_wave(self, start_time: float, wave: Wave):
+        # validation
+        assert wave.sampling_rate == self.sampling_rate
+        assert wave.wave.dtype == self.in_dtype
+
+        segment = Segment(start_time=start_time, wave=wave)
+        self._data_stream.append(segment)
+
+    def remove_wave(self, end_time: float):
+        self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
+
+    def convert(self, start_time: float, time_length: float):
+        end_time = start_time + time_length
+        buffer_list = []
+        stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
+
+        start_time_buffer = start_time
+        remaining_time = time_length
+        for segment in stream:
+            # padding
+            if segment.start_time > start_time_buffer:
+                pad = numpy.zeros(
+                    shape=int((segment.start_time - start_time_buffer) * self.sampling_rate),
+                    dtype=self.in_dtype,
+                )
+                buffer_list.append(pad)
+                start_time_buffer = segment.start_time
+
+            if remaining_time > segment.end_time - start_time_buffer:
+                one_time_length = segment.end_time - start_time_buffer
+            else:
+                one_time_length = remaining_time
+
+            first_index = int((start_time_buffer - segment.start_time) * self.sampling_rate)
+            last_index = int(first_index + one_time_length * self.sampling_rate)
+            one_buffer = segment.wave.wave[first_index:last_index]
+            buffer_list.append(one_buffer)
+
+            start_time_buffer += one_time_length
+            remaining_time -= one_time_length
+
+            if start_time_buffer >= end_time:
+                break
+        else:
+            # last padding
+            pad = numpy.zeros(shape=int((end_time - start_time_buffer) * self.sampling_rate), dtype=self.in_dtype)
+            buffer_list.append(pad)
+
+        buffer = numpy.concatenate(buffer_list)
+        print('buffer', len(buffer), flush=True)
+        in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
+        in_feature = self.vocoder.encode(in_wave)
+        out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
+        return out_feature
+
+    def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
+        """
+        :param extra_time: Time length of extra data used during conversion, to avoid zero padding.
+        """
+        frame_period = self.vocoder.acoustic_feature_param.frame_period
+
+        start_time -= extra_time
+        time_length += extra_time * 2
+
+        extra_feature = self.convert(start_time=start_time, time_length=time_length)
+
+        pad = int(extra_time / (frame_period / 1000))
+        feature = AcousticFeature(
+            f0=extra_feature.f0[pad:-pad],
+            spectrogram=extra_feature.spectrogram[pad:-pad],
+            aperiodicity=extra_feature.aperiodicity[pad:-pad],
+            mfcc=extra_feature.mfcc[pad:-pad],
+            voiced=extra_feature.voiced[pad:-pad],
+        )
+
+        out_wave = self.vocoder.decode(
+            acoustic_feature=feature,
+        )
+        return out_wave
+
+
+class VoiceChangerStreamWrapper(object):
+    def __init__(
+            self,
+            voice_changer_stream: VoiceChangerStream,
+            extra_time: float = 0.0
+    ):
+        self.voice_changer_stream = voice_changer_stream
+        self.extra_time = extra_time
+        self._current_time = 0
+
+    def convert_next(self, time_length: float):
+        out_wave = self.voice_changer_stream.convert_with_extra_time(
+            start_time=self._current_time,
+            time_length=time_length,
+            extra_time=self.extra_time,
+        )
+        self._current_time += time_length
+        return out_wave
+
+    def remove_previous_wave(self):
+        self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time)
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
new file mode 100644
index 0000000..4c50963
--- /dev/null
+++ b/scripts/realtime_voice_changer.py
@@ -0,0 +1,142 @@
+import queue
+from functools import partial
+from pathlib import Path
+from typing import NamedTuple
+
+import numpy
+import pyaudio
+
+from become_yukarin import AcousticConverter
+from become_yukarin import RealtimeVocoder
+from become_yukarin import SuperResolution
+from become_yukarin import VoiceChanger
+from become_yukarin.config.config import create_from_json as create_config
+from become_yukarin.config.sr_config import create_from_json as create_sr_config
+from become_yukarin.data_struct import Wave
+from become_yukarin.voice_changer import VoiceChangerStream
+from become_yukarin.voice_changer import VoiceChangerStreamWrapper
+
+
+class AudioConfig(NamedTuple):
+    rate: int
+    chunk: int
+    vocoder_buffer_size: int
+    out_norm: float
+
+
+queue_input_wave = queue.Queue()
+queue_output_wave = queue.Queue()
+queue_output_fragment_wave = queue.Queue(maxsize=1)
+
+
+def convert_worker(audio_config: AudioConfig, wrapper: VoiceChangerStreamWrapper):
+    start_time = 0
+    time_length = audio_config.chunk / audio_config.rate
+    while True:
+        wave = queue_input_wave.get()
+        wave = Wave(wave=wave, sampling_rate=audio_config.rate)
+        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
+        start_time += len(wave.wave) / wave.sampling_rate
+
+        wave = wrapper.convert_next(time_length=time_length)
+        queue_output_wave.put(wave.wave)
+        wrapper.remove_previous_wave()
+
+
+def input_callback(in_data, frame_count, time_info, status_flags, audio_config: AudioConfig):
+    print('input', status_flags, flush=True)
+    wave = numpy.fromstring(in_data, dtype=numpy.float32)
+    queue_input_wave.put(wave)
+    return None, pyaudio.paContinue
+
+
+def output_callback(_, frame_count, time_info, status_flags, audio_config: AudioConfig):
+    print('output', status_flags, flush=True)
+    try:
+        wave = queue_output_fragment_wave.get_nowait()
+    except:
+        wave = numpy.empty(0)
+
+    while len(wave) < audio_config.chunk:
+        wave_next = queue_output_wave.get()
+        wave = numpy.concatenate([wave, wave_next])
+
+    wave, wave_fragment = wave[:audio_config.chunk], wave[audio_config.chunk:]
+    queue_output_fragment_wave.put(wave_fragment)
+
+    wave *= audio_config.out_norm
+    b = wave.astype(numpy.float32).tobytes()
+    return b, pyaudio.paContinue
+
+
+def main():
+    print('model loading...', flush=True)
+
+    model_path = Path('./trained/mfcc8-preconvert-innoise03/predictor_350000.npz')
+    config_path = Path('./trained/mfcc8-preconvert-innoise03/config.json')
+    config = create_config(config_path)
+    acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+    print('model 1 loaded!', flush=True)
+
+    model_path = Path('./trained/sr-noise3/predictor_70000.npz')
+    config_path = Path('./trained/sr-noise3/config.json')
+    sr_config = create_sr_config(config_path)
+    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+    print('model 2 loaded!', flush=True)
+
+    audio_instance = pyaudio.PyAudio()
+    audio_config = AudioConfig(
+        rate=config.dataset.param.voice_param.sample_rate,
+        chunk=config.dataset.param.voice_param.sample_rate,
+        vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
+        out_norm=4.5,
+    )
+
+    vocoder = RealtimeVocoder(
+        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+        out_sampling_rate=audio_config.rate,
+        buffer_size=audio_config.vocoder_buffer_size,
+        number_of_pointers=16,
+    )
+    vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+
+    voice_changer = VoiceChanger(
+        super_resolution=super_resolution,
+        acoustic_converter=acoustic_converter,
+        vocoder=vocoder,
+    )
+
+    voice_changer_stream = VoiceChangerStream(
+        voice_changer=voice_changer,
+        sampling_rate=audio_config.rate,
+        in_dtype=numpy.float32,
+    )
+
+    wrapper = VoiceChangerStreamWrapper(
+        voice_changer_stream=voice_changer_stream,
+        extra_time=0.2,
+    )
+
+    input_audio_stream = audio_instance.open(
+        format=pyaudio.paFloat32,
+        channels=1,
+        rate=audio_config.rate,
+        frames_per_buffer=audio_config.chunk,
+        input=True,
+        stream_callback=partial(input_callback, audio_config=audio_config)
+    )
+
+    output_audio_stream = audio_instance.open(
+        format=pyaudio.paFloat32,
+        channels=1,
+        rate=audio_config.rate,
+        frames_per_buffer=audio_config.chunk,
+        output=True,
+        stream_callback=partial(output_callback, audio_config=audio_config)
+    )
+
+    convert_worker(audio_config, wrapper)
+
+
+if __name__ == '__main__':
+    main()
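Taken together, the pieces this commit adds compose as follows for one-shot (non-streaming) conversion. The sketch below is not part of the commit: every path is a hypothetical placeholder, and note that Vocoder.__init__ as committed builds AcousticFeatureProcess without the f0_estimating_method argument that dataset.py's signature requires, so the constructor call may need that argument before this runs.

# Minimal offline-usage sketch of the API added by this diff.
# All paths below are hypothetical placeholders.
from pathlib import Path

from become_yukarin import AcousticConverter, SuperResolution, Vocoder, VoiceChanger
from become_yukarin.config.config import create_from_json as create_config
from become_yukarin.config.sr_config import create_from_json as create_sr_config

config = create_config(Path('trained/example/config.json'))
sr_config = create_sr_config(Path('trained/example_sr/config.json'))
acoustic_converter = AcousticConverter(config, Path('trained/example/predictor.npz'), gpu=0)
super_resolution = SuperResolution(sr_config, Path('trained/example_sr/predictor.npz'), gpu=0)

vocoder = Vocoder(
    acoustic_feature_param=config.dataset.param.acoustic_feature_param,
    out_sampling_rate=config.dataset.param.voice_param.sample_rate,
)
voice_changer = VoiceChanger(
    acoustic_converter=acoustic_converter,
    super_resolution=super_resolution,
    vocoder=vocoder,
)
out_wave = voice_changer.convert_from_wave_path('input.wav')  # returns a Wave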
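For streaming use, VoiceChangerStream collects timestamped Segment objects and VoiceChangerStreamWrapper walks a cursor over them, converting fixed-length windows. A minimal sketch driving that loop with synthetic silence, reusing the voice_changer object from the sketch above:

# Streaming sketch with synthetic (silent) input; assumes voice_changer
# from the previous sketch and an illustrative sampling rate.
import numpy

from become_yukarin.data_struct import Wave
from become_yukarin.voice_changer import VoiceChangerStream
from become_yukarin.voice_changer import VoiceChangerStreamWrapper

rate = 16000  # must match the sampling rate the models were trained at
stream = VoiceChangerStream(voice_changer=voice_changer, sampling_rate=rate, in_dtype=numpy.float32)
wrapper = VoiceChangerStreamWrapper(voice_changer_stream=stream, extra_time=0.2)

start_time = 0.0
for _ in range(5):
    chunk = numpy.zeros(rate, dtype=numpy.float32)  # one second of stand-in audio
    stream.add_wave(start_time=start_time, wave=Wave(wave=chunk, sampling_rate=rate))
    start_time += 1.0
    out_wave = wrapper.convert_next(time_length=1.0)  # convert the next second
    wrapper.remove_previous_wave()  # drop segments the cursor has passed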
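The extra_time bookkeeping in convert_with_extra_time widens the conversion window by extra_time on each side, converts it, then trims pad = extra_time / (frame_period / 1000) feature frames from both ends, so the frames that bordered zero padding never reach the vocoder. With illustrative numbers (a 5 ms frame period and 0.2 s of extra context):

# Worked example of the frame-trimming arithmetic (illustrative values only).
frame_period = 5.0   # milliseconds per WORLD frame
extra_time = 0.2     # seconds of extra context on each side
time_length = 1.0    # seconds actually requested

pad = int(extra_time / (frame_period / 1000))  # 40 frames trimmed per side
frames_converted = int((time_length + 2 * extra_time) / (frame_period / 1000))  # 280
frames_kept = frames_converted - 2 * pad  # 200 frames, i.e. exactly 1.0 s of audio
print(pad, frames_converted, frames_kept)  # 40 280 200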
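In the realtime script, output_callback must hand PyAudio exactly audio_config.chunk samples per call even though converted waves arrive in arbitrary-length pieces, so it buffers pieces until a full chunk is available and carries the remainder over in queue_output_fragment_wave. The same pattern, reduced to a self-contained toy with illustrative sizes:

# Toy reduction of output_callback's chunk-reassembly pattern.
import queue

import numpy

chunk = 4  # illustrative chunk size; the script uses audio_config.chunk
queue_output_wave = queue.Queue()
queue_output_fragment_wave = queue.Queue(maxsize=1)

for piece in ([1.0, 2.0], [3.0, 4.0, 5.0], [6.0]):
    queue_output_wave.put(numpy.array(piece, dtype=numpy.float32))

try:
    wave = queue_output_fragment_wave.get_nowait()  # leftover from the last call
except queue.Empty:
    wave = numpy.empty(0, dtype=numpy.float32)

while len(wave) < chunk:  # block until a full chunk is buffered
    wave = numpy.concatenate([wave, queue_output_wave.get()])

out, fragment = wave[:chunk], wave[chunk:]
queue_output_fragment_wave.put(fragment)  # stash the remainder for the next call
print(out, fragment)  # [1. 2. 3. 4.] [5.]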
