| author | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-03-12 00:08:43 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-03-12 00:08:43 +0900 |
| commit | 210e8225e4f7c95d6d0c89309b5b1eb20e41e744 | |
| tree | 2bd9f737040217ffb1eefda7ce61f1a3198390d8 | |
| parent | f8823b1913c29ce2710f92d51b74cb84b74323b0 | |
Split out the realtime functionality
| mode | file | deletions |
|---|---|---|
| -rw-r--r-- | become_yukarin/voice_changer.py | 252 |
| -rw-r--r-- | scripts/realtime_voice_changer.py | 241 |
| -rw-r--r-- | tests/test_voice_changer.py | 114 |
3 files changed, 0 insertions, 607 deletions
```diff
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 7f7bbe4..698244c 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,15 +1,8 @@
-from abc import ABCMeta, abstractproperty, abstractmethod
-from typing import List, Callable, Any
-from typing import NamedTuple
-
 import numpy
 
-from become_yukarin.param import Param
 from .acoustic_converter import AcousticConverter
 from .data_struct import AcousticFeature
-from .data_struct import Wave
 from .super_resolution import SuperResolution
-from .vocoder import Vocoder
 
 
 class VoiceChanger(object):
@@ -31,248 +24,3 @@ class VoiceChanger(object):
         s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
         f_high = self.super_resolution.convert_to_feature(s_high, f_low)
         return f_high
-
-
-class BaseSegment(ABCMeta):
-    start_time: float
-
-    @property
-    @abstractmethod
-    def time_length(self) -> float:
-        pass
-
-    @property
-    @abstractmethod
-    def end_time(self) -> float:
-        pass
-
-
-class FeatureSegment(NamedTuple, BaseSegment):
-    start_time: float
-    feature: AcousticFeature
-    frame_period: float
-
-    @property
-    def time_length(self):
-        return len(self.feature.f0) * self.frame_period / 1000
-
-    @property
-    def end_time(self):
-        return self.time_length + self.start_time
-
-
-class WaveSegment(NamedTuple, BaseSegment):
-    start_time: float
-    wave: Wave
-
-    @property
-    def time_length(self):
-        return len(self.wave.wave) / self.wave.sampling_rate
-
-    @property
-    def end_time(self):
-        return self.time_length + self.start_time
-
-
-class VoiceChangerStream(object):
-    def __init__(
-            self,
-            sampling_rate: int,
-            frame_period: float,
-            order: int,
-            in_dtype=numpy.float32,
-    ):
-        self.sampling_rate = sampling_rate
-        self.frame_period = frame_period
-        self.order = order
-        self.in_dtype = in_dtype
-
-        self.voice_changer: VoiceChanger = None
-        self.vocoder: Vocoder = None
-        self._data_stream = []  # type: List[WaveSegment]
-        self._in_feature_stream = []  # type: List[FeatureSegment]
-        self._out_feature_stream = []  # type: List[FeatureSegment]
-
-    def add_wave(self, start_time: float, wave: Wave):
-        # validation
-        assert wave.sampling_rate == self.sampling_rate
-        assert wave.wave.dtype == self.in_dtype
-
-        segment = WaveSegment(start_time=start_time, wave=wave)
-        self._data_stream.append(segment)
-
-    def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
-        # validation
-        assert frame_period == self.frame_period
-        assert feature.f0.dtype == self.in_dtype
-
-        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
-        self._in_feature_stream.append(segment)
-
-    def add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
-        # validation
-        assert frame_period == self.frame_period
-
-        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
-        self._out_feature_stream.append(segment)
-
-    def remove(self, end_time: float):
-        self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
-        self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream))
-        self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream))
-
-    @staticmethod
-    def fetch(
-            start_time: float,
-            time_length: float,
-            data_stream: List[BaseSegment],
-            rate: float,
-            pad_function: Callable[[int], Any],
-            pick_function: Callable[[Any, int, int], Any],
-            concat_function: Callable[[List], Any],
-            extra_time: float = 0,
-    ):
-        start_time -= extra_time
-        time_length += extra_time * 2
-
-        end_time = start_time + time_length
-        buffer_list = []
-        stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), data_stream)
-
-        start_time_buffer = start_time
-        remaining_time = time_length
-        for segment in stream:
-            # padding
-            if segment.start_time > start_time_buffer:
-                length = int((segment.start_time - start_time_buffer) * rate)
-                pad = pad_function(length)
-                buffer_list.append(pad)
-                start_time_buffer = segment.start_time
-
-            if remaining_time > segment.end_time - start_time_buffer:
-                one_time_length = segment.end_time - start_time_buffer
-            else:
-                one_time_length = remaining_time
-
-            first_index = int((start_time_buffer - segment.start_time) * rate)
-            last_index = int(first_index + one_time_length * rate)
-            one_buffer = pick_function(segment, first_index, last_index)
-            buffer_list.append(one_buffer)
-
-            start_time_buffer += one_time_length
-            remaining_time -= one_time_length
-
-            if start_time_buffer >= end_time:
-                break
-        else:
-            # last padding
-            length = int((end_time - start_time_buffer) * rate)
-            pad = pad_function(length)
-            buffer_list.append(pad)
-
-        buffer = concat_function(buffer_list)
-        return buffer
-
-    def pre_convert(self, start_time: float, time_length: float, extra_time: float):
-        wave = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            extra_time=extra_time,
-            data_stream=self._data_stream,
-            rate=self.sampling_rate,
-            pad_function=lambda length: numpy.zeros(shape=length, dtype=self.in_dtype),
-            pick_function=lambda segment, first, last: segment.wave.wave[first:last],
-            concat_function=numpy.concatenate,
-        )
-        in_wave = Wave(wave=wave, sampling_rate=self.sampling_rate)
-        in_feature = self.vocoder.encode(in_wave)
-
-        pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000))
-        in_feature = in_feature.pick(pad, -pad)
-        return in_feature
-
-    def convert(self, start_time: float, time_length: float, extra_time: float):
-        sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
-        keys = ['f0', 'aperiodicity', 'mfcc', 'voiced']
-        in_feature = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            extra_time=extra_time,
-            data_stream=self._in_feature_stream,
-            rate=1000 / self.frame_period,
-            pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
-            pick_function=lambda segment, first, last: segment.feature.pick(first, last),
-            concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
-        )
-        out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
-
-        pad = int(extra_time * 1000 / self.frame_period)
-        out_feature = out_feature.pick(pad, -pad)
-        return out_feature
-
-    def post_convert(self, start_time: float, time_length: float):
-        sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
-        keys = ['f0', 'aperiodicity', 'spectrogram', 'voiced']
-        out_feature = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            data_stream=self._out_feature_stream,
-            rate=1000 / self.frame_period,
-            pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
-            pick_function=lambda segment, first, last: segment.feature.pick(first, last),
-            concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
-        )
-
-        out_wave = self.vocoder.decode(
-            acoustic_feature=out_feature,
-        )
-        return out_wave
-
-
-class VoiceChangerStreamWrapper(object):
-    def __init__(
-            self,
-            voice_changer_stream: VoiceChangerStream,
-            extra_time_pre: float = 0.0,
-            extra_time: float = 0.0,
-    ):
-        self.voice_changer_stream = voice_changer_stream
-        self.extra_time_pre = extra_time_pre
-        self.extra_time = extra_time
-        self._current_time_pre = 0
-        self._current_time = 0
-        self._current_time_post = 0
-
-    def pre_convert_next(self, time_length: float):
-        in_feature = self.voice_changer_stream.pre_convert(
-            start_time=self._current_time_pre,
-            time_length=time_length,
-            extra_time=self.extra_time_pre,
-        )
-        self._current_time_pre += time_length
-        return in_feature
-
-    def convert_next(self, time_length: float):
-        out_feature = self.voice_changer_stream.convert(
-            start_time=self._current_time,
-            time_length=time_length,
-            extra_time=self.extra_time,
-        )
-        self._current_time += time_length
-        return out_feature
-
-    def post_convert_next(self, time_length: float):
-        out_wave = self.voice_changer_stream.post_convert(
-            start_time=self._current_time_post,
-            time_length=time_length,
-        )
-        self._current_time_post += time_length
-        return out_wave
-
-    def remove_previous(self):
-        end_time = min(
-            self._current_time_pre - self.extra_time_pre,
-            self._current_time - self.extra_time,
-            self._current_time_post,
-        )
-        self.voice_changer_stream.remove(end_time=end_time)
```
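For reference, the core of the removed `VoiceChangerStream` is its `fetch` routine: given a list of time-stamped segments, it assembles an arbitrary `[start_time, start_time + time_length)` window, padding with silence before, between, and after segments, with optional `extra_time` margins that the callers later trim off. Below is a minimal, self-contained sketch of that windowing idea on plain NumPy arrays; the `Segment` type and the simplified `fetch` signature are illustrative stand-ins, not the project's actual API.

```python
from typing import List, NamedTuple

import numpy


class Segment(NamedTuple):
    start_time: float    # seconds
    data: numpy.ndarray  # samples at `rate`


def fetch(segments: List[Segment], start_time: float, time_length: float, rate: float) -> numpy.ndarray:
    """Assemble the window [start_time, start_time + time_length), zero-padding any gaps."""
    end_time = start_time + time_length
    out: List[numpy.ndarray] = []
    cursor = start_time
    for seg in segments:
        seg_end = seg.start_time + len(seg.data) / rate
        if seg_end <= cursor or seg.start_time >= end_time:
            continue  # segment does not overlap the requested window
        if seg.start_time > cursor:  # gap before this segment: pad with silence
            out.append(numpy.zeros(int((seg.start_time - cursor) * rate), dtype=seg.data.dtype))
            cursor = seg.start_time
        first = int((cursor - seg.start_time) * rate)
        last = int((min(seg_end, end_time) - seg.start_time) * rate)
        out.append(seg.data[first:last])
        cursor = min(seg_end, end_time)
        if cursor >= end_time:
            break
    if cursor < end_time:  # trailing gap: pad out to the full window length
        out.append(numpy.zeros(int((end_time - cursor) * rate), dtype=numpy.float32))
    return numpy.concatenate(out)


# e.g. two 1-second chunks with a 0.5 s hole between them:
rate = 8000
a = Segment(0.0, numpy.ones(rate, dtype=numpy.float32))
b = Segment(1.5, numpy.ones(rate, dtype=numpy.float32))
window = fetch([a, b], start_time=0.5, time_length=2.0, rate=rate)
assert len(window) == int(2.0 * rate)
```

The real class generalizes this with `pad_function`/`pick_function`/`concat_function` callbacks so the same walk works for raw waves and for `AcousticFeature` frame streams.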
```diff
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
deleted file mode 100644
index bda64dd..0000000
--- a/scripts/realtime_voice_changer.py
+++ /dev/null
@@ -1,241 +0,0 @@
-import librosa
-import world4py
-
-world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
-
-from pathlib import Path
-from typing import NamedTuple
-from multiprocessing import Queue
-from multiprocessing import Process
-
-import numpy
-import pyaudio
-
-from become_yukarin import AcousticConverter
-from become_yukarin import Vocoder
-from become_yukarin import RealtimeVocoder
-from become_yukarin import SuperResolution
-from become_yukarin import VoiceChanger
-from become_yukarin.config.config import Config
-from become_yukarin.config.config import create_from_json as create_config
-from become_yukarin.config.sr_config import create_from_json as create_sr_config
-from become_yukarin.data_struct import Wave
-from become_yukarin.data_struct import AcousticFeature
-from become_yukarin.voice_changer import VoiceChangerStream
-from become_yukarin.voice_changer import VoiceChangerStreamWrapper
-
-
-class AudioConfig(NamedTuple):
-    rate: int
-    frame_period: float
-    audio_chunk: int
-    convert_chunk: int
-    vocoder_buffer_size: int
-    out_norm: float
-    silent_threshold: float
-
-
-def encode_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.vocoder = Vocoder(
-        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-        out_sampling_rate=audio_config.rate,
-    )
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-
-    while True:
-        wave = queue_input.get()
-
-        w = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
-        start_time += time_length
-
-        feature = wrapper.pre_convert_next(time_length=time_length)
-        queue_output.put(feature)
-
-
-def convert_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        acoustic_converter: AcousticConverter,
-        super_resolution: SuperResolution,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
-        super_resolution=super_resolution,
-        acoustic_converter=acoustic_converter,
-    )
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-    while True:
-        in_feature: AcousticFeature = queue_input.get()
-        wrapper.voice_changer_stream.add_in_feature(
-            start_time=start_time,
-            feature=in_feature,
-            frame_period=audio_config.frame_period,
-        )
-        start_time += time_length
-
-        out_feature = wrapper.convert_next(time_length=time_length)
-        queue_output.put(out_feature)
-
-
-def decode_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
-        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-        out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
-    )
-    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-    wave_fragment = numpy.empty(0)
-    while True:
-        feature: AcousticFeature = queue_input.get()
-        wrapper.voice_changer_stream.add_out_feature(
-            start_time=start_time,
-            feature=feature,
-            frame_period=audio_config.frame_period,
-        )
-        start_time += time_length
-
-        wave = wrapper.post_convert_next(time_length=time_length).wave
-
-        wave_fragment = numpy.concatenate([wave_fragment, wave])
-        if len(wave_fragment) >= audio_config.audio_chunk:
-            wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
-
-            power = librosa.core.power_to_db(numpy.abs(librosa.stft(wave)) ** 2).mean()
-            if power >= audio_config.silent_threshold:
-                queue_output.put(wave)
-
-
-def main():
-    print('model loading...', flush=True)
-
-    queue_input_wave = Queue()
-    queue_input_feature = Queue()
-    queue_output_feature = Queue()
-    queue_output_wave = Queue()
-
-    model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
-    config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
-    config = create_config(config_path)
-    acoustic_converter = AcousticConverter(config, model_path, gpu=0)
-    print('model 1 loaded!', flush=True)
-
-    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
-    config_path = Path('./trained/sr-noise3/config.json')
-    sr_config = create_sr_config(config_path)
-    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
-    print('model 2 loaded!', flush=True)
-
-    audio_instance = pyaudio.PyAudio()
-    audio_config = AudioConfig(
-        rate=config.dataset.param.voice_param.sample_rate,
-        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
-        audio_chunk=config.dataset.param.voice_param.sample_rate,
-        convert_chunk=config.dataset.param.voice_param.sample_rate,
-        vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
-        out_norm=2.5,
-        silent_threshold=-99.0,
-    )
-
-    voice_changer_stream = VoiceChangerStream(
-        sampling_rate=audio_config.rate,
-        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
-        order=config.dataset.param.acoustic_feature_param.order,
-        in_dtype=numpy.float32,
-    )
-
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time_pre=0.2,
-        extra_time=0.1,
-    )
-
-    process_encoder = Process(target=encode_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        audio_config=audio_config,
-        queue_input=queue_input_wave,
-        queue_output=queue_input_feature,
-    ))
-    process_encoder.start()
-
-    process_converter = Process(target=convert_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        acoustic_converter=acoustic_converter,
-        super_resolution=super_resolution,
-        audio_config=audio_config,
-        queue_input=queue_input_feature,
-        queue_output=queue_output_feature,
-    ))
-    process_converter.start()
-
-    process_decoder = Process(target=decode_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        audio_config=audio_config,
-        queue_input=queue_output_feature,
-        queue_output=queue_output_wave,
-    ))
-    process_decoder.start()
-
-    audio_stream = audio_instance.open(
-        format=pyaudio.paFloat32,
-        channels=1,
-        rate=audio_config.rate,
-        frames_per_buffer=audio_config.audio_chunk,
-        input=True,
-        output=True,
-    )
-
-    # process_converter.join()
-
-    while True:
-        # input audio
-        in_data = audio_stream.read(audio_config.audio_chunk)
-        wave = numpy.fromstring(in_data, dtype=numpy.float32)
-        print('input', len(wave), flush=True)
-        queue_input_wave.put(wave)
-
-        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
-        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
-        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
-        print('queue_output_wave', queue_output_wave.qsize(), flush=True)
-
-        # output
-        try:
-            wave = queue_output_wave.get_nowait()
-        except:
-            wave = None
-
-        if wave is not None:
-            print('output', len(wave), flush=True)
-            wave *= audio_config.out_norm
-            b = wave.astype(numpy.float32).tobytes()
-            audio_stream.write(b)
-
-
-if __name__ == '__main__':
-    main()
```
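The deleted script splits the realtime path into three processes — encode (wave → acoustic feature), convert (source feature → target feature), and decode (target feature → wave) — connected by `multiprocessing.Queue`s so the stages overlap in time, while the main process does blocking PyAudio I/O. (As an aside, `numpy.fromstring` used above is deprecated in modern NumPy; `numpy.frombuffer` is the current equivalent.) Here is a stripped-down sketch of that queue-pipeline pattern; the worker bodies are placeholder arithmetic, not the project's models.

```python
from multiprocessing import Process, Queue

import numpy


def encode_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # wave chunk -> acoustic feature
        wave = q_in.get()
        q_out.put(wave.mean())  # placeholder for vocoder feature extraction


def convert_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # source feature -> target feature
        feature = q_in.get()
        q_out.put(feature * 2.0)  # placeholder for the learned conversion


def decode_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # target feature -> wave chunk
        feature = q_in.get()
        q_out.put(numpy.full(1024, feature, dtype=numpy.float32))  # placeholder synthesis


if __name__ == '__main__':
    q_wave, q_feat_in, q_feat_out, q_out = Queue(), Queue(), Queue(), Queue()
    stages = [
        Process(target=encode_worker, args=(q_wave, q_feat_in), daemon=True),
        Process(target=convert_worker, args=(q_feat_in, q_feat_out), daemon=True),
        Process(target=decode_worker, args=(q_feat_out, q_out), daemon=True),
    ]
    for p in stages:
        p.start()

    q_wave.put(numpy.zeros(1024, dtype=numpy.float32))
    print(len(q_out.get()))  # 1024: one converted chunk came out the far end
```

Each worker owns the state for its own stage (in the original, the vocoder or the converter models), so the heavyweight neural conversion never blocks audio capture or playback.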
```diff
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
deleted file mode 100644
index 9772bbe..0000000
--- a/tests/test_voice_changer.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import world4py
-world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
-
-
-from pathlib import Path
-from typing import NamedTuple
-
-import librosa
-import numpy
-
-from become_yukarin import AcousticConverter
-from become_yukarin import RealtimeVocoder
-from become_yukarin import SuperResolution
-from become_yukarin import VoiceChanger
-from become_yukarin.config.config import create_from_json as create_config
-from become_yukarin.config.sr_config import create_from_json as create_sr_config
-from become_yukarin.data_struct import Wave
-from become_yukarin.voice_changer import VoiceChangerStream
-from become_yukarin.voice_changer import VoiceChangerStreamWrapper
-
-
-class AudioConfig(NamedTuple):
-    rate: int
-    chunk: int
-    vocoder_buffer_size: int
-    out_norm: float
-
-
-model_base_path = Path('~/Github/become-yukarin/trained/').expanduser()
-test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
-test_output_path = Path('output.wav')
-
-print('model loading...', flush=True)
-
-model_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
-config_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/config.json')
-config = create_config(config_path)
-acoustic_converter = AcousticConverter(config, model_path)
-print('model 1 loaded!', flush=True)
-
-model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
-config_path = model_base_path / Path('sr-noise3/config.json')
-sr_config = create_sr_config(config_path)
-super_resolution = SuperResolution(sr_config, model_path)
-print('model 2 loaded!', flush=True)
-
-audio_config = AudioConfig(
-    rate=config.dataset.param.voice_param.sample_rate,
-    chunk=config.dataset.param.voice_param.sample_rate,
-    vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
-    out_norm=4.5,
-)
-frame_period = config.dataset.param.acoustic_feature_param.frame_period
-
-vocoder = RealtimeVocoder(
-    acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-    out_sampling_rate=audio_config.rate,
-    buffer_size=audio_config.vocoder_buffer_size,
-    number_of_pointers=16,
-)
-
-voice_changer = VoiceChanger(
-    super_resolution=super_resolution,
-    acoustic_converter=acoustic_converter,
-)
-
-voice_changer_stream = VoiceChangerStream(
-    sampling_rate=audio_config.rate,
-    frame_period=acoustic_converter._param.acoustic_feature_param.frame_period,
-    order=acoustic_converter._param.acoustic_feature_param.order,
-    in_dtype=numpy.float32,
-)
-
-voice_changer_stream.voice_changer = voice_changer
-voice_changer_stream.vocoder = vocoder
-
-wrapper = VoiceChangerStreamWrapper(
-    voice_changer_stream=voice_changer_stream,
-    extra_time_pre=1,
-    extra_time=0.2,
-)
-
-raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
-wave_out_list = []
-
-start_time = 0
-for i in range(0, len(raw_wave), audio_config.chunk):
-    wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
-    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
-    start_time += len(wave_in.wave) / wave_in.sampling_rate
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wrapper.voice_changer_stream.add_in_feature(start_time=start_time, feature=feature_in, frame_period=frame_period)
-    start_time += audio_config.chunk / audio_config.rate
-    print('pre', i, flush=True)
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
-    start_time += audio_config.chunk / audio_config.rate
-    print('cent', i, flush=True)
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wave_out_list.append(wave_out)
-    start_time += audio_config.chunk / audio_config.rate
-    print('post', i, flush=True)
-
-out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
-librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)
```
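The deleted test drives the same machinery offline: load a wav, feed it to the stream in one-second chunks, then run the pre-convert, convert, and post-convert passes chunk by chunk and concatenate the output. (Note that `librosa.output.write_wav` used at the end was removed in librosa 0.8; `soundfile.write` is the usual replacement today.) A generic sketch of that chunk-and-stitch pattern, with a trivial identity conversion standing in for the removed wrapper:

```python
import numpy


def process_in_chunks(wave: numpy.ndarray, chunk: int, convert) -> numpy.ndarray:
    """Run `convert` over fixed-size chunks and stitch the results back together."""
    out = []
    for i in range(0, len(wave), chunk):
        piece = wave[i:i + chunk]
        if len(piece) < chunk:  # zero-pad the tail so every chunk has the same length
            piece = numpy.pad(piece, (0, chunk - len(piece)))
        out.append(convert(piece))
    return numpy.concatenate(out)[:len(wave)]


# an identity "conversion" round-trips the signal exactly:
rate = 24000
wave = numpy.random.default_rng(0).standard_normal(rate * 3 + 123).astype(numpy.float32)
restored = process_in_chunks(wave, chunk=rate, convert=lambda w: w)
assert numpy.allclose(restored, wave)
```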
