-rw-r--r--  become_yukarin/__init__.py              1
-rw-r--r--  become_yukarin/acoustic_converter.py  141
-rw-r--r--  become_yukarin/super_resolution.py     10
-rw-r--r--  become_yukarin/voice_changer.py       148
4 files changed, 170 insertions(+), 130 deletions(-)
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
index 810ea1f..7513f36 100644
--- a/become_yukarin/__init__.py
+++ b/become_yukarin/__init__.py
@@ -1,5 +1,6 @@
 from . import config
 from . import dataset
 from . import param
+from .acoustic_converter import AcousticConverter
 from .super_resolution import SuperResolution
 from .voice_changer import VoiceChanger
diff --git a/become_yukarin/acoustic_converter.py b/become_yukarin/acoustic_converter.py
new file mode 100644
index 0000000..13e6225
--- /dev/null
+++ b/become_yukarin/acoustic_converter.py
@@ -0,0 +1,141 @@
+from functools import partial
+from pathlib import Path
+from typing import Optional
+
+import chainer
+import numpy
+import pysptk
+import pyworld
+
+from become_yukarin.config.config import Config
+from become_yukarin.data_struct import AcousticFeature
+from become_yukarin.data_struct import Wave
+from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
+from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
+from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.dataset.dataset import DecodeFeatureProcess
+from become_yukarin.dataset.dataset import EncodeFeatureProcess
+from become_yukarin.dataset.dataset import WaveFileLoadProcess
+from become_yukarin.model.model import create_predictor
+
+
+class AcousticConverter(object):
+    def __init__(self, config: Config, model_path: Path, gpu: int = None):
+        self.config = config
+        self.model_path = model_path
+        self.gpu = gpu
+
+        self.model = model = create_predictor(config.model)
+        chainer.serializers.load_npz(str(model_path), model)
+        if self.gpu is not None:
+            model.to_gpu(self.gpu)
+
+        self._param = param = config.dataset.param
+        self._wave_process = WaveFileLoadProcess(
+            sample_rate=param.voice_param.sample_rate,
+            top_db=None,
+        )
+        self._feature_process = AcousticFeatureProcess(
+            frame_period=param.acoustic_feature_param.frame_period,
+            order=param.acoustic_feature_param.order,
+            alpha=param.acoustic_feature_param.alpha,
+        )
+
+        self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess()
+
+        input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True)
+        input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True)
+        target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True)
+        target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True)
+        self._feature_normalize = AcousticFeatureNormalizeProcess(
+            mean=input_mean,
+            var=input_var,
+        )
+        self._feature_denormalize = AcousticFeatureDenormalizeProcess(
+            mean=target_mean,
+            var=target_var,
+        )
+
+        feature_sizes = AcousticFeature.get_sizes(
+            sampling_rate=param.voice_param.sample_rate,
+            order=param.acoustic_feature_param.order,
+        )
+        self._encode_feature = EncodeFeatureProcess(config.dataset.features)
+        self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)
+
+    def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
+        if out_sampling_rate is None:
+            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
+
+        input_feature = input
+        input = self._feature_normalize(input, test=True)
+        input = self._encode_feature(input, test=True)
+
+        converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
+        inputs = converter([input])
+
+        with chainer.using_config('train', False):
+            out = self.model(inputs).data[0]
+
+        if self.gpu is not None:
+            out = chainer.cuda.to_cpu(out)
+
+        out = self._decode_feature(out, test=True)
+        out = AcousticFeature(
+            f0=out.f0,
+            spectrogram=out.spectrogram,
+            aperiodicity=out.aperiodicity,
+            mfcc=out.mfcc,
+            voiced=input_feature.voiced,
+        )
+        out = self._feature_denormalize(out, test=True)
+        out = AcousticFeature(
+            f0=out.f0,
+            spectrogram=out.spectrogram,
+            aperiodicity=input_feature.aperiodicity,
+            mfcc=out.mfcc,
+            voiced=out.voiced,
+        )
+
+        fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
+        spectrogram = pysptk.mc2sp(
+            out.mfcc,
+            alpha=self._param.acoustic_feature_param.alpha,
+            fftlen=fftlen,
+        )
+
+        out = AcousticFeature(
+            f0=out.f0,
+            spectrogram=spectrogram,
+            aperiodicity=out.aperiodicity,
+            mfcc=out.mfcc,
+            voiced=out.voiced,
+        ).astype(numpy.float64)
+        return out
+
+    def convert_from_audio_path(self, input: Path, out_sampling_rate: Optional[int] = None):
+        input = self._wave_process(str(input), test=True)
+        input = self._feature_process(input, test=True)
+        return self.convert_from_feature(input, out_sampling_rate)
+
+    def convert_from_feature_path(self, input: Path, out_sampling_rate: Optional[int] = None):
+        input = self._acoustic_feature_load_process(input, test=True)
+        return self.convert_from_feature(input, out_sampling_rate)
+
+    def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
+        if out_sampling_rate is None:
+            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
+
+        out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate)
+        out = pyworld.synthesize(
+            f0=out.f0.ravel(),
+            spectrogram=out.spectrogram,
+            aperiodicity=out.aperiodicity,
+            fs=out_sampling_rate,
+            frame_period=self._param.acoustic_feature_param.frame_period,
+        )
+        return Wave(out, sampling_rate=out_sampling_rate)
+
+    def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
+        return self.convert_from_audio_path(voice_path, out_sampling_rate)
diff --git a/become_yukarin/super_resolution.py b/become_yukarin/super_resolution.py
index bdb2e61..163057d 100644
--- a/become_yukarin/super_resolution.py
+++ b/become_yukarin/super_resolution.py
@@ -15,12 +15,15 @@ from become_yukarin.model.sr_model import create_predictor_sr
 
 
 class SuperResolution(object):
-    def __init__(self, config: SRConfig, model_path: Path):
+    def __init__(self, config: SRConfig, model_path: Path, gpu: int = None):
         self.config = config
         self.model_path = model_path
+        self.gpu = gpu
 
         self.model = model = create_predictor_sr(config.model)
         chainer.serializers.load_npz(str(model_path), model)
+        if self.gpu is not None:
+            model.to_gpu(self.gpu)
 
         self._param = param = config.dataset.param
         self._wave_process = WaveFileLoadProcess(
@@ -37,7 +40,7 @@ class SuperResolution(object):
         )
 
     def convert(self, input: numpy.ndarray) -> numpy.ndarray:
-        converter = partial(chainer.dataset.convert.concat_examples, padding=0)
+        converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
         pad = 128 - len(input) % 128
         input = numpy.pad(input, [(0, pad), (0, 0)], mode='minimum')
         input = numpy.log(input)[:, :-1]
@@ -47,6 +50,9 @@ class SuperResolution(object):
         with chainer.using_config('train', False):
             out = self.model(inputs).data[0]
 
+        if self.gpu is not None:
+            out = chainer.cuda.to_cpu(out)
+
         out = out[0]
         out = numpy.pad(out, [(0, 0), (0, 1)], mode='edge')
         out = numpy.exp(out)
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 822d8c5..30fbf28 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,135 +1,27 @@
-from functools import partial
-from pathlib import Path
-from typing import Optional
-
-import chainer
 import numpy
-import pysptk
-import pyworld
 
-from become_yukarin.config.config import Config
-from become_yukarin.data_struct import AcousticFeature
-from become_yukarin.data_struct import Wave
-from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
-from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
-from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
-from become_yukarin.dataset.dataset import AcousticFeatureProcess
-from become_yukarin.dataset.dataset import DecodeFeatureProcess
-from become_yukarin.dataset.dataset import EncodeFeatureProcess
-from become_yukarin.dataset.dataset import WaveFileLoadProcess
-from become_yukarin.model.model import create_predictor
+from .acoustic_converter import AcousticConverter
+from .super_resolution import SuperResolution
 
 
 class VoiceChanger(object):
-    def __init__(self, config: Config, model_path: Path):
-        self.config = config
-        self.model_path = model_path
-
-        self.model = model = create_predictor(config.model)
-        chainer.serializers.load_npz(str(model_path), model)
-
-        self._param = param = config.dataset.param
-        self._wave_process = WaveFileLoadProcess(
-            sample_rate=param.voice_param.sample_rate,
-            top_db=None,
-        )
-        self._feature_process = AcousticFeatureProcess(
-            frame_period=param.acoustic_feature_param.frame_period,
-            order=param.acoustic_feature_param.order,
-            alpha=param.acoustic_feature_param.alpha,
-        )
-
-        self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess()
-
-        input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True)
-        input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True)
-        target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True)
-        target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True)
-        self._feature_normalize = AcousticFeatureNormalizeProcess(
-            mean=input_mean,
-            var=input_var,
-        )
-        self._feature_denormalize = AcousticFeatureDenormalizeProcess(
-            mean=target_mean,
-            var=target_var,
-        )
-
-        feature_sizes = AcousticFeature.get_sizes(
-            sampling_rate=param.voice_param.sample_rate,
-            order=param.acoustic_feature_param.order,
-        )
-        self._encode_feature = EncodeFeatureProcess(config.dataset.features)
-        self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)
-
-    def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
-        if out_sampling_rate is None:
-            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
-
-        input_feature = input
-        input = self._feature_normalize(input, test=True)
-        input = self._encode_feature(input, test=True)
-
-        converter = partial(chainer.dataset.convert.concat_examples, padding=0)
-        inputs = converter([input])
-
-        with chainer.using_config('train', False):
-            out = self.model(inputs).data[0]
-
-        out = self._decode_feature(out, test=True)
-        out = AcousticFeature(
-            f0=out.f0,
-            spectrogram=out.spectrogram,
-            aperiodicity=out.aperiodicity,
-            mfcc=out.mfcc,
-            voiced=input_feature.voiced,
-        )
-        out = self._feature_denormalize(out, test=True)
-        out = AcousticFeature(
-            f0=out.f0,
-            spectrogram=out.spectrogram,
-            aperiodicity=input_feature.aperiodicity,
-            mfcc=out.mfcc,
-            voiced=out.voiced,
-        )
-
-        fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
-        spectrogram = pysptk.mc2sp(
-            out.mfcc,
-            alpha=self._param.acoustic_feature_param.alpha,
-            fftlen=fftlen,
-        )
-
-        out = AcousticFeature(
-            f0=out.f0,
-            spectrogram=spectrogram,
-            aperiodicity=out.aperiodicity,
-            mfcc=out.mfcc,
-            voiced=out.voiced,
-        ).astype(numpy.float64)
-        return out
-
-    def convert_from_audio_path(self, input: Path, out_sampling_rate: Optional[int] = None):
-        input = self._wave_process(str(input), test=True)
-        input = self._feature_process(input, test=True)
-        return self.convert_from_feature(input, out_sampling_rate)
-
-    def convert_from_feature_path(self, input: Path, out_sampling_rate: Optional[int] = None):
-        input = self._acoustic_feature_load_process(input, test=True)
-        return self.convert_from_feature(input, out_sampling_rate)
-
-    def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
-        if out_sampling_rate is None:
-            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
+    def __init__(
+            self,
+            acoustic_converter: AcousticConverter,
+            super_resolution: SuperResolution,
+            output_sampling_rate: int = None,
+    ):
+        if output_sampling_rate is None:
+            output_sampling_rate = super_resolution.config.dataset.param.voice_param.sample_rate
 
-        out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate)
-        out = pyworld.synthesize(
-            f0=out.f0.ravel(),
-            spectrogram=out.spectrogram,
-            aperiodicity=out.aperiodicity,
-            fs=out_sampling_rate,
-            frame_period=self._param.acoustic_feature_param.frame_period,
-        )
-        return Wave(out, sampling_rate=out_sampling_rate)
+        self.acoustic_converter = acoustic_converter
+        self.super_resolution = super_resolution
+        self.output_sampling_rate = output_sampling_rate
 
-    def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
-        return self.convert_from_audio_path(voice_path, out_sampling_rate)
+    def convert_from_wave_path(self, wave_path: str):
+        w_in = self.acoustic_converter._wave_process(wave_path)
+        f_in = self.acoustic_converter._feature_process(w_in)
+        f_low = self.acoustic_converter.convert_to_feature(f_in)
+        s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
+        wave = self.super_resolution(s_high, acoustic_feature=f_low, sampling_rate=self.output_sampling_rate)
+        return wave
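For orientation: this commit extracts the feature-conversion pipeline out of VoiceChanger into the new AcousticConverter, adds optional GPU placement (the gpu argument plus to_gpu/to_cpu) to both AcousticConverter and SuperResolution, and reduces VoiceChanger to a thin composition of the two. The sketch below shows how the refactored classes are meant to be wired together; it is not part of the commit, and it assumes Config/SRConfig objects are constructed elsewhere and that trained model files exist (the model file names here are placeholders).

    from pathlib import Path

    from become_yukarin import AcousticConverter, SuperResolution, VoiceChanger

    # config (Config) and sr_config (SRConfig) are assumed to be built elsewhere;
    # 'predictor.npz' and 'sr_predictor.npz' are placeholder model paths.
    acoustic_converter = AcousticConverter(config, Path('predictor.npz'), gpu=0)  # gpu=None keeps the model on CPU
    super_resolution = SuperResolution(sr_config, Path('sr_predictor.npz'), gpu=0)

    # VoiceChanger no longer loads a model itself; it chains the two converters.
    # output_sampling_rate defaults to the super-resolution config's sample rate.
    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
    )
    wave = voice_changer.convert_from_wave_path('input.wav')  # synthesized waveform plus its sampling rate

Splitting the pipeline this way lets the acoustic model and the spectrogram super-resolution model be trained, loaded, and placed on a GPU independently of each other.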
