diff options
| author | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-01-19 22:34:45 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-01-19 22:34:45 +0900 |
| commit | 4b581ca1c7552094221d236d596e7488aa69d0de (patch) | |
| tree | a7019ea6085c06bc42d5e62ae2c08a6de7e56de4 /become_yukarin/acoustic_converter.py | |
| parent | 86079f0cea1f79beb7cbbec08f6c19191929207a (diff) | |
on PUG
Diffstat (limited to 'become_yukarin/acoustic_converter.py')
| -rw-r--r-- | become_yukarin/acoustic_converter.py | 141 |
1 file changed, 141 insertions, 0 deletions
from functools import partial
from pathlib import Path
from typing import Optional

import chainer
import numpy
import pysptk
import pyworld

from become_yukarin.config.config import Config
from become_yukarin.data_struct import AcousticFeature
from become_yukarin.data_struct import Wave
from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
from become_yukarin.dataset.dataset import AcousticFeatureProcess
from become_yukarin.dataset.dataset import DecodeFeatureProcess
from become_yukarin.dataset.dataset import EncodeFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess
from become_yukarin.model.model import create_predictor


class AcousticConverter(object):
    """Voice-conversion front end.

    Loads a trained chainer predictor plus the dataset's normalization
    statistics, converts source-speaker acoustic features into the target
    speaker's feature space, and can re-synthesize a waveform with WORLD
    (pyworld).
    """

    def __init__(self, config: Config, model_path: Path, gpu: Optional[int] = None):
        """Load the trained model and build the pre/post-processing pipeline.

        :param config: training/dataset configuration; supplies the mean/var
            statistics paths and the acoustic-feature parameters.
        :param model_path: path to a chainer ``.npz`` snapshot of the predictor.
        :param gpu: chainer GPU device id, or None to run on CPU.
        """
        self.config = config
        self.model_path = model_path
        self.gpu = gpu

        self.model = model = create_predictor(config.model)
        chainer.serializers.load_npz(str(model_path), model)
        if self.gpu is not None:
            model.to_gpu(self.gpu)

        self._param = param = config.dataset.param
        # top_db=None — presumably disables loudness-based trimming so frame
        # counts stay consistent; confirm against WaveFileLoadProcess.
        self._wave_process = WaveFileLoadProcess(
            sample_rate=param.voice_param.sample_rate,
            top_db=None,
        )
        self._feature_process = AcousticFeatureProcess(
            frame_period=param.acoustic_feature_param.frame_period,
            order=param.acoustic_feature_param.order,
            alpha=param.acoustic_feature_param.alpha,
        )

        self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess()

        # Input statistics normalize the source speaker's features before the
        # model; target statistics denormalize the model's output afterwards.
        input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True)
        input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True)
        target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True)
        target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True)
        self._feature_normalize = AcousticFeatureNormalizeProcess(
            mean=input_mean,
            var=input_var,
        )
        self._feature_denormalize = AcousticFeatureDenormalizeProcess(
            mean=target_mean,
            var=target_var,
        )

        feature_sizes = AcousticFeature.get_sizes(
            sampling_rate=param.voice_param.sample_rate,
            order=param.acoustic_feature_param.order,
        )
        # Encode packs the configured feature set into the model's input
        # array; decode splits the model output back into named features.
        self._encode_feature = EncodeFeatureProcess(config.dataset.features)
        self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)

    def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
        """Convert source-speaker features to target-speaker features.

        Pipeline: normalize -> encode -> predictor -> decode -> denormalize,
        then rebuild the spectrogram from the converted mel-cepstrum
        (pysptk.mc2sp) at ``out_sampling_rate``.

        :param input: source-speaker acoustic features.
        :param out_sampling_rate: sampling rate used to pick the CheapTrick
            FFT size; defaults to the configured sample rate.
        :return: converted AcousticFeature, cast to float64.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        # Keep the raw (un-normalized) input: its voiced mask and
        # aperiodicity are re-injected into the result below.
        input_feature = input
        input = self._feature_normalize(input, test=True)
        input = self._encode_feature(input, test=True)

        # concat_examples batches the single example (padding with 0) and
        # moves it to the configured device.
        converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)
        inputs = converter([input])

        # Inference mode: disable chainer's train-time behavior.
        with chainer.using_config('train', False):
            out = self.model(inputs).data[0]

        if self.gpu is not None:
            out = chainer.cuda.to_cpu(out)

        out = self._decode_feature(out, test=True)
        # Carry the source's voiced mask into the output before denormalizing.
        out = AcousticFeature(
            f0=out.f0,
            spectrogram=out.spectrogram,
            aperiodicity=out.aperiodicity,
            mfcc=out.mfcc,
            voiced=input_feature.voiced,
        )
        out = self._feature_denormalize(out, test=True)
        # Reuse the source speaker's raw aperiodicity verbatim (the model's
        # aperiodicity output is discarded after denormalization).
        out = AcousticFeature(
            f0=out.f0,
            spectrogram=out.spectrogram,
            aperiodicity=input_feature.aperiodicity,
            mfcc=out.mfcc,
            voiced=out.voiced,
        )

        # Rebuild the spectral envelope from the converted mel-cepstrum; the
        # FFT length must match what WORLD expects for this sampling rate.
        fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
        spectrogram = pysptk.mc2sp(
            out.mfcc,
            alpha=self._param.acoustic_feature_param.alpha,
            fftlen=fftlen,
        )

        # float64 cast — presumably required by pyworld's C API; confirm.
        out = AcousticFeature(
            f0=out.f0,
            spectrogram=spectrogram,
            aperiodicity=out.aperiodicity,
            mfcc=out.mfcc,
            voiced=out.voiced,
        ).astype(numpy.float64)
        return out

    def convert_from_audio_path(self, input: Path, out_sampling_rate: Optional[int] = None):
        """Load a wave file, extract features, convert, and synthesize audio."""
        input = self._wave_process(str(input), test=True)
        input = self._feature_process(input, test=True)
        return self.convert_from_feature(input, out_sampling_rate)

    def convert_from_feature_path(self, input: Path, out_sampling_rate: Optional[int] = None):
        """Load a saved AcousticFeature file, convert, and synthesize audio."""
        input = self._acoustic_feature_load_process(input, test=True)
        return self.convert_from_feature(input, out_sampling_rate)

    def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
        """Convert features and synthesize a waveform with WORLD.

        :param input: source-speaker acoustic features.
        :param out_sampling_rate: output sampling rate; defaults to the
            configured sample rate.
        :return: Wave holding the synthesized signal at ``out_sampling_rate``.
        """
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate)
        # pyworld.synthesize takes a 1-D f0 contour, hence ravel().
        out = pyworld.synthesize(
            f0=out.f0.ravel(),
            spectrogram=out.spectrogram,
            aperiodicity=out.aperiodicity,
            fs=out_sampling_rate,
            frame_period=self._param.acoustic_feature_param.frame_period,
        )
        return Wave(out, sampling_rate=out_sampling_rate)

    def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
        """Shorthand for :meth:`convert_from_audio_path`."""
        return self.convert_from_audio_path(voice_path, out_sampling_rate)
