path: root/become_yukarin/voice_changer.py
Diffstat (limited to 'become_yukarin/voice_changer.py')
-rw-r--r--    become_yukarin/voice_changer.py    148
1 file changed, 20 insertions(+), 128 deletions(-)
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 822d8c5..30fbf28 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,135 +1,27 @@
-from functools import partial
-from pathlib import Path
-from typing import Optional
-
-import chainer
import numpy
-import pysptk
-import pyworld
-from become_yukarin.config.config import Config
-from become_yukarin.data_struct import AcousticFeature
-from become_yukarin.data_struct import Wave
-from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
-from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
-from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
-from become_yukarin.dataset.dataset import AcousticFeatureProcess
-from become_yukarin.dataset.dataset import DecodeFeatureProcess
-from become_yukarin.dataset.dataset import EncodeFeatureProcess
-from become_yukarin.dataset.dataset import WaveFileLoadProcess
-from become_yukarin.model.model import create_predictor
+from .acoustic_converter import AcousticConverter
+from .super_resolution import SuperResolution
class VoiceChanger(object):
- def __init__(self, config: Config, model_path: Path):
- self.config = config
- self.model_path = model_path
-
- self.model = model = create_predictor(config.model)
- chainer.serializers.load_npz(str(model_path), model)
-
- self._param = param = config.dataset.param
- self._wave_process = WaveFileLoadProcess(
- sample_rate=param.voice_param.sample_rate,
- top_db=None,
- )
- self._feature_process = AcousticFeatureProcess(
- frame_period=param.acoustic_feature_param.frame_period,
- order=param.acoustic_feature_param.order,
- alpha=param.acoustic_feature_param.alpha,
- )
-
- self._acoustic_feature_load_process = acoustic_feature_load_process = AcousticFeatureLoadProcess()
-
- input_mean = acoustic_feature_load_process(config.dataset.input_mean_path, test=True)
- input_var = acoustic_feature_load_process(config.dataset.input_var_path, test=True)
- target_mean = acoustic_feature_load_process(config.dataset.target_mean_path, test=True)
- target_var = acoustic_feature_load_process(config.dataset.target_var_path, test=True)
- self._feature_normalize = AcousticFeatureNormalizeProcess(
- mean=input_mean,
- var=input_var,
- )
- self._feature_denormalize = AcousticFeatureDenormalizeProcess(
- mean=target_mean,
- var=target_var,
- )
-
- feature_sizes = AcousticFeature.get_sizes(
- sampling_rate=param.voice_param.sample_rate,
- order=param.acoustic_feature_param.order,
- )
- self._encode_feature = EncodeFeatureProcess(config.dataset.features)
- self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)
-
- def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
- if out_sampling_rate is None:
- out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
-
- input_feature = input
- input = self._feature_normalize(input, test=True)
- input = self._encode_feature(input, test=True)
-
- converter = partial(chainer.dataset.convert.concat_examples, padding=0)
- inputs = converter([input])
-
- with chainer.using_config('train', False):
- out = self.model(inputs).data[0]
-
- out = self._decode_feature(out, test=True)
- out = AcousticFeature(
- f0=out.f0,
- spectrogram=out.spectrogram,
- aperiodicity=out.aperiodicity,
- mfcc=out.mfcc,
- voiced=input_feature.voiced,
- )
- out = self._feature_denormalize(out, test=True)
- out = AcousticFeature(
- f0=out.f0,
- spectrogram=out.spectrogram,
- aperiodicity=input_feature.aperiodicity,
- mfcc=out.mfcc,
- voiced=out.voiced,
- )
-
- fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
- spectrogram = pysptk.mc2sp(
- out.mfcc,
- alpha=self._param.acoustic_feature_param.alpha,
- fftlen=fftlen,
- )
-
- out = AcousticFeature(
- f0=out.f0,
- spectrogram=spectrogram,
- aperiodicity=out.aperiodicity,
- mfcc=out.mfcc,
- voiced=out.voiced,
- ).astype(numpy.float64)
- return out
-
- def convert_from_audio_path(self, input: Path, out_sampling_rate: Optional[int] = None):
- input = self._wave_process(str(input), test=True)
- input = self._feature_process(input, test=True)
- return self.convert_from_feature(input, out_sampling_rate)
-
- def convert_from_feature_path(self, input: Path, out_sampling_rate: Optional[int] = None):
- input = self._acoustic_feature_load_process(input, test=True)
- return self.convert_from_feature(input, out_sampling_rate)
-
- def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
- if out_sampling_rate is None:
- out_sampling_rate = self.config.dataset.param.voice_param.sample_rate
+ def __init__(
+ self,
+ acoustic_converter: AcousticConverter,
+ super_resolution: SuperResolution,
+ output_sampling_rate: int = None,
+ ):
+ if output_sampling_rate is None:
+ output_sampling_rate = super_resolution.config.dataset.param.voice_param.sample_rate
- out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate)
- out = pyworld.synthesize(
- f0=out.f0.ravel(),
- spectrogram=out.spectrogram,
- aperiodicity=out.aperiodicity,
- fs=out_sampling_rate,
- frame_period=self._param.acoustic_feature_param.frame_period,
- )
- return Wave(out, sampling_rate=out_sampling_rate)
+ self.acoustic_converter = acoustic_converter
+ self.super_resolution = super_resolution
+ self.output_sampling_rate = output_sampling_rate
- def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
- return self.convert_from_audio_path(voice_path, out_sampling_rate)
+ def convert_from_wave_path(self, wave_path: str):
+ w_in = self.acoustic_converter._wave_process(wave_path)
+ f_in = self.acoustic_converter._feature_process(w_in)
+ f_low = self.acoustic_converter.convert_to_feature(f_in)
+ s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
+ wave = self.super_resolution(s_high, acoustic_feature=f_low, sampling_rate=self.output_sampling_rate)
+ return wave
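
Usage note: after this change, VoiceChanger no longer loads a model or feature statistics itself; it only composes a prebuilt AcousticConverter with a SuperResolution model. Below is a minimal sketch of how the refactored class could be driven. It is an assumption-laden example, not part of the commit: the construction of the two converters is outside this diff, so they are left as parameters, and the helper name change_voice is hypothetical.

from pathlib import Path

from become_yukarin.voice_changer import VoiceChanger


def change_voice(acoustic_converter, super_resolution, wave_path: Path):
    # Compose the two prebuilt models. When output_sampling_rate is omitted,
    # VoiceChanger falls back to the sample rate in the super-resolution config.
    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
    )
    # convert_from_wave_path chains wave loading, acoustic feature extraction,
    # acoustic conversion, and spectrogram super-resolution, and returns the
    # resulting wave.
    return voice_changer.convert_from_wave_path(str(wave_path))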