diff options
| -rw-r--r-- | become_yukarin/__init__.py | 2 | ||||
| -rw-r--r-- | become_yukarin/config.py | 10 | ||||
| -rw-r--r-- | become_yukarin/data_struct.py | 8 | ||||
| -rw-r--r-- | become_yukarin/dataset/dataset.py | 51 | ||||
| -rw-r--r-- | become_yukarin/voice_changer.py | 97 | ||||
| -rw-r--r-- | scripts/extract_acoustic_feature.py | 10 |
6 files changed, 153 insertions, 25 deletions
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py index b54083d..81bea87 100644 --- a/become_yukarin/__init__.py +++ b/become_yukarin/__init__.py @@ -1,2 +1,4 @@ +from . import config from . import dataset from . import param +from .voice_changer import VoiceChanger diff --git a/become_yukarin/config.py b/become_yukarin/config.py index d00f179..ecfcfab 100644 --- a/become_yukarin/config.py +++ b/become_yukarin/config.py @@ -72,10 +72,10 @@ def create_from_json(s: Union[str, Path]): param=Param(), input_glob=d['dataset']['input_glob'], target_glob=d['dataset']['target_glob'], - input_mean_path=Path(d['dataset']['input_mean']), - input_var_path=Path(d['dataset']['input_var']), - target_mean_path=Path(d['dataset']['target_mean']), - target_var_path=Path(d['dataset']['target_var']), + input_mean_path=Path(d['dataset']['input_mean_path']).expanduser(), + input_var_path=Path(d['dataset']['input_var_path']).expanduser(), + target_mean_path=Path(d['dataset']['target_mean_path']).expanduser(), + target_var_path=Path(d['dataset']['target_var_path']).expanduser(), seed=d['dataset']['seed'], num_test=d['dataset']['num_test'], ), @@ -93,6 +93,6 @@ def create_from_json(s: Union[str, Path]): gpu=d['train']['gpu'], log_iteration=d['train']['log_iteration'], snapshot_iteration=d['train']['snapshot_iteration'], - output=Path(d['train']['output']), + output=Path(d['train']['output']).expanduser(), ), ) diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py index c215ecc..f0601a5 100644 --- a/become_yukarin/data_struct.py +++ b/become_yukarin/data_struct.py @@ -13,3 +13,11 @@ class AcousticFeature(NamedTuple): spectrogram: numpy.ndarray aperiodicity: numpy.ndarray mfcc: numpy.ndarray + + def astype(self, dtype): + return AcousticFeature( + f0=self.f0.astype(dtype), + spectrogram=self.spectrogram.astype(dtype), + aperiodicity=self.aperiodicity.astype(dtype), + mfcc=self.mfcc.astype(dtype), + ) diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py index 39331c3..7a6ce08 100644 --- a/become_yukarin/dataset/dataset.py +++ b/become_yukarin/dataset/dataset.py @@ -61,21 +61,23 @@ class SplitProcess(BaseDataProcess): class WaveFileLoadProcess(BaseDataProcess): - def __init__(self, sample_rate: int, top_db: float): + def __init__(self, sample_rate: int, top_db: float, dtype=numpy.float32): self._sample_rate = sample_rate self._top_db = top_db + self._dtype = dtype def __call__(self, data: str, test): - wave = librosa.core.load(data, sr=self._sample_rate)[0] + wave = librosa.core.load(data, sr=self._sample_rate, dtype=self._dtype)[0] wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db)) return Wave(wave, self._sample_rate) class AcousticFeatureProcess(BaseDataProcess): - def __init__(self, frame_period, order, alpha): + def __init__(self, frame_period, order, alpha, dtype=numpy.float32): self._frame_period = frame_period self._order = order self._alpha = alpha + self._dtype = dtype def __call__(self, data: Wave, test): x = data.wave.astype(numpy.float64) @@ -87,10 +89,10 @@ class AcousticFeatureProcess(BaseDataProcess): aperiodicity = pyworld.d4c(x, f0, t, fs) mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha) return AcousticFeature( - f0=f0, - spectrogram=spectrogram, - aperiodicity=aperiodicity, - mfcc=mfcc, + f0=f0.astype(self._dtype), + spectrogram=spectrogram.astype(self._dtype), + aperiodicity=aperiodicity.astype(self._dtype), + mfcc=mfcc.astype(self._dtype), ) @@ -122,7 +124,21 @@ class AcousticFeatureNormalizeProcess(BaseDataProcess): ) -class ReshapeFeatureProcess(BaseDataProcess): +class AcousticFeatureDenormalizeProcess(BaseDataProcess): + def __init__(self, mean: AcousticFeature, var: AcousticFeature): + self._mean = mean + self._var = var + + def __call__(self, data: AcousticFeature, test): + return AcousticFeature( + f0=data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0, + spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram, + aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity, + mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc, + ) + + +class EncodeFeatureProcess(BaseDataProcess): def __init__(self, targets: List[str]): self._targets = targets @@ -132,6 +148,21 @@ class ReshapeFeatureProcess(BaseDataProcess): return feature +class DecodeFeatureProcess(BaseDataProcess): + def __init__(self, targets: List[str]): + self._targets = targets + + def __call__(self, data: numpy.ndarray, test): + # TODO: implement for other features + data = data.T + return AcousticFeature( + f0=numpy.nan, + spectrogram=numpy.nan, + aperiodicity=numpy.nan, + mfcc=data, + ) + + class ShapeAlignProcess(BaseDataProcess): def __call__(self, data, test): data1, data2 = data['input'], data['target'] @@ -173,13 +204,13 @@ def create(config: DatasetConfig): LambdaProcess(lambda d, test: d['input_path']), acoustic_feature_load_process, AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var), - ReshapeFeatureProcess(['mfcc']), + EncodeFeatureProcess(['mfcc']), ]), target=ChainProcess([ LambdaProcess(lambda d, test: d['target_path']), acoustic_feature_load_process, AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var), - ReshapeFeatureProcess(['mfcc']), + EncodeFeatureProcess(['mfcc']), ]), )), ShapeAlignProcess(), diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py new file mode 100644 index 0000000..2ff1b8e --- /dev/null +++ b/become_yukarin/voice_changer.py @@ -0,0 +1,97 @@ +from functools import partial +from pathlib import Path +from typing import Optional + +import chainer +import numpy +import pysptk +import pyworld + +from become_yukarin.config import Config +from become_yukarin.data_struct import AcousticFeature +from become_yukarin.data_struct import Wave +from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess +from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess +from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess +from become_yukarin.dataset.dataset import AcousticFeatureProcess +from become_yukarin.dataset.dataset import EncodeFeatureProcess +from become_yukarin.dataset.dataset import DecodeFeatureProcess +from become_yukarin.dataset.dataset import WaveFileLoadProcess +from become_yukarin.model import create as create_model + + +class VoiceChanger(object): + def __init__(self, config: Config, model_path: Path): + self.config = config + self.model_path = model_path + + self.model = model = create_model(config.model) + chainer.serializers.load_npz(str(model_path), model) + + self._param = param = config.dataset.param + self._wave_process = WaveFileLoadProcess( + sample_rate=param.voice_param.sample_rate, + top_db=param.voice_param.top_db, + ) + self._feature_process = AcousticFeatureProcess( + frame_period=param.acoustic_feature_param.frame_period, + order=param.acoustic_feature_param.order, + alpha=param.acoustic_feature_param.alpha, + ) + + _acoustic_feature_load_process = AcousticFeatureLoadProcess() + + input_mean = _acoustic_feature_load_process(config.dataset.input_mean_path, test=True) + input_var = _acoustic_feature_load_process(config.dataset.input_var_path, test=True) + target_mean = _acoustic_feature_load_process(config.dataset.target_mean_path, test=True) + target_var = _acoustic_feature_load_process(config.dataset.target_var_path, test=True) + self._feature_normalize = AcousticFeatureNormalizeProcess( + mean=input_mean, + var=input_var, + ) + self._feature_denormalize = AcousticFeatureDenormalizeProcess( + mean=target_mean, + var=target_var, + ) + + self._encode_feature = EncodeFeatureProcess(['mfcc']) + self._decode_feature = DecodeFeatureProcess(['mfcc']) + + def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None): + input = input_wave = self._wave_process(str(voice_path), test=True) + if out_sampling_rate is None: + out_sampling_rate = input_wave.sampling_rate + + input = input_feature = self._feature_process(input, test=True) + input = self._feature_normalize(input, test=True) + input = self._encode_feature(input, test=True) + + converter = partial(chainer.dataset.convert.concat_examples, padding=0) + inputs = converter([input]) + + out = self.model(inputs).data[0] + out = self._decode_feature(out, test=True) + out = self._feature_denormalize(out, test=True) + + fftlen = pyworld.get_cheaptrick_fft_size(input_wave.sampling_rate) + spectrogram = pysptk.mc2sp( + out.mfcc, + alpha=self._param.acoustic_feature_param.alpha, + fftlen=fftlen, + ) + + out = AcousticFeature( + f0=input_feature.f0, + spectrogram=spectrogram, + aperiodicity=input_feature.aperiodicity, + mfcc=out.mfcc, + ).astype(numpy.float64) + out = pyworld.synthesize( + f0=out.f0, + spectrogram=out.spectrogram, + aperiodicity=out.aperiodicity, + fs=out_sampling_rate, + frame_period=self._param.acoustic_feature_param.frame_period, + ) + + return Wave(out, sampling_rate=out_sampling_rate) diff --git a/scripts/extract_acoustic_feature.py b/scripts/extract_acoustic_feature.py index b280db6..ccc8d66 100644 --- a/scripts/extract_acoustic_feature.py +++ b/scripts/extract_acoustic_feature.py @@ -73,16 +73,6 @@ def generate_feature(path1, path2): aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity) mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc) - # convert type - f0_1 = f0_1.astype(numpy.float32) - f0_2 = f0_2.astype(numpy.float32) - spectrogram_1 = spectrogram_1.astype(numpy.float32) - spectrogram_2 = spectrogram_2.astype(numpy.float32) - aperiodicity_1 = aperiodicity_1.astype(numpy.float32) - aperiodicity_2 = aperiodicity_2.astype(numpy.float32) - mfcc_1 = mfcc_1.astype(numpy.float32) - mfcc_2 = mfcc_2.astype(numpy.float32) - # save path = Path(arguments.output1_directory, path1.stem + '.npy') numpy.save(path.absolute(), dict(f0=f0_1, spectrogram=spectrogram_1, aperiodicity=aperiodicity_1, mfcc=mfcc_1)) |
