diff options
| author | Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp> | 2017-11-14 23:49:37 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp> | 2017-11-14 23:49:37 +0900 |
| commit | 1ad9c9a59a6ccc9fbb9d27d17c95c23d3cbabcc3 (patch) | |
| tree | dbdee215d2ac9aa3a5dfacdb06fcf4a2d25f42da /become_yukarin | |
| parent | 725dfcb2977ced3f374f3c92486a9a24a40b1bff (diff) | |
[WIP] add f0
Diffstat (limited to 'become_yukarin')
| -rw-r--r-- | become_yukarin/config.py | 3 | ||||
| -rw-r--r-- | become_yukarin/data_struct.py | 13 | ||||
| -rw-r--r-- | become_yukarin/dataset/dataset.py | 38 | ||||
| -rw-r--r-- | become_yukarin/voice_changer.py | 10 |
4 files changed, 49 insertions, 15 deletions
diff --git a/become_yukarin/config.py b/become_yukarin/config.py
index 50694b7..f74c83e 100644
--- a/become_yukarin/config.py
+++ b/become_yukarin/config.py
@@ -1,5 +1,6 @@
 import json
 from pathlib import Path
+from typing import List
 from typing import NamedTuple
 from typing import Union
@@ -14,6 +15,7 @@ class DatasetConfig(NamedTuple):
     input_var_path: Path
     target_mean_path: Path
     target_var_path: Path
+    features: List[str]
     seed: int
     num_test: int
@@ -80,6 +82,7 @@ def create_from_json(s: Union[str, Path]):
         input_var_path=Path(d['dataset']['input_var_path']).expanduser(),
         target_mean_path=Path(d['dataset']['target_mean_path']).expanduser(),
         target_var_path=Path(d['dataset']['target_var_path']).expanduser(),
+        features=d['dataset']['features'],
         seed=d['dataset']['seed'],
         num_test=d['dataset']['num_test'],
     ),
diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py
index 63043e2..7b220f0 100644
--- a/become_yukarin/data_struct.py
+++ b/become_yukarin/data_struct.py
@@ -1,5 +1,7 @@
 from typing import NamedTuple
+import pyworld
+
 import numpy
@@ -23,3 +25,14 @@ class AcousticFeature(NamedTuple):
         mfcc=self.mfcc.astype(dtype),
         voiced=self.mfcc.astype(dtype),
     )
+
+    @staticmethod
+    def get_sizes(sampling_rate: int, order: int):
+        fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate)
+        return dict(
+            f0=1,
+            spectrogram=fft_size // 2 + 1,
+            aperiodicity=fft_size // 2 + 1,
+            mfcc=order + 1,
+            voiced=1,
+        )
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 93619e3..09931b3 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -1,5 +1,6 @@
 import typing
 from abc import ABCMeta, abstractmethod
+from collections import defaultdict
 from pathlib import Path
 from typing import Callable
 from typing import Dict
@@ -119,8 +120,10 @@ class AcousticFeatureNormalizeProcess(BaseDataProcess):
         self._var = var

     def __call__(self, data: AcousticFeature, test):
+        f0 = (data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0)
+        f0[~data.voiced] = 0
         return AcousticFeature(
-            f0=(data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0),
+            f0=f0,
             spectrogram=(data.spectrogram - self._mean.spectrogram) / numpy.sqrt(self._var.spectrogram),
             aperiodicity=(data.aperiodicity - self._mean.aperiodicity) / numpy.sqrt(self._var.aperiodicity),
             mfcc=(data.mfcc - self._mean.mfcc) / numpy.sqrt(self._var.mfcc),
@@ -134,8 +137,10 @@ class AcousticFeatureDenormalizeProcess(BaseDataProcess):
         self._var = var

     def __call__(self, data: AcousticFeature, test):
+        f0 = data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0
+        f0[~data.voiced] = 0
         return AcousticFeature(
-            f0=data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0,
+            f0=f0,
             spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram,
             aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity,
             mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc,
@@ -148,24 +153,33 @@ class EncodeFeatureProcess(BaseDataProcess):
         self._targets = targets

     def __call__(self, data: AcousticFeature, test):
-        feature = numpy.concatenate([getattr(data, t) for t in self._targets])
+        feature = numpy.concatenate([getattr(data, t) for t in self._targets], axis=1)
         feature = feature.T
         return feature


 class DecodeFeatureProcess(BaseDataProcess):
-    def __init__(self, targets: List[str]):
+    def __init__(self, targets: List[str], sizes: Dict[str, int]):
+        assert all(t in sizes for t in targets)
         self._targets = targets
+        self._sizes = sizes

     def __call__(self, data: numpy.ndarray, test):
-        # TODO: implement for other features
         data = data.T
+
+        lens = [self._sizes[t] for t in self._targets]
+        assert data.shape[1] == sum(lens)
+
+        d = defaultdict(lambda: numpy.nan, **{
+            t: data[:, bef:aft]
+            for t, bef, aft in zip(self._targets, [0] + lens[:-1], lens)
+        })
         return AcousticFeature(
-            f0=numpy.nan,
-            spectrogram=numpy.nan,
-            aperiodicity=numpy.nan,
-            mfcc=data,
-            voiced=numpy.nan,
+            f0=d['f0'],
+            spectrogram=d['spectrogram'],
+            aperiodicity=d['aperiodicity'],
+            mfcc=d['mfcc'],
+            voiced=d['voiced'],
         )
@@ -210,13 +224,13 @@ def create(config: DatasetConfig):
             LambdaProcess(lambda d, test: d['input_path']),
             acoustic_feature_load_process,
             AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var),
-            EncodeFeatureProcess(['mfcc']),
+            EncodeFeatureProcess(config.features),
         ]),
         target=ChainProcess([
             LambdaProcess(lambda d, test: d['target_path']),
             acoustic_feature_load_process,
             AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var),
-            EncodeFeatureProcess(['mfcc']),
+            EncodeFeatureProcess(config.features),
         ]),
     )),
     ShapeAlignProcess(),
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index e40069c..d6d39c6 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -14,8 +14,8 @@ from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
 from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
 from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
 from become_yukarin.dataset.dataset import AcousticFeatureProcess
-from become_yukarin.dataset.dataset import EncodeFeatureProcess
 from become_yukarin.dataset.dataset import DecodeFeatureProcess
+from become_yukarin.dataset.dataset import EncodeFeatureProcess
 from become_yukarin.dataset.dataset import WaveFileLoadProcess
 from become_yukarin.model import create as create_model
@@ -54,8 +54,12 @@ class VoiceChanger(object):
             var=target_var,
         )

-        self._encode_feature = EncodeFeatureProcess(['mfcc'])
-        self._decode_feature = DecodeFeatureProcess(['mfcc'])
+        feature_sizes = AcousticFeature.get_sizes(
+            sampling_rate=param.voice_param.sample_rate,
+            order=param.acoustic_feature_param.order,
+        )
+        self._encode_feature = EncodeFeatureProcess(config.dataset.features)
+        self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)

     def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
         input = input_wave = self._wave_process(str(voice_path), test=True)
