summary | refs | log | tree | commit | diff
path: root/become_yukarin
diff options
context:
space:
mode:
author: Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>, 2017-11-14 23:49:37 +0900
committer: Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>, 2017-11-14 23:49:37 +0900
commit: 1ad9c9a59a6ccc9fbb9d27d17c95c23d3cbabcc3 (patch)
tree: dbdee215d2ac9aa3a5dfacdb06fcf4a2d25f42da /become_yukarin
parent: 725dfcb2977ced3f374f3c92486a9a24a40b1bff (diff)
[WIP] add f0
Diffstat (limited to 'become_yukarin')
-rw-r--r-- become_yukarin/config.py 3
-rw-r--r-- become_yukarin/data_struct.py 13
-rw-r--r-- become_yukarin/dataset/dataset.py 38
-rw-r--r-- become_yukarin/voice_changer.py 10
4 files changed, 49 insertions, 15 deletions
diff --git a/become_yukarin/config.py b/become_yukarin/config.py
index 50694b7..f74c83e 100644
--- a/become_yukarin/config.py
+++ b/become_yukarin/config.py
@@ -1,5 +1,6 @@
import json
from pathlib import Path
+from typing import List
from typing import NamedTuple
from typing import Union
@@ -14,6 +15,7 @@ class DatasetConfig(NamedTuple):
input_var_path: Path
target_mean_path: Path
target_var_path: Path
+ features: List[str]
seed: int
num_test: int
@@ -80,6 +82,7 @@ def create_from_json(s: Union[str, Path]):
input_var_path=Path(d['dataset']['input_var_path']).expanduser(),
target_mean_path=Path(d['dataset']['target_mean_path']).expanduser(),
target_var_path=Path(d['dataset']['target_var_path']).expanduser(),
+ features=d['dataset']['features'],
seed=d['dataset']['seed'],
num_test=d['dataset']['num_test'],
),
diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py
index 63043e2..7b220f0 100644
--- a/become_yukarin/data_struct.py
+++ b/become_yukarin/data_struct.py
@@ -1,5 +1,7 @@
from typing import NamedTuple
+import pyworld
+
import numpy
@@ -23,3 +25,14 @@ class AcousticFeature(NamedTuple):
mfcc=self.mfcc.astype(dtype),
voiced=self.mfcc.astype(dtype),
)
+
# get_sizes: static helper that returns a dict mapping each AcousticFeature
# field name to an integer size. Elsewhere in this patch the result is handed
# to DecodeFeatureProcess, which uses the values as per-feature column widths.
#   - f0, voiced: 1
#   - spectrogram, aperiodicity: fft_size // 2 + 1, where fft_size is
#     pyworld.get_cheaptrick_fft_size(fs=sampling_rate)
#   - mfcc: order + 1
+ @staticmethod
+ def get_sizes(sampling_rate: int, order: int):
+ fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate)
+ return dict(
+ f0=1,
+ spectrogram=fft_size // 2 + 1,
+ aperiodicity=fft_size // 2 + 1,
+ mfcc=order + 1,
+ voiced=1,
+ )
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 93619e3..09931b3 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -1,5 +1,6 @@
import typing
from abc import ABCMeta, abstractmethod
+from collections import defaultdict
from pathlib import Path
from typing import Callable
from typing import Dict
@@ -119,8 +120,10 @@ class AcousticFeatureNormalizeProcess(BaseDataProcess):
self._var = var
def __call__(self, data: AcousticFeature, test):
+ f0 = (data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0)
+ f0[~data.voiced] = 0
return AcousticFeature(
- f0=(data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0),
+ f0=f0,
spectrogram=(data.spectrogram - self._mean.spectrogram) / numpy.sqrt(self._var.spectrogram),
aperiodicity=(data.aperiodicity - self._mean.aperiodicity) / numpy.sqrt(self._var.aperiodicity),
mfcc=(data.mfcc - self._mean.mfcc) / numpy.sqrt(self._var.mfcc),
@@ -134,8 +137,10 @@ class AcousticFeatureDenormalizeProcess(BaseDataProcess):
self._var = var
def __call__(self, data: AcousticFeature, test):
+ f0 = data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0
+ f0[~data.voiced] = 0
return AcousticFeature(
- f0=data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0,
+ f0=f0,
spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram,
aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity,
mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc,
@@ -148,24 +153,33 @@ class EncodeFeatureProcess(BaseDataProcess):
self._targets = targets
# Joins the arrays named in self._targets (looked up as attributes of `data`)
# and returns the transposed result. This commit switches the join to axis=1.
# NOTE(review): axis=1 presumes every selected feature is 2-D and that all of
# them share the same first dimension — confirm against the feature loader.
def __call__(self, data: AcousticFeature, test):
- feature = numpy.concatenate([getattr(data, t) for t in self._targets])
+ feature = numpy.concatenate([getattr(data, t) for t in self._targets], axis=1)
feature = feature.T
return feature
class DecodeFeatureProcess(BaseDataProcess):
+ """
+ Inverse of EncodeFeatureProcess: split a concatenated feature matrix back
+ into an AcousticFeature.
+
+ targets: feature names, in the order their columns appear once the input
+ has been transposed.
+ sizes: column width for every feature name listed in targets.
+
+ Fields of AcousticFeature that are not listed in targets are set to numpy.nan.
+ """
- def __init__(self, targets: List[str]):
+ def __init__(self, targets: List[str], sizes: Dict[str, int]):
+ assert all(t in sizes for t in targets)
self._targets = targets
+ self._sizes = sizes
def __call__(self, data: numpy.ndarray, test):
- # TODO: implement for other features
data = data.T
+
+ lens = [self._sizes[t] for t in self._targets]
+ assert data.shape[1] == sum(lens)
+
+ # Slice bounds must be cumulative column offsets. Using the raw sizes as
+ # bounds is only correct for a single target; with several targets every
+ # feature after the first would be cut from the wrong columns.
+ ends = numpy.cumsum(lens)
+ d = defaultdict(lambda: numpy.nan, **{
+ t: data[:, end - size:end]
+ for t, size, end in zip(self._targets, lens, ends)
+ })
return AcousticFeature(
- f0=numpy.nan,
- spectrogram=numpy.nan,
- aperiodicity=numpy.nan,
- mfcc=data,
- voiced=numpy.nan,
+ f0=d['f0'],
+ spectrogram=d['spectrogram'],
+ aperiodicity=d['aperiodicity'],
+ mfcc=d['mfcc'],
+ voiced=d['voiced'],
)
@@ -210,13 +224,13 @@ def create(config: DatasetConfig):
LambdaProcess(lambda d, test: d['input_path']),
acoustic_feature_load_process,
AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var),
- EncodeFeatureProcess(['mfcc']),
+ EncodeFeatureProcess(config.features),
]),
target=ChainProcess([
LambdaProcess(lambda d, test: d['target_path']),
acoustic_feature_load_process,
AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var),
- EncodeFeatureProcess(['mfcc']),
+ EncodeFeatureProcess(config.features),
]),
)),
ShapeAlignProcess(),
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index e40069c..d6d39c6 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -14,8 +14,8 @@ from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
from become_yukarin.dataset.dataset import AcousticFeatureProcess
-from become_yukarin.dataset.dataset import EncodeFeatureProcess
from become_yukarin.dataset.dataset import DecodeFeatureProcess
+from become_yukarin.dataset.dataset import EncodeFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess
from become_yukarin.model import create as create_model
@@ -54,8 +54,12 @@ class VoiceChanger(object):
var=target_var,
)
- self._encode_feature = EncodeFeatureProcess(['mfcc'])
- self._decode_feature = DecodeFeatureProcess(['mfcc'])
+ feature_sizes = AcousticFeature.get_sizes(
+ sampling_rate=param.voice_param.sample_rate,
+ order=param.acoustic_feature_param.order,
+ )
+ self._encode_feature = EncodeFeatureProcess(config.dataset.features)
+ self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes)
def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
input = input_wave = self._wave_process(str(voice_path), test=True)