From 1ad9c9a59a6ccc9fbb9d27d17c95c23d3cbabcc3 Mon Sep 17 00:00:00 2001 From: Hiroshiba Kazuyuki Date: Tue, 14 Nov 2017 23:49:37 +0900 Subject: [WIP] add f0 --- become_yukarin/config.py | 3 +++ become_yukarin/data_struct.py | 13 ++++++++++ become_yukarin/dataset/dataset.py | 38 ++++++++++++++++++++--------- become_yukarin/voice_changer.py | 10 +++++--- tests/__init__.py | 0 tests/test_dataset.py | 51 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 100 insertions(+), 15 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_dataset.py diff --git a/become_yukarin/config.py b/become_yukarin/config.py index 50694b7..f74c83e 100644 --- a/become_yukarin/config.py +++ b/become_yukarin/config.py @@ -1,5 +1,6 @@ import json from pathlib import Path +from typing import List from typing import NamedTuple from typing import Union @@ -14,6 +15,7 @@ class DatasetConfig(NamedTuple): input_var_path: Path target_mean_path: Path target_var_path: Path + features: List[str] seed: int num_test: int @@ -80,6 +82,7 @@ def create_from_json(s: Union[str, Path]): input_var_path=Path(d['dataset']['input_var_path']).expanduser(), target_mean_path=Path(d['dataset']['target_mean_path']).expanduser(), target_var_path=Path(d['dataset']['target_var_path']).expanduser(), + features=d['dataset']['features'], seed=d['dataset']['seed'], num_test=d['dataset']['num_test'], ), diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py index 63043e2..7b220f0 100644 --- a/become_yukarin/data_struct.py +++ b/become_yukarin/data_struct.py @@ -1,5 +1,7 @@ from typing import NamedTuple +import pyworld + import numpy @@ -23,3 +25,14 @@ class AcousticFeature(NamedTuple): mfcc=self.mfcc.astype(dtype), voiced=self.mfcc.astype(dtype), ) + + @staticmethod + def get_sizes(sampling_rate: int, order: int): + fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) + return dict( + f0=1, + spectrogram=fft_size // 2 + 1, + aperiodicity=fft_size // 2 + 1, + mfcc=order + 1, + voiced=1, + ) diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py index 93619e3..09931b3 100644 --- a/become_yukarin/dataset/dataset.py +++ b/become_yukarin/dataset/dataset.py @@ -1,5 +1,6 @@ import typing from abc import ABCMeta, abstractmethod +from collections import defaultdict from pathlib import Path from typing import Callable from typing import Dict @@ -119,8 +120,10 @@ class AcousticFeatureNormalizeProcess(BaseDataProcess): self._var = var def __call__(self, data: AcousticFeature, test): + f0 = (data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0) + f0[~data.voiced] = 0 return AcousticFeature( - f0=(data.f0 - self._mean.f0) / numpy.sqrt(self._var.f0), + f0=f0, spectrogram=(data.spectrogram - self._mean.spectrogram) / numpy.sqrt(self._var.spectrogram), aperiodicity=(data.aperiodicity - self._mean.aperiodicity) / numpy.sqrt(self._var.aperiodicity), mfcc=(data.mfcc - self._mean.mfcc) / numpy.sqrt(self._var.mfcc), @@ -134,8 +137,10 @@ class AcousticFeatureDenormalizeProcess(BaseDataProcess): self._var = var def __call__(self, data: AcousticFeature, test): + f0 = data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0 + f0[~data.voiced] = 0 return AcousticFeature( - f0=data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0, + f0=f0, spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram, aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity, mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc, @@ -148,24 +153,33 @@ class EncodeFeatureProcess(BaseDataProcess): self._targets = targets def __call__(self, data: AcousticFeature, test): - feature = numpy.concatenate([getattr(data, t) for t in self._targets]) + feature = numpy.concatenate([getattr(data, t) for t in self._targets], axis=1) feature = feature.T return feature class DecodeFeatureProcess(BaseDataProcess): - def __init__(self, targets: List[str]): + def __init__(self, targets: List[str], sizes: Dict[str, int]): + assert all(t in sizes for t in targets) self._targets = targets + self._sizes = sizes def __call__(self, data: numpy.ndarray, test): - # TODO: implement for other features data = data.T + + lens = [self._sizes[t] for t in self._targets] + assert data.shape[1] == sum(lens) + + d = defaultdict(lambda: numpy.nan, **{ + t: data[:, bef:aft] + for t, bef, aft in zip(self._targets, [0] + lens[:-1], lens) + }) return AcousticFeature( - f0=numpy.nan, - spectrogram=numpy.nan, - aperiodicity=numpy.nan, - mfcc=data, - voiced=numpy.nan, + f0=d['f0'], + spectrogram=d['spectrogram'], + aperiodicity=d['aperiodicity'], + mfcc=d['mfcc'], + voiced=d['voiced'], ) @@ -210,13 +224,13 @@ def create(config: DatasetConfig): LambdaProcess(lambda d, test: d['input_path']), acoustic_feature_load_process, AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var), - EncodeFeatureProcess(['mfcc']), + EncodeFeatureProcess(config.features), ]), target=ChainProcess([ LambdaProcess(lambda d, test: d['target_path']), acoustic_feature_load_process, AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var), - EncodeFeatureProcess(['mfcc']), + EncodeFeatureProcess(config.features), ]), )), ShapeAlignProcess(), diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py index e40069c..d6d39c6 100644 --- a/become_yukarin/voice_changer.py +++ b/become_yukarin/voice_changer.py @@ -14,8 +14,8 @@ from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess from become_yukarin.dataset.dataset import AcousticFeatureProcess -from become_yukarin.dataset.dataset import EncodeFeatureProcess from become_yukarin.dataset.dataset import DecodeFeatureProcess +from become_yukarin.dataset.dataset import EncodeFeatureProcess from become_yukarin.dataset.dataset import WaveFileLoadProcess from become_yukarin.model import create as create_model @@ -54,8 +54,12 @@ class VoiceChanger(object): var=target_var, ) - self._encode_feature = EncodeFeatureProcess(['mfcc']) - self._decode_feature = DecodeFeatureProcess(['mfcc']) + feature_sizes = AcousticFeature.get_sizes( + sampling_rate=param.voice_param.sample_rate, + order=param.acoustic_feature_param.order, + ) + self._encode_feature = EncodeFeatureProcess(config.dataset.features) + self._decode_feature = DecodeFeatureProcess(config.dataset.features, feature_sizes) def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None): input = input_wave = self._wave_process(str(voice_path), test=True) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000..5283f7d --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,51 @@ +import unittest + +import numpy +from become_yukarin.dataset import dataset + + +class TestDataset(unittest.TestCase): + def setUp(self): + self.sample_rate = 24000 + self.len_time = len_time = 100 + self.fft_size = fft_size = 1024 + self.order = order = 59 + self.dummy_feature = dataset.AcousticFeature( + f0=numpy.arange(len_time).reshape((len_time, -1)), + spectrogram=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), + aperiodicity=numpy.arange(len_time * (fft_size // 2 + 1)).reshape((len_time, -1)), + mfcc=numpy.arange(len_time * (order + 1)).reshape((len_time, -1)), + voiced=(numpy.arange(len_time) % 2 == 1).reshape((len_time, -1)), + ) + self.feature_sizes = dataset.AcousticFeature.get_sizes( + sampling_rate=self.sample_rate, + order=self.order, + ) + + def test_encode_decode_feature(self): + encode_feature = dataset.EncodeFeatureProcess(['mfcc']) + decode_feature = dataset.DecodeFeatureProcess(['mfcc'], self.feature_sizes) + e = encode_feature(self.dummy_feature, test=True) + d = decode_feature(e, test=True) + self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) + + def test_encode_decode_feature2(self): + encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) + decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) + e = encode_feature(self.dummy_feature, test=True) + d = decode_feature(e, test=True) + self.assertTrue(numpy.all(self.dummy_feature.mfcc == d.mfcc)) + self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) + + def test_encode_decode_feature3(self): + encode_feature = dataset.EncodeFeatureProcess(['mfcc', 'f0']) + decode_feature = dataset.DecodeFeatureProcess(['mfcc', 'f0'], self.feature_sizes) + e = encode_feature(self.dummy_feature, test=True) + e[0] = numpy.nan + d = decode_feature(e, test=True) + self.assertFalse(numpy.all(self.dummy_feature.mfcc == d.mfcc)) + self.assertTrue(numpy.all(self.dummy_feature.f0 == d.f0)) + + +if __name__ == '__main__': + unittest.main() -- cgit v1.2.3-70-g09d2