From 2be3f03adc5695f82c6ab86da780108f786ed014 Mon Sep 17 00:00:00 2001
From: Hiroshiba Kazuyuki
Date: Sun, 14 Jan 2018 07:40:07 +0900
Subject: Super-resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 become_yukarin/config.py             | 170 --------------------
 become_yukarin/config/__init__.py    |   2 +
 become_yukarin/config/config.py      | 170 ++++++++++++++++++++
 become_yukarin/config/sr_config.py   | 122 +++++++++++++++
 become_yukarin/data_struct.py        |  10 ++
 become_yukarin/dataset/__init__.py   |   1 +
 become_yukarin/dataset/dataset.py    | 125 ++++++++++++++-
 become_yukarin/model.py              | 292 -----------------------------------
 become_yukarin/model/__init__.py     |   2 +
 become_yukarin/model/model.py        | 292 +++++++++++++++++++++++++++++++++++
 become_yukarin/model/sr_model.py     | 119 ++++++++++++++
 become_yukarin/param.py              |   2 +-
 become_yukarin/updater.py            | 106 -------------
 become_yukarin/updater/__init__.py   |   2 +
 become_yukarin/updater/sr_updater.py |  69 +++++++++
 become_yukarin/updater/updater.py    | 103 ++++++++++++
 become_yukarin/voice_changer.py      |   4 +-
 scripts/extract_acoustic_feature.py  |  13 --
 train.py                             |   7 +-
 train_sr.py                          |  98 ++++++++++++
 20 files changed, 1114 insertions(+), 595 deletions(-)
 delete mode 100644 become_yukarin/config.py
 create mode 100644 become_yukarin/config/__init__.py
 create mode 100644 become_yukarin/config/config.py
 create mode 100644 become_yukarin/config/sr_config.py
 delete mode 100644 become_yukarin/model.py
 create mode 100644 become_yukarin/model/__init__.py
 create mode 100644 become_yukarin/model/model.py
 create mode 100644 become_yukarin/model/sr_model.py
 delete mode 100644 become_yukarin/updater.py
 create mode 100644 become_yukarin/updater/__init__.py
 create mode 100644 become_yukarin/updater/sr_updater.py
 create mode 100644 become_yukarin/updater/updater.py
 create mode 100644 train_sr.py

diff --git a/become_yukarin/config.py b/become_yukarin/config.py
deleted file mode 100644
index 4ba953e..0000000
--- a/become_yukarin/config.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import json
-from pathlib import Path
-from typing import Dict
-from typing import List
-from typing import NamedTuple
-from typing import Optional
-from typing import Union
-
-from .param import Param
-
-
-class DatasetConfig(NamedTuple):
-    param: Param
-    input_glob: Path
-    target_glob: Path
-    input_mean_path: Path
-    input_var_path: Path
-    target_mean_path: Path
-    target_var_path: Path
-    features: List[str]
-    train_crop_size: int
-    input_global_noise: float
-    input_local_noise: float
-    target_global_noise: float
-    target_local_noise: float
-    seed: int
-    num_test: int
-
-
-class DiscriminatorModelConfig(NamedTuple):
-    in_channels: int
-    hidden_channels_list: List[int]
-
-
-class ModelConfig(NamedTuple):
-    in_channels: int
-    conv_bank_out_channels: int
-    conv_bank_k: int
-    max_pooling_k: int
-    conv_projections_hidden_channels: int
-    highway_layers: int
-    out_channels: int
-    out_size: int
-    aligner_out_time_length: int
-    disable_last_rnn: bool
-    enable_aligner: bool
-    discriminator: Optional[DiscriminatorModelConfig]
-
-
-class LossConfig(NamedTuple):
-    l1: float
-    predictor_fake: float
-    discriminator_true: float
-    discriminator_fake: float
-    discriminator_grad: float
-
-
-class TrainConfig(NamedTuple):
-    batchsize: int
-    gpu: int
-    log_iteration: int
-    snapshot_iteration: int
-
-
-class ProjectConfig(NamedTuple):
-    name: str
-    tags: List[str]
-
-
-class Config(NamedTuple):
-    dataset: DatasetConfig
-    model: ModelConfig
-    loss: LossConfig
-    train: TrainConfig
-    project: ProjectConfig
-
-    def save_as_json(self, path):
-        d = _namedtuple_to_dict(self)
-        json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path)
-
-
-def _default_path(o):
-    if isinstance(o, Path):
-        return str(o)
-    raise TypeError(repr(o) + " is not JSON serializable")
-
-
-def _namedtuple_to_dict(o: NamedTuple):
-    return {
-        k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v)
-        for k, v in o._asdict().items()
-    }
-
-
-def create_from_json(s: Union[str, Path]):
-    try:
-        d = json.loads(s)
-    except TypeError:
-        d = json.load(open(s))
-
-    backward_compatible(d)
-
-    if d['model']['discriminator'] is not None:
-        discriminator_model_config = DiscriminatorModelConfig(
-            in_channels=d['model']['discriminator']['in_channels'],
-            hidden_channels_list=d['model']['discriminator']['hidden_channels_list'],
-        )
-    else:
-        discriminator_model_config = None
-
-    return Config(
-        dataset=DatasetConfig(
-            param=Param(),
-            input_glob=Path(d['dataset']['input_glob']),
-            target_glob=Path(d['dataset']['target_glob']),
-            input_mean_path=Path(d['dataset']['input_mean_path']),
-            input_var_path=Path(d['dataset']['input_var_path']),
-            target_mean_path=Path(d['dataset']['target_mean_path']),
-            target_var_path=Path(d['dataset']['target_var_path']),
-            features=d['dataset']['features'],
-            train_crop_size=d['dataset']['train_crop_size'],
-            input_global_noise=d['dataset']['input_global_noise'],
-            input_local_noise=d['dataset']['input_local_noise'],
-            target_global_noise=d['dataset']['target_global_noise'],
-            target_local_noise=d['dataset']['target_local_noise'],
-            seed=d['dataset']['seed'],
-            num_test=d['dataset']['num_test'],
-        ),
-        model=ModelConfig(
-            in_channels=d['model']['in_channels'],
-            conv_bank_out_channels=d['model']['conv_bank_out_channels'],
-            conv_bank_k=d['model']['conv_bank_k'],
-            max_pooling_k=d['model']['max_pooling_k'],
-            conv_projections_hidden_channels=d['model']['conv_projections_hidden_channels'],
-            highway_layers=d['model']['highway_layers'],
-            out_channels=d['model']['out_channels'],
-            out_size=d['model']['out_size'],
-            aligner_out_time_length=d['model']['aligner_out_time_length'],
-            disable_last_rnn=d['model']['disable_last_rnn'],
-            enable_aligner=d['model']['enable_aligner'],
-            discriminator=discriminator_model_config,
-        ),
-        loss=LossConfig(
-            l1=d['loss']['l1'],
-            predictor_fake=d['loss']['predictor_fake'],
-            discriminator_true=d['loss']['discriminator_true'],
-            discriminator_fake=d['loss']['discriminator_fake'],
-            discriminator_grad=d['loss']['discriminator_grad'],
-        ),
-        train=TrainConfig(
-            batchsize=d['train']['batchsize'],
-            gpu=d['train']['gpu'],
-            log_iteration=d['train']['log_iteration'],
-            snapshot_iteration=d['train']['snapshot_iteration'],
-        ),
-        project=ProjectConfig(
-            name=d['project']['name'],
-            tags=d['project']['tags'],
-        )
-    )
-
-
-def backward_compatible(d: Dict):
-    if 'input_global_noise' not in d['dataset']:
-        d['dataset']['input_global_noise'] = d['dataset']['global_noise']
-        d['dataset']['input_local_noise'] = d['dataset']['local_noise']
-
-    if 'target_global_noise' not in d['dataset']:
-        d['dataset']['target_global_noise'] = d['dataset']['global_noise']
-        d['dataset']['target_local_noise'] = d['dataset']['local_noise']
diff --git a/become_yukarin/config/__init__.py b/become_yukarin/config/__init__.py
new file mode 100644
index 0000000..fb73169
--- /dev/null
+++ b/become_yukarin/config/__init__.py
@@ -0,0 +1,2 @@
+from . import config
+from . import sr_config
diff --git a/become_yukarin/config/config.py b/become_yukarin/config/config.py
new file mode 100644
index 0000000..ee1d68f
--- /dev/null
+++ b/become_yukarin/config/config.py
@@ -0,0 +1,170 @@
+import json
+from pathlib import Path
+from typing import Dict
+from typing import List
+from typing import NamedTuple
+from typing import Optional
+from typing import Union
+
+from become_yukarin.param import Param
+
+
+class DatasetConfig(NamedTuple):
+    param: Param
+    input_glob: Path
+    target_glob: Path
+    input_mean_path: Path
+    input_var_path: Path
+    target_mean_path: Path
+    target_var_path: Path
+    features: List[str]
+    train_crop_size: int
+    input_global_noise: float
+    input_local_noise: float
+    target_global_noise: float
+    target_local_noise: float
+    seed: int
+    num_test: int
+
+
+class DiscriminatorModelConfig(NamedTuple):
+    in_channels: int
+    hidden_channels_list: List[int]
+
+
+class ModelConfig(NamedTuple):
+    in_channels: int
+    conv_bank_out_channels: int
+    conv_bank_k: int
+    max_pooling_k: int
+    conv_projections_hidden_channels: int
+    highway_layers: int
+    out_channels: int
+    out_size: int
+    aligner_out_time_length: int
+    disable_last_rnn: bool
+    enable_aligner: bool
+    discriminator: Optional[DiscriminatorModelConfig]
+
+
+class LossConfig(NamedTuple):
+    l1: float
+    predictor_fake: float
+    discriminator_true: float
+    discriminator_fake: float
+    discriminator_grad: float
+
+
+class TrainConfig(NamedTuple):
+    batchsize: int
+    gpu: int
+    log_iteration: int
+    snapshot_iteration: int
+
+
+class ProjectConfig(NamedTuple):
+    name: str
+    tags: List[str]
+
+
+class Config(NamedTuple):
+    dataset: DatasetConfig
+    model: ModelConfig
+    loss: LossConfig
+    train: TrainConfig
+    project: ProjectConfig
+
+    def save_as_json(self, path):
+        d = _namedtuple_to_dict(self)
+        json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path)
+
+
+def _default_path(o):
+    if isinstance(o, Path):
+        return str(o)
+    raise TypeError(repr(o) + " is not JSON serializable")
+
+
+def _namedtuple_to_dict(o: NamedTuple):
+    return {
+        k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v)
+        for k, v in o._asdict().items()
+    }
+
+
+def create_from_json(s: Union[str, Path]):
+    try:
+        d = json.loads(s)
+    except TypeError:
+        d = json.load(open(s))
+
+    backward_compatible(d)
+
+    if d['model']['discriminator'] is not None:
+        discriminator_model_config = DiscriminatorModelConfig(
+            in_channels=d['model']['discriminator']['in_channels'],
+            hidden_channels_list=d['model']['discriminator']['hidden_channels_list'],
+        )
+    else:
+        discriminator_model_config = None
+
+    return Config(
+        dataset=DatasetConfig(
+            param=Param(),
+            input_glob=Path(d['dataset']['input_glob']),
+            target_glob=Path(d['dataset']['target_glob']),
+            input_mean_path=Path(d['dataset']['input_mean_path']),
+            input_var_path=Path(d['dataset']['input_var_path']),
+            target_mean_path=Path(d['dataset']['target_mean_path']),
+            target_var_path=Path(d['dataset']['target_var_path']),
+            features=d['dataset']['features'],
+            train_crop_size=d['dataset']['train_crop_size'],
+            input_global_noise=d['dataset']['input_global_noise'],
+            input_local_noise=d['dataset']['input_local_noise'],
+            target_global_noise=d['dataset']['target_global_noise'],
+            target_local_noise=d['dataset']['target_local_noise'],
+            seed=d['dataset']['seed'],
+            num_test=d['dataset']['num_test'],
+        ),
+        model=ModelConfig(
+            in_channels=d['model']['in_channels'],
+            conv_bank_out_channels=d['model']['conv_bank_out_channels'],
+            conv_bank_k=d['model']['conv_bank_k'],
+            max_pooling_k=d['model']['max_pooling_k'],
+            conv_projections_hidden_channels=d['model']['conv_projections_hidden_channels'],
+            highway_layers=d['model']['highway_layers'],
+            out_channels=d['model']['out_channels'],
+            out_size=d['model']['out_size'],
+            aligner_out_time_length=d['model']['aligner_out_time_length'],
+            disable_last_rnn=d['model']['disable_last_rnn'],
+            enable_aligner=d['model']['enable_aligner'],
+            discriminator=discriminator_model_config,
+        ),
+        loss=LossConfig(
+            l1=d['loss']['l1'],
+            predictor_fake=d['loss']['predictor_fake'],
+            discriminator_true=d['loss']['discriminator_true'],
+            discriminator_fake=d['loss']['discriminator_fake'],
+            discriminator_grad=d['loss']['discriminator_grad'],
+        ),
+        train=TrainConfig(
+            batchsize=d['train']['batchsize'],
+            gpu=d['train']['gpu'],
+            log_iteration=d['train']['log_iteration'],
+            snapshot_iteration=d['train']['snapshot_iteration'],
+        ),
+        project=ProjectConfig(
+            name=d['project']['name'],
+            tags=d['project']['tags'],
+        )
+    )
+
+
+def backward_compatible(d: Dict):
+    if 'input_global_noise' not in d['dataset']:
+        d['dataset']['input_global_noise'] = d['dataset']['global_noise']
+        d['dataset']['input_local_noise'] = d['dataset']['local_noise']
+
+    if 'target_global_noise' not in d['dataset']:
+        d['dataset']['target_global_noise'] = d['dataset']['global_noise']
+        d['dataset']['target_local_noise'] = d['dataset']['local_noise']
diff --git a/become_yukarin/config/sr_config.py b/become_yukarin/config/sr_config.py
new file mode 100644
index 0000000..93db424
--- /dev/null
+++ b/become_yukarin/config/sr_config.py
@@ -0,0 +1,122 @@
+import json
+from pathlib import Path
+from typing import Dict
+from typing import List
+from typing import NamedTuple
+from typing import Union
+
+from become_yukarin.param import Param
+
+
+class SRDatasetConfig(NamedTuple):
+    param: Param
+    input_glob: Path
+    train_crop_size: int
+    seed: int
+    num_test: int
+
+
+class SRModelConfig(NamedTuple):
+    in_channels: int
+    conv_bank_out_channels: int
+    conv_bank_k: int
+    max_pooling_k: int
+    conv_projections_hidden_channels: int
+    highway_layers: int
+    out_channels: int
+    out_size: int
+    aligner_out_time_length: int
+    disable_last_rnn: bool
+    enable_aligner: bool
+
+
+class SRLossConfig(NamedTuple):
+    mse: float
+    adversarial: float
+
+
+class SRTrainConfig(NamedTuple):
+    batchsize: int
+    gpu: int
+    log_iteration: int
+    snapshot_iteration: int
+
+
+class SRProjectConfig(NamedTuple):
+    name: str
+    tags: List[str]
+
+
+class SRConfig(NamedTuple):
+    dataset: SRDatasetConfig
+    model: SRModelConfig
+    loss: SRLossConfig
+    train: SRTrainConfig
+    project: SRProjectConfig
+
+    def save_as_json(self, path):
+        d = _namedtuple_to_dict(self)
+        json.dump(d, open(path, 'w'), indent=2, sort_keys=True, default=_default_path)
+
+
+def _default_path(o):
+    if isinstance(o, Path):
+        return str(o)
+    raise TypeError(repr(o) + " is not JSON serializable")
+
+
+def _namedtuple_to_dict(o: NamedTuple):
+    return {
+        k: v if not hasattr(v, '_asdict') else _namedtuple_to_dict(v)
+        for k, v in o._asdict().items()
+    }
+
+
+def create_from_json(s: Union[str, Path]):
+    try:
+        d = json.loads(s)
+    except TypeError:
+        d = json.load(open(s))
+
+    backward_compatible(d)
+
+    return SRConfig(
+        dataset=SRDatasetConfig(
+            param=Param(),
+            input_glob=Path(d['dataset']['input_glob']),
+            train_crop_size=d['dataset']['train_crop_size'],
+            seed=d['dataset']['seed'],
+            num_test=d['dataset']['num_test'],
+        ),
+        model=SRModelConfig(
+            in_channels=d['model']['in_channels'],
+            conv_bank_out_channels=d['model']['conv_bank_out_channels'],
+            conv_bank_k=d['model']['conv_bank_k'],
+            max_pooling_k=d['model']['max_pooling_k'],
+            conv_projections_hidden_channels=d['model']['conv_projections_hidden_channels'],
+            highway_layers=d['model']['highway_layers'],
+            out_channels=d['model']['out_channels'],
+            out_size=d['model']['out_size'],
+            aligner_out_time_length=d['model']['aligner_out_time_length'],
+            disable_last_rnn=d['model']['disable_last_rnn'],
+            enable_aligner=d['model']['enable_aligner'],
+        ),
+        loss=SRLossConfig(
+            mse=d['loss']['mse'],
+            adversarial=d['loss']['adversarial'],
+        ),
+        train=SRTrainConfig(
+            batchsize=d['train']['batchsize'],
+            gpu=d['train']['gpu'],
+            log_iteration=d['train']['log_iteration'],
+            snapshot_iteration=d['train']['snapshot_iteration'],
+        ),
+        project=SRProjectConfig(
+            name=d['project']['name'],
+            tags=d['project']['tags'],
+        )
+    )
+
+
+def backward_compatible(d: Dict):
+    pass
diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py
index 73b9b3b..78c8cf3 100644
--- a/become_yukarin/data_struct.py
+++ b/become_yukarin/data_struct.py
@@ -60,3 +60,13 @@ class AcousticFeature(NamedTuple):
             mfcc=order + 1,
             voiced=1,
         )
+
+
+class LowHighSpectrogramFeature(NamedTuple):
+    low: numpy.ndarray
+    high: numpy.ndarray
+
+    def validate(self):
+        assert self.low.ndim == 2
+        assert self.high.ndim == 2
+        assert self.low.shape == self.high.shape
diff --git a/become_yukarin/dataset/__init__.py b/become_yukarin/dataset/__init__.py
index 4606e7b..591eaa4 100644
--- a/become_yukarin/dataset/__init__.py
+++ b/become_yukarin/dataset/__init__.py
@@ -1,3 +1,4 @@
 from . import dataset
 from . import utility
 from .dataset import create
+from .dataset import create_sr
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 368073d..b0f9807 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -1,4 +1,5 @@
 import copy
+import glob
 import typing
 from abc import ABCMeta, abstractmethod
 from collections import defaultdict
@@ -13,8 +14,10 @@ import numpy
 import pysptk
 import pyworld
 
-from ..config import DatasetConfig
+from ..config.config import DatasetConfig
+from ..config.sr_config import SRDatasetConfig
 from ..data_struct import AcousticFeature
+from ..data_struct import LowHighSpectrogramFeature
 from ..data_struct import Wave
 
 
@@ -112,6 +115,35 @@
         return feature
 
 
+class LowHighSpectrogramFeatureProcess(BaseDataProcess):
+    def __init__(self, frame_period, order, alpha, dtype=numpy.float32):
+        self._acoustic_feature_process = AcousticFeatureProcess(
+            frame_period=frame_period,
+            order=order,
+            alpha=alpha,
+        )
+        self._dtype = dtype
+        self._alpha = alpha
+
+    def __call__(self, data: Wave, test):
+        acoustic_feature = self._acoustic_feature_process(data, test=True).astype_only_float(self._dtype)
+        high_spectrogram = acoustic_feature.spectrogram
+
+        fftlen = pyworld.get_cheaptrick_fft_size(data.sampling_rate)
+        low_spectrogram = pysptk.mc2sp(
+            acoustic_feature.mfcc,
+            alpha=self._alpha,
+            fftlen=fftlen,
+        )
+
+        feature = LowHighSpectrogramFeature(
+            low=low_spectrogram,
+            high=high_spectrogram,
+        )
+        feature.validate()
+        return feature
+
+
 class AcousticFeatureLoadProcess(BaseDataProcess):
     def __init__(self, validate=False):
         self._validate = validate
@@ -130,6 +162,21 @@
         return feature
 
 
+class LowHighSpectrogramFeatureLoadProcess(BaseDataProcess):
+    def __init__(self, validate=False):
+        self._validate = validate
+
+    def __call__(self, path: Path, test=None):
+        d = numpy.load(path.expanduser()).item()  # type: dict
+        feature = LowHighSpectrogramFeature(
+            low=d['low'],
+            high=d['high'],
+        )
+        if self._validate:
+            feature.validate()
+        return feature
+
+
 class AcousticFeatureSaveProcess(BaseDataProcess):
     def __init__(self, validate=False, ignore: List[str] = None):
         self._validate = validate
@@ -353,11 +400,6 @@
 
 
 def create(config: DatasetConfig):
-    import glob
-    input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))]))
-    target_paths = list(sorted([Path(p) for p in glob.glob(str(config.target_glob))]))
-    assert len(input_paths) == len(target_paths)
-
     acoustic_feature_load_process = AcousticFeatureLoadProcess()
     input_mean = acoustic_feature_load_process(config.input_mean_path, test=True)
     input_var = acoustic_feature_load_process(config.input_var_path, test=True)
@@ -433,7 +475,7 @@
         ]),
     )))
 
-    data_process_test = data_process_base
+    data_process_test = copy.deepcopy(data_process_base)
     if config.train_crop_size is not None:
         data_process_test.append(SplitProcess(dict(
             input=ChainProcess([
@@ -453,6 +495,10 @@
         ]),
     )))
 
+    input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))]))
+    target_paths = list(sorted([Path(p) for p in glob.glob(str(config.target_glob))]))
+    assert len(input_paths) == len(target_paths)
+
     num_test = config.num_test
     pairs = [
         dict(input_path=input_path, target_path=target_path)
@@ -468,3 +514,68 @@
         'test': DataProcessDataset(test_paths, data_process_test),
         'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test),
     }
+
+
+def create_sr(config: SRDatasetConfig):
+    data_process_base = ChainProcess([
+        LowHighSpectrogramFeatureLoadProcess(validate=True),
+        SplitProcess(dict(
+            input=LambdaProcess(lambda d, test: numpy.log(d.low)),
+            target=LambdaProcess(lambda d, test: numpy.log(d.high)),
+        )),
+    ])
+
+    data_process_train = copy.deepcopy(data_process_base)
+
+    # cropping
+    if config.train_crop_size is not None:
+        def add_seed():
+            return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 32), **d))
+
+        def padding(s):
+            return ChainProcess([
+                LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])),
+                RandomPaddingProcess(min_size=config.train_crop_size),
+            ])
+
+        def crop(s):
+            return ChainProcess([
+                LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])),
+                RandomCropProcess(crop_size=config.train_crop_size),
+            ])
+
+        data_process_train.append(ChainProcess([
+            add_seed(),
+            SplitProcess(dict(input=padding('input'), target=padding('target'))),
+            add_seed(),
+            SplitProcess(dict(input=crop('input'), target=crop('target'))),
+        ]))
+
+    data_process_test = copy.deepcopy(data_process_base)
+    if config.train_crop_size is not None:
+        data_process_test.append(SplitProcess(dict(
+            input=ChainProcess([
+                LambdaProcess(lambda d, test: d['input']),
+                LastPaddingProcess(min_size=config.train_crop_size),
+                FirstCropProcess(crop_size=config.train_crop_size),
+            ]),
+            target=ChainProcess([
+                LambdaProcess(lambda d, test: d['target']),
+                LastPaddingProcess(min_size=config.train_crop_size),
+                FirstCropProcess(crop_size=config.train_crop_size),
+            ]),
+        )))
+
+    input_paths = list(sorted([Path(p) for p in glob.glob(str(config.input_glob))]))
+
+    num_test = config.num_test
+    numpy.random.RandomState(config.seed).shuffle(input_paths)
+    train_paths = input_paths[num_test:]
+    test_paths = input_paths[:num_test]
+    train_for_evaluate_paths = train_paths[:num_test]
+
+    return {
+        'train': DataProcessDataset(train_paths, data_process_train),
+        'test': DataProcessDataset(test_paths, data_process_test),
+        'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process_test),
+    }
diff --git a/become_yukarin/model.py b/become_yukarin/model.py
deleted file mode 100644
index 8a6af14..0000000
--- a/become_yukarin/model.py
+++ /dev/null
@@ -1,292 +0,0 @@
-from functools import partial
-from typing import List
-
-import chainer
-
-from .config import DiscriminatorModelConfig
-from .config import ModelConfig
-
-
-class Convolution1D(chainer.links.ConvolutionND):
-    def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0,
-                 nobias=False, initialW=None, initial_bias=None,
-                 cover_all=False):
-        super().__init__(
-            ndim=1,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            ksize=ksize,
-            stride=stride,
-            pad=pad,
-            nobias=nobias,
-            initialW=initialW,
-            initial_bias=initial_bias,
-            cover_all=cover_all,
-        )
-
-
-class LegacyConvolution1D(chainer.links.Convolution2D):
-    def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0,
-                 nobias=False, initialW=None, initial_bias=None, **kwargs):
-        assert ksize is None or isinstance(ksize, int)
-        assert isinstance(stride, int)
-        assert isinstance(pad, int)
-        super().__init__(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            ksize=(ksize, 1),
-            stride=(stride, 1),
-            pad=(pad, 0),
-            nobias=nobias,
-            initialW=initialW,
-            initial_bias=initial_bias,
-            **kwargs,
-        )
-
-    def __call__(self, x):
-        assert x.shape[-1] == 1
-        return super().__call__(x)
-
-
-class ConvHighway(chainer.link.Chain):
-    def __init__(self, in_out_size, nobias=False, activate=chainer.functions.relu,
-                 init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1):
-        super().__init__()
-        self.activate = activate
-
-        with self.init_scope():
-            self.plain = Convolution1D(
-                in_out_size, in_out_size, 1, nobias=nobias,
-                initialW=init_Wh, initial_bias=init_bh)
-            self.transform = Convolution1D(
-                in_out_size, in_out_size, 1, nobias=nobias,
-                initialW=init_Wt, initial_bias=init_bt)
-
-    def __call__(self, x):
-        out_plain = self.activate(self.plain(x))
-        out_transform = chainer.functions.sigmoid(self.transform(x))
-        y = out_plain * out_transform + x * (1 - out_transform)
-        return y
-
-
-class PreNet(chainer.link.Chain):
-    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
-        super().__init__()
-        with self.init_scope():
-            self.conv1 = Convolution1D(in_channels, hidden_channels, 1)
-            self.conv2 = Convolution1D(hidden_channels, out_channels, 1)
-
-    def __call__(self, x):
-        h = x
-        h = chainer.functions.dropout((chainer.functions.relu(self.conv1(h)), 0.5))
-        h = chainer.functions.dropout((chainer.functions.relu(self.conv2(h)), 0.5))
-        return h
-
-
-class Conv1DBank(chainer.link.Chain):
-    def __init__(self, in_channels: int, out_channels: int, k: int):
-        super().__init__()
-        self.stacked_channels = out_channels * k
-        self.pads = [
-            partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant')
-            for i in range(k)
-        ]
-
-        with self.init_scope():
-            self.convs = chainer.link.ChainList(
-                *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k))
-            )
-            self.bn = chainer.links.BatchNormalization(out_channels * k)
-
-    def __call__(self, x):
-        h = x
-        h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)])
-        h = chainer.functions.relu(self.bn(h))
-        return h
-
-
-class Conv1DProjections(chainer.link.Chain):
-    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
-        super().__init__()
-
-        with self.init_scope():
-            self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True)
-            self.bn1 = chainer.links.BatchNormalization(hidden_channels)
-            self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True)
-            self.bn2 = chainer.links.BatchNormalization(out_channels)
-
-    def __call__(self, x):
-        h = x
-        h = chainer.functions.relu(self.bn1(self.conv1(h)))
-        h = chainer.functions.relu(self.bn2(self.conv2(h)))
-        return h
-
-
-class CBHG(chainer.link.Chain):
-    def __init__(
-            self,
-            in_channels: int,
-            conv_bank_out_channels: int,
-            conv_bank_k: int,
-            max_pooling_k: int,
-            conv_projections_hidden_channels: int,
-            highway_layers: int,
-            out_channels: int,
-            disable_last_rnn: bool,
-    ):
-        super().__init__()
-        self.max_pooling_padding = partial(
-            chainer.functions.pad,
-            pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)),
-            mode='constant',
-        )
-        self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False)
-        self.out_size = out_channels * (1 if disable_last_rnn else 2)
-
-        with self.init_scope():
-            self.conv_bank = Conv1DBank(
-                in_channels=in_channels,
-                out_channels=conv_bank_out_channels,
-                k=conv_bank_k,
-            )
-            self.conv_projectoins = Conv1DProjections(
-                in_channels=self.conv_bank.stacked_channels,
-                hidden_channels=conv_projections_hidden_channels,
-                out_channels=out_channels,
-            )
-            self.highways = chainer.link.ChainList(
-                *([ConvHighway(out_channels) for _ in range(highway_layers)])
-            )
-            if not disable_last_rnn:
-                self.gru = chainer.links.NStepBiGRU(
-                    n_layers=1,
-                    in_size=out_channels,
-                    out_size=out_channels,
-                    dropout=0.0,
-                )
-
-    def __call__(self, x):
-        h = x
-        h = self.conv_bank(h)
-        h = self.max_pooling(self.max_pooling_padding(h))
-        h = self.conv_projectoins(h)
-        h = h + x
-        for highway in self.highways:
-            h = highway(h)
-
-        if hasattr(self, 'gru'):
-            h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))
-            _, h = self.gru(None, h)
-            h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))
-        return h
-
-
-class Predictor(chainer.link.Chain):
-    def __init__(self, network, out_size: int):
-        super().__init__()
-        with self.init_scope():
-            self.network = network
-            self.last = Convolution1D(network.out_size, out_size, 1)
-
-    def __call__(self, x):
-        h = x
-        h = self.network(h)
-        h = self.last(h)
-        return h
-
-
-class Aligner(chainer.link.Chain):
-    def __init__(self, in_size: int, out_time_length: int):
-        super().__init__()
-        with self.init_scope():
-            self.gru = chainer.links.NStepBiGRU(
-                n_layers=1,
-                in_size=in_size,
-                out_size=in_size // 2,
-                dropout=0.0,
-            )
-            self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1)
-
-    def __call__(self, x):
-        """
-        :param x: (batch, channel, timeA)
-        """
-        h = x
-        h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))  # h: batch * (timeA, channel)
-        _, h = self.gru(None, h)  # h: batch * (timeA, ?)
-        h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))  # h: (batch, ?, timeA)
-        h = chainer.functions.softmax(self.last(h), axis=1)  # h: (batch, timeB, timeA)
-
-        h = chainer.functions.matmul(x, h)  # h: (batch, channel, time)
-        return h
-
-
-class Discriminator(chainer.link.Chain):
-    def __init__(self, in_channels: int, hidden_channels_list: List[int]):
-        super().__init__()
-        with self.init_scope():
-            self.convs = chainer.link.ChainList(*(
-                LegacyConvolution1D(i_c, o_c, ksize=2, stride=2)
-                for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list)
-            ))
-            self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1)
-
-    def __call__(self, x):
-        """
-        :param x: (batch, channel, time)
-        """
-        h = x
-        h = chainer.functions.reshape(h, h.shape + (1,))
-        for conv in self.convs.children():
-            h = chainer.functions.relu(conv(h))
-        h = self.last_conv(h)
-        h = chainer.functions.reshape(h, h.shape[:-1])
-        return h
-
-
-def create_predictor(config: ModelConfig):
-    network = CBHG(
-        in_channels=config.in_channels,
-        conv_bank_out_channels=config.conv_bank_out_channels,
-        conv_bank_k=config.conv_bank_k,
-        max_pooling_k=config.max_pooling_k,
-        conv_projections_hidden_channels=config.conv_projections_hidden_channels,
-        highway_layers=config.highway_layers,
-        out_channels=config.out_channels,
-        disable_last_rnn=config.disable_last_rnn,
-    )
-    predictor = Predictor(
-        network=network,
-        out_size=config.out_size,
-    )
-    return predictor
-
-
-def create_aligner(config: ModelConfig):
-    assert config.enable_aligner
-    aligner = Aligner(
-        in_size=config.in_channels,
-        out_time_length=config.aligner_out_time_length,
-    )
-    return aligner
-
-
-def create_discriminator(config: DiscriminatorModelConfig):
-    discriminator = Discriminator(
-        in_channels=config.in_channels,
-        hidden_channels_list=config.hidden_channels_list,
-    )
-    return discriminator
-
-
-def create(config: ModelConfig):
-    predictor = create_predictor(config)
-    if config.enable_aligner:
-        aligner = create_aligner(config)
-    else:
-        aligner = None
-    if config.discriminator is not None:
-        discriminator = create_discriminator(config.discriminator)
-    else:
-        discriminator = None
-    return predictor, aligner, discriminator
diff --git a/become_yukarin/model/__init__.py b/become_yukarin/model/__init__.py
new file mode 100644
index 0000000..6e6c130
--- /dev/null
+++ b/become_yukarin/model/__init__.py
@@ -0,0 +1,2 @@
+from . import model
+from . import sr_model
diff --git a/become_yukarin/model/model.py b/become_yukarin/model/model.py
new file mode 100644
index 0000000..fc2d722
--- /dev/null
+++ b/become_yukarin/model/model.py
@@ -0,0 +1,292 @@
+from functools import partial
+from typing import List
+
+import chainer
+
+from become_yukarin.config.config import DiscriminatorModelConfig
+from become_yukarin.config.config import ModelConfig
+
+
+class Convolution1D(chainer.links.ConvolutionND):
+    def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0,
+                 nobias=False, initialW=None, initial_bias=None,
+                 cover_all=False):
+        super().__init__(
+            ndim=1,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            ksize=ksize,
+            stride=stride,
+            pad=pad,
+            nobias=nobias,
+            initialW=initialW,
+            initial_bias=initial_bias,
+            cover_all=cover_all,
+        )
+
+
+class LegacyConvolution1D(chainer.links.Convolution2D):
+    def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0,
+                 nobias=False, initialW=None, initial_bias=None, **kwargs):
+        assert ksize is None or isinstance(ksize, int)
+        assert isinstance(stride, int)
+        assert isinstance(pad, int)
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            ksize=(ksize, 1),
+            stride=(stride, 1),
+            pad=(pad, 0),
+            nobias=nobias,
+            initialW=initialW,
+            initial_bias=initial_bias,
+            **kwargs,
+        )
+
+    def __call__(self, x):
+        assert x.shape[-1] == 1
+        return super().__call__(x)
+
+
+class ConvHighway(chainer.link.Chain):
+    def __init__(self, in_out_size, nobias=False, activate=chainer.functions.relu,
+                 init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1):
+        super().__init__()
+        self.activate = activate
+
+        with self.init_scope():
+            self.plain = Convolution1D(
+                in_out_size, in_out_size, 1, nobias=nobias,
+                initialW=init_Wh, initial_bias=init_bh)
+            self.transform = Convolution1D(
+                in_out_size, in_out_size, 1, nobias=nobias,
+                initialW=init_Wt, initial_bias=init_bt)
+
+    def __call__(self, x):
+        out_plain = self.activate(self.plain(x))
+        out_transform = chainer.functions.sigmoid(self.transform(x))
+        y = out_plain * out_transform + x * (1 - out_transform)
+        return y
+
+
+class PreNet(chainer.link.Chain):
+    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
+        super().__init__()
+        with self.init_scope():
+            self.conv1 = Convolution1D(in_channels, hidden_channels, 1)
+            self.conv2 = Convolution1D(hidden_channels, out_channels, 1)
+
+    def __call__(self, x):
+        h = x
+        h = chainer.functions.dropout(chainer.functions.relu(self.conv1(h)), 0.5)
+        h = chainer.functions.dropout(chainer.functions.relu(self.conv2(h)), 0.5)
+        return h
+
+
+class Conv1DBank(chainer.link.Chain):
+    def __init__(self, in_channels: int, out_channels: int, k: int):
+        super().__init__()
+        self.stacked_channels = out_channels * k
+        self.pads = [
+            partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant')
+            for i in range(k)
+        ]
+
+        with self.init_scope():
+            self.convs = chainer.link.ChainList(
+                *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k))
+            )
+            self.bn = chainer.links.BatchNormalization(out_channels * k)
+
+    def __call__(self, x):
+        h = x
+        h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)])
+        h = chainer.functions.relu(self.bn(h))
+        return h
+
+
+class Conv1DProjections(chainer.link.Chain):
+    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
+        super().__init__()
+
+        with self.init_scope():
+            self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True)
+            self.bn1 = chainer.links.BatchNormalization(hidden_channels)
+            self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True)
+            self.bn2 = chainer.links.BatchNormalization(out_channels)
+
+    def __call__(self, x):
+        h = x
+        h = chainer.functions.relu(self.bn1(self.conv1(h)))
+        h = chainer.functions.relu(self.bn2(self.conv2(h)))
+        return h
+
+
+class CBHG(chainer.link.Chain):
+    def __init__(
+            self,
+            in_channels: int,
+            conv_bank_out_channels: int,
+            conv_bank_k: int,
+            max_pooling_k: int,
+            conv_projections_hidden_channels: int,
+            highway_layers: int,
+            out_channels: int,
+            disable_last_rnn: bool,
+    ):
+        super().__init__()
+        self.max_pooling_padding = partial(
+            chainer.functions.pad,
+            pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)),
+            mode='constant',
+        )
+        self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False)
+        self.out_size = out_channels * (1 if disable_last_rnn else 2)
+
+        with self.init_scope():
+            self.conv_bank = Conv1DBank(
+                in_channels=in_channels,
+                out_channels=conv_bank_out_channels,
+                k=conv_bank_k,
+            )
+            self.conv_projectoins = Conv1DProjections(
+                in_channels=self.conv_bank.stacked_channels,
+                hidden_channels=conv_projections_hidden_channels,
+                out_channels=out_channels,
+            )
+            self.highways = chainer.link.ChainList(
+                *([ConvHighway(out_channels) for _ in range(highway_layers)])
+            )
+            if not disable_last_rnn:
+                self.gru = chainer.links.NStepBiGRU(
+                    n_layers=1,
+                    in_size=out_channels,
+                    out_size=out_channels,
+                    dropout=0.0,
+                )
+
+    def __call__(self, x):
+        h = x
+        h = self.conv_bank(h)
+        h = self.max_pooling(self.max_pooling_padding(h))
+        h = self.conv_projectoins(h)
+        h = h + x
+        for highway in self.highways:
+            h = highway(h)
+
+        if hasattr(self, 'gru'):
+            h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))
+            _, h = self.gru(None, h)
+            h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))
+        return h
+
+
+class Predictor(chainer.link.Chain):
+    def __init__(self, network, out_size: int):
+        super().__init__()
+        with self.init_scope():
+            self.network = network
+            self.last = Convolution1D(network.out_size, out_size, 1)
+
+    def __call__(self, x):
+        h = x
+        h = self.network(h)
+        h = self.last(h)
+        return h
+
+
+class Aligner(chainer.link.Chain):
+    def __init__(self, in_size: int, out_time_length: int):
+        super().__init__()
+        with self.init_scope():
+            self.gru = chainer.links.NStepBiGRU(
+                n_layers=1,
+                in_size=in_size,
+                out_size=in_size // 2,
+                dropout=0.0,
+            )
+            self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1)
+
+    def __call__(self, x):
+        """
+        :param x: (batch, channel, timeA)
+        """
+        h = x
+        h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))  # h: batch * (timeA, channel)
+        _, h = self.gru(None, h)  # h: batch * (timeA, ?)
+        h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))  # h: (batch, ?, timeA)
+        h = chainer.functions.softmax(self.last(h), axis=1)  # h: (batch, timeB, timeA)
+
+        h = chainer.functions.matmul(x, h)  # h: (batch, channel, time)
+        return h
+
+
+class Discriminator(chainer.link.Chain):
+    def __init__(self, in_channels: int, hidden_channels_list: List[int]):
+        super().__init__()
+        with self.init_scope():
+            self.convs = chainer.link.ChainList(*(
+                LegacyConvolution1D(i_c, o_c, ksize=2, stride=2)
+                for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list)
+            ))
+            self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1)
+
+    def __call__(self, x):
+        """
+        :param x: (batch, channel, time)
+        """
+        h = x
+        h = chainer.functions.reshape(h, h.shape + (1,))
+        for conv in self.convs.children():
+            h = chainer.functions.relu(conv(h))
+        h = self.last_conv(h)
+        h = chainer.functions.reshape(h, h.shape[:-1])
+        return h
+
+
+def create_predictor(config: ModelConfig):
+    network = CBHG(
+        in_channels=config.in_channels,
+        conv_bank_out_channels=config.conv_bank_out_channels,
+        conv_bank_k=config.conv_bank_k,
+        max_pooling_k=config.max_pooling_k,
+        conv_projections_hidden_channels=config.conv_projections_hidden_channels,
+        highway_layers=config.highway_layers,
+        out_channels=config.out_channels,
+        disable_last_rnn=config.disable_last_rnn,
+    )
+    predictor = Predictor(
+        network=network,
+        out_size=config.out_size,
+    )
+    return predictor
+
+
+def create_aligner(config: ModelConfig):
+    assert config.enable_aligner
+    aligner = Aligner(
+        in_size=config.in_channels,
+        out_time_length=config.aligner_out_time_length,
+    )
+    return aligner
+
+
+def create_discriminator(config: DiscriminatorModelConfig):
+    discriminator = Discriminator(
+        in_channels=config.in_channels,
+        hidden_channels_list=config.hidden_channels_list,
+    )
+    return discriminator
+
+
+def create(config: ModelConfig):
+    predictor = create_predictor(config)
+    if config.enable_aligner:
+        aligner = create_aligner(config)
+    else:
+        aligner = None
+    if config.discriminator is not None:
+        discriminator = create_discriminator(config.discriminator)
+    else:
+        discriminator = None
+    return predictor, aligner, discriminator
diff --git a/become_yukarin/model/sr_model.py b/become_yukarin/model/sr_model.py
new file mode 100644
index 0000000..74119a4
--- /dev/null
+++ b/become_yukarin/model/sr_model.py
@@ -0,0 +1,119 @@
+import chainer
+import chainer.functions as F
+import chainer.links as L
+
+from become_yukarin.config.sr_config import SRModelConfig
+
+
+class CBR(chainer.Chain):
+    def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False):
+        super().__init__()
+        self.bn = bn
+        self.activation = activation
+        self.dropout = dropout
+
+        w = chainer.initializers.Normal(0.02)
+        with self.init_scope():
+            if sample == 'down':
+                self.c = L.Convolution2D(ch0, ch1, 4, 2, 1, initialW=w)
+            else:
+                self.c = L.Deconvolution2D(ch0, ch1, 4, 2, 1, initialW=w)
+            if bn:
+                self.batchnorm = L.BatchNormalization(ch1)
+
+    def __call__(self, x):
+        h = self.c(x)
+        if self.bn:
+            h = self.batchnorm(h)
+        if self.dropout:
+            h = F.dropout(h)
+        if self.activation is not None:
+            h = self.activation(h)
+        return h
+
+
+class Encoder(chainer.Chain):
+    def __init__(self, in_ch):
+        super().__init__()
+        w = chainer.initializers.Normal(0.02)
+        with self.init_scope():
+            self.c0 = L.Convolution2D(in_ch, 64, 3, 1, 1, initialW=w)
+            self.c1 = CBR(64, 128, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c2 = CBR(128, 256, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c3 = CBR(256, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c4 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c5 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c6 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c7 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+
+    def __call__(self, x):
+        x = F.reshape(x, (len(x), 1) + x.shape[1:])
+        hs = [F.leaky_relu(self.c0(x))]
+        for i in range(1, 8):
+            hs.append(self['c%d' % i](hs[i - 1]))
+        return hs
+
+
+class Decoder(chainer.Chain):
+    def __init__(self, out_ch):
+        super().__init__()
+        w = chainer.initializers.Normal(0.02)
+        with self.init_scope():
+            self.c0 = CBR(512, 512, bn=True, sample='up', activation=F.relu, dropout=True)
+            self.c1 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=True)
+            self.c2 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=True)
+            self.c3 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=False)
+            self.c4 = CBR(1024, 256, bn=True, sample='up', activation=F.relu, dropout=False)
+            self.c5 = CBR(512, 128, bn=True, sample='up', activation=F.relu, dropout=False)
+            self.c6 = CBR(256, 64, bn=True, sample='up', activation=F.relu, dropout=False)
+            self.c7 = L.Convolution2D(128, out_ch, 3, 1, 1, initialW=w)
+
+    def __call__(self, hs):
+        h = self.c0(hs[-1])
+        for i in range(1, 8):
+            h = F.concat([h, hs[-i - 1]])
+            if i < 7:
+                h = self['c%d' % i](h)
+            else:
+                h = self.c7(h)
+        return h
+
+
+class SRPredictor(chainer.Chain):
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        with self.init_scope():
+            self.encoder = Encoder(in_ch)
+            self.decoder = Decoder(out_ch)
+
+    def __call__(self, x):
+        return self.decoder(self.encoder(x))
+
+
+class SRDiscriminator(chainer.Chain):
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        w = chainer.initializers.Normal(0.02)
+        with self.init_scope():
+            self.c0_0 = CBR(in_ch, 32, bn=False, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c0_1 = CBR(out_ch, 32, bn=False, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c1 = CBR(64, 128, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c2 = CBR(128, 256, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c3 = CBR(256, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False)
+            self.c4 = L.Convolution2D(512, 1, 3, 1, 1, initialW=w)
+
+    def __call__(self, x_0, x_1):
+        x_0 = F.reshape(x_0, (len(x_0), 1) + x_0.shape[1:])
+        h = F.concat([self.c0_0(x_0), self.c0_1(x_1)])
+        h = self.c1(h)
+        h = self.c2(h)
+        h = self.c3(h)
+        h = self.c4(h)
+        # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0)
+        return h
+
+
+def create_sr(config: SRModelConfig):
+    predictor = SRPredictor(in_ch=1, out_ch=3)
+    discriminator = SRDiscriminator(in_ch=1, out_ch=3)
+    return predictor, discriminator
diff --git a/become_yukarin/param.py b/become_yukarin/param.py
index e6f46bc..5a43d74 100644
--- a/become_yukarin/param.py
+++ b/become_yukarin/param.py
@@ -9,7 +9,7 @@ class VoiceParam(NamedTuple):
 
 class AcousticFeatureParam(NamedTuple):
     frame_period: int = 5
-    order: int = 25
+    order: int = 8
     alpha: float = 0.466
 
 
diff --git a/become_yukarin/updater.py b/become_yukarin/updater.py
deleted file mode 100644
index f6444d0..0000000
--- a/become_yukarin/updater.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import chainer
-import numpy
-from chainer import reporter
-
-from .config import LossConfig
-from .config import ModelConfig
-from .model import Aligner
-from .model import Discriminator
-from .model import Predictor
-
-
-class Updater(chainer.training.StandardUpdater):
-    def __init__(
-            self,
-            loss_config: LossConfig,
-            model_config: ModelConfig,
-            predictor: Predictor,
-            aligner: Aligner = None,
-            discriminator: Discriminator = None,
-            *args,
-            **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        self.loss_config = loss_config
-        self.model_config = model_config
-        self.predictor = predictor
-        self.aligner = aligner
-        self.discriminator = discriminator
-
-    def forward(self, input, target, mask):
-        xp = self.predictor.xp
-
-        input = chainer.as_variable(input)
-        target = chainer.as_variable(target)
-        mask = chainer.as_variable(mask)
-
-        if self.aligner is not None:
-            input = self.aligner(input)
-        y = self.predictor(input)
-
-        loss_l1 = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask)
-        loss_l1 = loss_l1 / chainer.functions.sum(mask)
-        reporter.report({'l1': loss_l1}, self.predictor)
-
-        if self.discriminator is not None:
-            pair_fake = chainer.functions.concat([y * mask, input])
-            pair_true = chainer.functions.concat([target * mask, input])
-
-            # DRAGAN
-            if chainer.config.train:  # grad is not available on test
-                std = xp.std(pair_true.data, axis=0, keepdims=True)
-                rand = xp.random.uniform(0, 1, pair_true.shape).astype(xp.float32)
-                perturb = chainer.Variable(pair_true.data + 0.5 * rand * std)
-                grad, = chainer.grad([self.discriminator(perturb)], [perturb], enable_double_backprop=True)
-                grad = chainer.functions.sqrt(chainer.functions.batch_l2_norm_squared(grad))
-                loss_grad = chainer.functions.mean_squared_error(grad, xp.ones_like(grad.data, numpy.float32))
-                reporter.report({'grad': loss_grad}, self.discriminator)
-
-                if xp.any(xp.isnan(loss_grad.data)):
-                    import code
-                    code.interact(local=locals())
-
-            # GAN
-            d_fake = self.discriminator(pair_fake)
-            d_true = self.discriminator(pair_true)
-            loss_dis_f = chainer.functions.average(chainer.functions.softplus(d_fake))
-            loss_dis_t = chainer.functions.average(chainer.functions.softplus(-d_true))
-            loss_gen_f = chainer.functions.average(chainer.functions.softplus(-d_fake))
-            reporter.report({'fake': loss_dis_f}, self.discriminator)
-            reporter.report({'true': loss_dis_t}, self.discriminator)
-
-            tp = (d_true.data > 0.5).sum()
-            fp = (d_fake.data > 0.5).sum()
-            fn = (d_true.data <= 0.5).sum()
-            tn = (d_fake.data <= 0.5).sum()
-            accuracy = (tp + tn) / (tp + fp + fn + tn)
-            precision = tp / (tp + fp)
-            recall = tp / (tp + fn)
-            reporter.report({'accuracy': accuracy}, self.discriminator)
-            reporter.report({'precision': precision}, self.discriminator)
-            reporter.report({'recall': recall}, self.discriminator)
-
-        loss = {'predictor': loss_l1 * self.loss_config.l1}
-
-        if self.aligner is not None:
-            loss['aligner'] = loss_l1 * self.loss_config.l1
-            reporter.report({'loss': loss['aligner']}, self.aligner)
-
-        if self.discriminator is not None:
-            loss['discriminator'] = \
-                loss_dis_f * self.loss_config.discriminator_fake + \
-                loss_dis_t * self.loss_config.discriminator_true
-            if chainer.config.train:  # grad is not available on test
-                loss['discriminator'] += loss_grad * self.loss_config.discriminator_grad
-            reporter.report({'loss': loss['discriminator']}, self.discriminator)
-            loss['predictor'] += loss_gen_f * self.loss_config.predictor_fake
-
-        reporter.report({'loss': loss['predictor']}, self.predictor)
-        return loss
-
-    def update_core(self):
-        batch = self.get_iterator('main').next()
-        loss = self.forward(**self.converter(batch, self.device))
-
-        for k, opt in self.get_all_optimizers().items():
-            opt.update(loss.get, k)
diff --git a/become_yukarin/updater/__init__.py b/become_yukarin/updater/__init__.py
new file mode 100644
index 0000000..d85003a
--- /dev/null
+++ b/become_yukarin/updater/__init__.py
@@ -0,0 +1,2 @@
+from . import sr_updater
+from . import updater
diff --git a/become_yukarin/updater/sr_updater.py b/become_yukarin/updater/sr_updater.py
new file mode 100644
index 0000000..a6b1d22
--- /dev/null
+++ b/become_yukarin/updater/sr_updater.py
@@ -0,0 +1,69 @@
+import chainer
+import chainer.functions as F
+from become_yukarin.config.sr_config import SRLossConfig
+
+from become_yukarin.model.sr_model import SRDiscriminator
+from become_yukarin.model.sr_model import SRPredictor
+
+
+class SRUpdater(chainer.training.StandardUpdater):
+    def __init__(
+            self,
+            loss_config: SRLossConfig,
+            predictor: SRPredictor,
+            discriminator: SRDiscriminator,
+            *args,
+            **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.loss_config = loss_config
+        self.predictor = predictor
+        self.discriminator = discriminator
+
+    def _loss_predictor(self, predictor, output, target, d_fake):
+        b, _, w, h = d_fake.data.shape
+
+        loss_mse = (F.mean_absolute_error(output, target))
+        chainer.report({'mse': loss_mse}, predictor)
+
+        loss_adv = F.sum(F.softplus(-d_fake)) / (b * w * h)
+        chainer.report({'adversarial': loss_adv}, predictor)
+
+        loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv
+        chainer.report({'loss': loss}, predictor)
+        return loss
+
+    def _loss_discriminator(self, discriminator, y_in, y_out):
+        b, _, w, h = y_in.data.shape
+
+        loss_real = F.sum(F.softplus(-y_in)) / (b * w * h)
+        chainer.report({'real': loss_real}, discriminator)
+
+        loss_fake = F.sum(F.softplus(y_out)) / (b * w * h)
+        chainer.report({'fake': loss_fake}, discriminator)
+
+        loss = loss_real + loss_fake
+        chainer.report({'loss': loss}, discriminator)
+        return loss
+
+    def forward(self, input, target):
+        output = self.predictor(input)
+        d_fake = self.discriminator(input, output)
+        d_real = self.discriminator(input, target)
+
+        loss = {
+            'predictor': self._loss_predictor(self.predictor, output, target, d_fake),
+            'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake),
+        }
+        return loss
+
+    def update_core(self):
+        opt_predictor = self.get_optimizer('predictor')
+        opt_discriminator = self.get_optimizer('discriminator')
+
+        batch = self.get_iterator('main').next()
+        batch = self.converter(batch, self.device)
+        loss = self.forward(**batch)
+
+        opt_predictor.update(loss.get, 'predictor')
+        opt_discriminator.update(loss.get, 'discriminator')
diff --git a/become_yukarin/updater/updater.py b/become_yukarin/updater/updater.py
new file mode 100644
index 0000000..ef77e77
--- /dev/null
+++ b/become_yukarin/updater/updater.py
@@ -0,0 +1,103 @@
+import chainer
+import numpy
+from chainer import reporter
+
+from become_yukarin.config.config import LossConfig
+from become_yukarin.model.model import Aligner
+from become_yukarin.model.model import Discriminator
+from become_yukarin.model.model import Predictor
+
+
+class Updater(chainer.training.StandardUpdater):
+    def __init__(
+            self,
+            loss_config: LossConfig,
+            predictor: Predictor,
+            aligner: Aligner = None,
+            discriminator: Discriminator = None,
+            *args,
+            **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.loss_config = loss_config
+        self.predictor = predictor
+        self.aligner = aligner
+        self.discriminator = discriminator
+
+    def forward(self, input, target, mask):
+        xp = self.predictor.xp
+
+        input = chainer.as_variable(input)
+        target = chainer.as_variable(target)
+        mask = chainer.as_variable(mask)
+
+        if self.aligner is not None:
+            input = self.aligner(input)
+        y = self.predictor(input)
+
+        loss_l1 = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask)
+        loss_l1 = loss_l1 / chainer.functions.sum(mask)
+        reporter.report({'l1': loss_l1}, self.predictor)
+
+        if self.discriminator is not None:
+            pair_fake = chainer.functions.concat([y * mask, input])
+            pair_true = chainer.functions.concat([target * mask, input])
+
+            # DRAGAN
+            if chainer.config.train:  # grad is not available on test
+                std = xp.std(pair_true.data, axis=0, keepdims=True)
+                rand = xp.random.uniform(0, 1, pair_true.shape).astype(xp.float32)
+                perturb = chainer.Variable(pair_true.data + 0.5 * rand * std)
+                grad, = chainer.grad([self.discriminator(perturb)], [perturb], enable_double_backprop=True)
+                grad = chainer.functions.sqrt(chainer.functions.batch_l2_norm_squared(grad))
+                loss_grad = chainer.functions.mean_squared_error(grad, xp.ones_like(grad.data, numpy.float32))
+                reporter.report({'grad': loss_grad}, self.discriminator)
+
+                if xp.any(xp.isnan(loss_grad.data)):
+                    import code
+                    code.interact(local=locals())
+
+            # GAN
+            d_fake = self.discriminator(pair_fake)
+            d_true = self.discriminator(pair_true)
+            loss_dis_f = chainer.functions.average(chainer.functions.softplus(d_fake))
+            loss_dis_t = chainer.functions.average(chainer.functions.softplus(-d_true))
+            loss_gen_f = chainer.functions.average(chainer.functions.softplus(-d_fake))
+            reporter.report({'fake': loss_dis_f}, self.discriminator)
+            reporter.report({'true': loss_dis_t}, self.discriminator)
+
+            tp = (d_true.data > 0.5).sum()
+            fp = (d_fake.data > 0.5).sum()
+            fn = (d_true.data <= 0.5).sum()
+            tn = (d_fake.data <= 0.5).sum()
+            accuracy = (tp + tn) / (tp + fp + fn + tn)
+            precision = tp / (tp + fp)
+            recall = tp / (tp + fn)
+            reporter.report({'accuracy': accuracy}, self.discriminator)
+            reporter.report({'precision': precision}, self.discriminator)
+            reporter.report({'recall': recall}, self.discriminator)
+
+        loss = {'predictor': loss_l1 * self.loss_config.l1}
+
+        if self.aligner is not None:
+            loss['aligner'] = loss_l1 * self.loss_config.l1
+            reporter.report({'loss': loss['aligner']}, self.aligner)
+
+        if self.discriminator is not None:
+            loss['discriminator'] = \
+                loss_dis_f * self.loss_config.discriminator_fake + \
+                loss_dis_t * self.loss_config.discriminator_true
+            if chainer.config.train:  # grad is not available on test
+                loss['discriminator'] += loss_grad * self.loss_config.discriminator_grad
+            reporter.report({'loss': loss['discriminator']}, self.discriminator)
+            loss['predictor'] += loss_gen_f * self.loss_config.predictor_fake
+
+        reporter.report({'loss': loss['predictor']}, self.predictor)
+        return loss
+
+    def update_core(self):
+        batch = self.get_iterator('main').next()
+        loss = self.forward(**self.converter(batch, self.device))
+
+        for k, opt in self.get_all_optimizers().items():
+            opt.update(loss.get, k)
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 140a2a0..a8a207a 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -7,7 +7,7 @@ import numpy
 import pysptk
 import pyworld
 
-from become_yukarin.config import Config
+from become_yukarin.config.config import Config
 from become_yukarin.data_struct import AcousticFeature
 from become_yukarin.data_struct import Wave
 from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
@@ -17,7 +17,7 @@ from become_yukarin.dataset.dataset import AcousticFeatureProcess
 from become_yukarin.dataset.dataset import DecodeFeatureProcess
 from become_yukarin.dataset.dataset import EncodeFeatureProcess
 from become_yukarin.dataset.dataset import WaveFileLoadProcess
-from become_yukarin.model import create_predictor
+from become_yukarin.model.model import create_predictor
 
 
 class VoiceChanger(object):
diff --git a/scripts/extract_acoustic_feature.py b/scripts/extract_acoustic_feature.py
index 297c10b..7943639 100644
--- a/scripts/extract_acoustic_feature.py
+++ b/scripts/extract_acoustic_feature.py
@@ -40,19 +40,6 @@ arguments = parser.parse_args()
 pprint(dir(arguments))
 
 
-def make_feature(
-        path,
-        sample_rate,
-        top_db,
-        frame_period,
-        order,
-        alpha,
-):
-    wave = WaveFileLoadProcess(sample_rate=sample_rate, top_db=top_db)(path, test=True)
-    feature = AcousticFeatureProcess(frame_period=frame_period, order=order, alpha=alpha)(wave, test=True)
-    return feature
-
-
 def generate_feature(path1, path2):
     out1 = Path(arguments.output1_directory, path1.stem + '.npy')
     out2 = Path(arguments.output2_directory, path2.stem + '.npy')
diff --git a/train.py b/train.py
index 346ec4b..3e8cced 100644
--- a/train.py
+++ b/train.py
@@ -10,10 +10,10 @@ from chainer.iterators import MultiprocessIterator
 from chainer.training import extensions
 from chainerui.utils import save_args
 
-from become_yukarin.config import create_from_json
+from become_yukarin.config.config import create_from_json
 from become_yukarin.dataset import create as create_dataset
-from become_yukarin.model import create
-from become_yukarin.updater import Updater
+from become_yukarin.model.model import create
+from become_yukarin.updater.updater import Updater
 
 parser = argparse.ArgumentParser()
 parser.add_argument('config_json_path', type=Path)
@@ -54,7 +54,6 @@ opts = {key: create_optimizer(model) for key, model in models.items()}
 converter = partial(convert.concat_examples, padding=0)
 updater = Updater(
     loss_config=config.loss,
-    model_config=config.model,
     predictor=predictor,
     aligner=aligner,
     discriminator=discriminator,
diff --git a/train_sr.py b/train_sr.py
new file mode 100644
index 0000000..c714aa0
--- /dev/null
+++ b/train_sr.py
@@ -0,0 +1,98 @@
+import argparse
+from functools import partial
+from pathlib import Path
+
+from chainer import cuda
+from chainer import optimizers
+from chainer import training
+from chainer.dataset import convert
+from chainer.iterators import MultiprocessIterator
+from chainer.training import extensions
+from chainerui.utils import save_args
+
+from become_yukarin.config.sr_config import create_from_json
+from become_yukarin.dataset import create_sr as create_sr_dataset
+from become_yukarin.model.sr_model import create_sr as create_sr_model
+from become_yukarin.updater.sr_updater import SRUpdater
+
+parser = argparse.ArgumentParser()
+parser.add_argument('config_json_path', type=Path)
+parser.add_argument('output', type=Path)
+arguments = parser.parse_args()
+
+config = create_from_json(arguments.config_json_path)
+arguments.output.mkdir(exist_ok=True)
+config.save_as_json((arguments.output / 'config.json').absolute())
+
+# model
+if config.train.gpu >= 0:
+    cuda.get_device_from_id(config.train.gpu).use()
+predictor, discriminator = create_sr_model(config.model)
+models = {
+    'predictor': predictor,
+    'discriminator': discriminator,
+}
+
+# dataset
+dataset = create_sr_dataset(config.dataset)
+train_iter = MultiprocessIterator(dataset['train'], config.train.batchsize)
+test_iter = MultiprocessIterator(dataset['test'], config.train.batchsize, repeat=False, shuffle=False)
+train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batchsize, repeat=False, shuffle=False)
+
+
+# optimizer
+def create_optimizer(model):
+    optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999)
+    optimizer.setup(model)
+    return optimizer
+
+
+opts = {key: create_optimizer(model) for key, model in models.items()}
+
+# updater
+converter = partial(convert.concat_examples, padding=0)
+updater = SRUpdater(
+    loss_config=config.loss,
+    predictor=predictor,
+    discriminator=discriminator,
+    device=config.train.gpu,
+    iterator=train_iter,
+    optimizer=opts,
+    converter=converter,
+)
+
+# trainer
+trigger_log = (config.train.log_iteration, 'iteration')
+trigger_snapshot = (config.train.snapshot_iteration, 'iteration')
+
+trainer = training.Trainer(updater, out=arguments.output)
+
+ext = extensions.Evaluator(test_iter, models, converter, device=config.train.gpu, eval_func=updater.forward)
+trainer.extend(ext, name='test', trigger=trigger_log)
+ext = extensions.Evaluator(train_eval_iter, models, converter, device=config.train.gpu, eval_func=updater.forward)
+trainer.extend(ext, name='train', trigger=trigger_log)
+
+trainer.extend(extensions.dump_graph('predictor/loss'))
+
+ext = extensions.snapshot_object(predictor, filename='predictor_{.updater.iteration}.npz')
+trainer.extend(ext, trigger=trigger_snapshot)
+
+trainer.extend(extensions.LogReport(trigger=trigger_log))
+
+if extensions.PlotReport.available():
+    trainer.extend(extensions.PlotReport(
+        y_keys=[
+            'predictor/loss',
+            'predictor/mse',
+            'predictor/adversarial',
+            'discriminator/accuracy',
+            'discriminator/fake',
+            'discriminator/real',
+        ],
+        x_key='iteration',
+        file_name='loss.png',
+        trigger=trigger_log,
+    ))
+
+save_args(arguments, arguments.output)
+trainer.run()
-- 
cgit v1.2.3-70-g09d2
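
Note on preparing the dataset this patch consumes: it adds a loader for paired low/high spectrogram features (LowHighSpectrogramFeatureLoadProcess) but no extraction script that writes those .npy files. Below is a minimal preparation sketch, inferred from the numpy.load(path).item() call in dataset.py, from WaveFileLoadProcess usage in the deleted make_feature helper, and from the param.py defaults (frame_period=5, order=8, alpha=0.466); the sample rate, top_db, and all paths are assumptions.

# Hypothetical feature-extraction helper -- not part of this patch.
# The dict-with-'low'/'high' save format mirrors what
# LowHighSpectrogramFeatureLoadProcess reads back via numpy.load(path).item().
from pathlib import Path

import numpy

from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess

wave_process = WaveFileLoadProcess(sample_rate=24000, top_db=None)  # assumed values
feature_process = LowHighSpectrogramFeatureProcess(frame_period=5, order=8, alpha=0.466)


def save_low_high_spectrogram(wave_path: Path, out_path: Path):
    wave = wave_process(wave_path, test=True)
    feature = feature_process(wave, test=True)  # validate() checks low.shape == high.shape
    numpy.save(out_path, dict(low=feature.low, high=feature.high))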
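
train_sr.py takes a JSON config that sr_config.create_from_json parses. The keys in the sketch below are exactly the ones that function reads; every value is an illustrative guess, and note that create_sr() in sr_model.py currently ignores the model section (in_ch and out_ch are hard-coded).

# Sketch of a config generator; values are placeholders, not tested settings.
import json

config = dict(
    dataset=dict(
        input_glob='dataset/low_high_spectrogram/*.npy',  # assumed layout
        train_crop_size=512,
        seed=0,
        num_test=10,
    ),
    model=dict(  # required by create_from_json even though create_sr() ignores it
        in_channels=1,
        conv_bank_out_channels=128,
        conv_bank_k=16,
        max_pooling_k=2,
        conv_projections_hidden_channels=128,
        highway_layers=4,
        out_channels=1,
        out_size=1,
        aligner_out_time_length=512,
        disable_last_rnn=False,
        enable_aligner=False,
    ),
    loss=dict(mse=1.0, adversarial=1.0),
    train=dict(batchsize=16, gpu=-1, log_iteration=100, snapshot_iteration=1000),
    project=dict(name='super-resolution', tags=[]),
)

with open('sr_config.json', 'w') as f:
    json.dump(config, f, indent=2)

Training then runs as: python train_sr.py sr_config.json output/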
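
A sketch of inference with a trained predictor. The snapshot name follows the snapshot_object pattern in train_sr.py, but all paths are placeholders; because the encoder halves both axes seven times, the time and frequency axes are cropped to multiples of 2 ** 7 = 128 here so the U-Net skip connections line up.

# Inference sketch -- assumed paths and crop sizes.
import chainer
import numpy

from become_yukarin.config.sr_config import create_from_json
from become_yukarin.model.sr_model import create_sr

config = create_from_json('output/config.json')
predictor, _ = create_sr(config.model)
chainer.serializers.load_npz('output/predictor_10000.npz', predictor)

d = numpy.load('feature.npy').item()
low = numpy.log(d['low']).astype(numpy.float32)[:512, :1024]  # both axes must divide by 128
with chainer.using_config('train', False), chainer.no_backprop_mode():
    high = predictor(low[numpy.newaxis])  # shape: (1, out_ch, time, frequency)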