diff options
| author | Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp> | 2017-11-20 03:06:39 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp> | 2017-11-20 03:06:39 +0900 |
| commit | 16b4e72fe6728e2e64d4c6357b7c73ac06868c1c (patch) | |
| tree | 657f0398b9a237ab46327d08f58a230b9581669b | |
| parent | 437a869590c989c184d33990b1d788149d073ee9 (diff) | |
aligner
| -rw-r--r-- | become_yukarin/config.py | 4 | ||||
| -rw-r--r-- | become_yukarin/dataset/dataset.py | 102 | ||||
| -rw-r--r-- | become_yukarin/loss.py | 12 | ||||
| -rw-r--r-- | become_yukarin/model.py | 66 | ||||
| -rw-r--r-- | become_yukarin/voice_changer.py | 4 | ||||
| -rw-r--r-- | train.py | 15 |
6 files changed, 156 insertions(+), 47 deletions(-)
diff --git a/become_yukarin/config.py b/become_yukarin/config.py index 05b0790..07f35fd 100644 --- a/become_yukarin/config.py +++ b/become_yukarin/config.py @@ -32,6 +32,8 @@ class ModelConfig(NamedTuple): highway_layers: int out_channels: int out_size: int + aligner_out_time_length: int + disable_last_rnn: bool class LossConfig(NamedTuple): @@ -100,6 +102,8 @@ def create_from_json(s: Union[str, Path]): highway_layers=d['model']['highway_layers'], out_channels=d['model']['out_channels'], out_size=d['model']['out_size'], + aligner_out_time_length=d['model']['aligner_out_time_length'], + disable_last_rnn=d['model']['disable_last_rnn'], ), loss=LossConfig( l1=d['loss']['l1'], diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py index 83936b1..329226c 100644 --- a/become_yukarin/dataset/dataset.py +++ b/become_yukarin/dataset/dataset.py @@ -252,7 +252,41 @@ class ShapeAlignProcess(BaseDataProcess): return data -class CropProcess(BaseDataProcess): +class RandomPaddingProcess(BaseDataProcess): + def __init__(self, min_size: int, time_axis: int = 1): + assert time_axis == 1 + self._min_size = min_size + self._time_axis = time_axis + + def __call__(self, datas: Dict[str, any], test=True): + assert not test + + data, seed = datas['data'], datas['seed'] + random = numpy.random.RandomState(seed) + + if data.shape[self._time_axis] >= self._min_size: + return data + + pre = random.randint(self._min_size - data.shape[self._time_axis] + 1) + post = self._min_size - pre + return numpy.pad(data, ((0, 0), (pre, post)), mode='constant') + + +class LastPaddingProcess(BaseDataProcess): + def __init__(self, min_size: int, time_axis: int = 1): + assert time_axis == 1 + self._min_size = min_size + self._time_axis = time_axis + + def __call__(self, data: numpy.ndarray, test=None): + if data.shape[self._time_axis] >= self._min_size: + return data + + pre = self._min_size - data.shape[self._time_axis] + return numpy.pad(data, ((0, 0), (pre, 0)), mode='constant') 
+ + +class RandomCropProcess(BaseDataProcess): def __init__(self, crop_size: int, time_axis: int = 1): self._crop_size = crop_size self._time_axis = time_axis @@ -270,6 +304,15 @@ class CropProcess(BaseDataProcess): return numpy.split(data, [start, start + self._crop_size], axis=self._time_axis)[1] +class FirstCropProcess(BaseDataProcess): + def __init__(self, crop_size: int, time_axis: int = 1): + self._crop_size = crop_size + self._time_axis = time_axis + + def __call__(self, data: numpy.ndarray, test=None): + return numpy.split(data, [0, self._crop_size], axis=self._time_axis)[1] + + class AddNoiseProcess(BaseDataProcess): def __init__(self, p_global: float = None, p_local: float = None): assert p_global is None or 0 <= p_global @@ -338,24 +381,28 @@ def create(config: DatasetConfig): ]) data_process_train = copy.deepcopy(data_process_base) - if config.train_crop_size is not None: - data_process_train.append(ChainProcess([ - LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 32), **d)), - SplitProcess(dict( - input=ChainProcess([ - LambdaProcess(lambda d, test: dict(data=d['input'], seed=d['seed'])), - CropProcess(crop_size=config.train_crop_size), - ]), - target=ChainProcess([ - LambdaProcess(lambda d, test: dict(data=d['target'], seed=d['seed'])), - CropProcess(crop_size=config.train_crop_size), - ]), - mask=ChainProcess([ - LambdaProcess(lambda d, test: dict(data=d['mask'], seed=d['seed'])), - CropProcess(crop_size=config.train_crop_size), - ]), - )), - ])) + + def add_seed(): + return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 32), **d)) + + def padding(s): + return ChainProcess([ + LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), + RandomPaddingProcess(min_size=config.train_crop_size), + ]) + + def crop(s): + return ChainProcess([ + LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])), + RandomCropProcess(crop_size=config.train_crop_size), + ]) + + data_process_train.append(ChainProcess([ + 
add_seed(), + SplitProcess(dict(input=padding('input'), target=padding('target'), mask=padding('mask'))), + add_seed(), + SplitProcess(dict(input=crop('input'), target=crop('target'), mask=crop('mask'))), + ])) # add noise data_process_train.append(SplitProcess(dict( @@ -373,6 +420,23 @@ def create(config: DatasetConfig): ))) data_process_test = data_process_base + data_process_test.append(SplitProcess(dict( + input=ChainProcess([ + LambdaProcess(lambda d, test: d['input']), + LastPaddingProcess(min_size=config.train_crop_size), + FirstCropProcess(crop_size=config.train_crop_size), + ]), + target=ChainProcess([ + LambdaProcess(lambda d, test: d['target']), + LastPaddingProcess(min_size=config.train_crop_size), + FirstCropProcess(crop_size=config.train_crop_size), + ]), + mask=ChainProcess([ + LambdaProcess(lambda d, test: d['mask']), + LastPaddingProcess(min_size=config.train_crop_size), + FirstCropProcess(crop_size=config.train_crop_size), + ]), + ))) num_test = config.num_test pairs = [ diff --git a/become_yukarin/loss.py b/become_yukarin/loss.py index 3d89908..c59747a 100644 --- a/become_yukarin/loss.py +++ b/become_yukarin/loss.py @@ -1,18 +1,19 @@ -from .config import LossConfig -from .model import Model - import chainer - from chainer import reporter +from .config import LossConfig +from .model import Aligner +from .model import Predictor + class Loss(chainer.link.Chain): - def __init__(self, config: LossConfig, predictor: Model): + def __init__(self, config: LossConfig, predictor: Predictor, aligner: Aligner): super().__init__() self.config = config with self.init_scope(): self.predictor = predictor + self.aligner = aligner def __call__(self, input, target, mask): input = chainer.as_variable(input) @@ -20,6 +21,7 @@ class Loss(chainer.link.Chain): mask = chainer.as_variable(mask) h = input + h = self.aligner(h) y = self.predictor(h) loss = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask) diff --git a/become_yukarin/model.py 
b/become_yukarin/model.py index 9d69378..6dfb2c1 100644 --- a/become_yukarin/model.py +++ b/become_yukarin/model.py @@ -105,6 +105,7 @@ class CBHG(chainer.link.Chain): conv_projections_hidden_channels: int, highway_layers: int, out_channels: int, + disable_last_rnn: bool, ): super().__init__() self.max_pooling_padding = chainer.functions.Pad( @@ -112,7 +113,7 @@ class CBHG(chainer.link.Chain): mode='constant', ) self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False) - self.out_size = out_channels * 2 + self.out_size = out_channels * (1 if disable_last_rnn else 2) with self.init_scope(): self.conv_bank = Conv1DBank( @@ -128,12 +129,13 @@ class CBHG(chainer.link.Chain): self.highways = chainer.link.ChainList( *([ConvHighway(out_channels) for _ in range(highway_layers)]) ) - self.gru = chainer.links.NStepBiGRU( - n_layers=1, - in_size=out_channels, - out_size=out_channels, - dropout=0.0, - ) + if not disable_last_rnn: + self.gru = chainer.links.NStepBiGRU( + n_layers=1, + in_size=out_channels, + out_size=out_channels, + dropout=0.0, + ) def __call__(self, x): h = x @@ -144,13 +146,14 @@ class CBHG(chainer.link.Chain): for highway in self.highways: h = highway(h) - h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) - _, h = self.gru(None, h) - h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) + if hasattr(self, 'gru'): + h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) + _, h = self.gru(None, h) + h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) return h -class Model(chainer.link.Chain): +class Predictor(chainer.link.Chain): def __init__(self, network, out_size: int): super().__init__() with self.init_scope(): @@ -164,7 +167,33 @@ class Model(chainer.link.Chain): return h -def create(config: ModelConfig): +class Aligner(chainer.link.Chain): + def __init__(self, in_size: int, out_time_length: int): + super().__init__() + with 
self.init_scope(): + self.gru = chainer.links.NStepBiGRU( + n_layers=1, + in_size=in_size, + out_size=in_size // 2, + dropout=0.0, + ) + self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1) + + def __call__(self, x): + """ + :param x: (batch, channel, timeA) + """ + h = x + h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel) + _, h = self.gru(None, h) # h: batch * (timeA, ?) + h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA) + h = chainer.functions.softmax(self.last(h), axis=2) # h: (batch, timeB, timeA) + + h = chainer.functions.matmul(x, h, transb=True) # h: (batch, channel, timeB) + return h + + +def create_predictor(config: ModelConfig): network = CBHG( in_channels=config.in_channels, conv_bank_out_channels=config.conv_bank_out_channels, @@ -173,9 +202,18 @@ def create(config: ModelConfig): conv_projections_hidden_channels=config.conv_projections_hidden_channels, highway_layers=config.highway_layers, out_channels=config.out_channels, + disable_last_rnn=config.disable_last_rnn, ) - model = Model( + predictor = Predictor( network=network, out_size=config.out_size, ) - return model + return predictor + + +def create_aligner(config: ModelConfig): + aligner = Aligner( + in_size=config.in_channels, + out_time_length=config.aligner_out_time_length, + ) + return aligner diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py index 935b95c..268de24 100644 --- a/become_yukarin/voice_changer.py +++ b/become_yukarin/voice_changer.py @@ -17,7 +17,7 @@ from become_yukarin.dataset.dataset import AcousticFeatureProcess from become_yukarin.dataset.dataset import DecodeFeatureProcess from become_yukarin.dataset.dataset import EncodeFeatureProcess from become_yukarin.dataset.dataset import WaveFileLoadProcess -from become_yukarin.model import create as create_model +from become_yukarin.model import create_predictor class 
VoiceChanger(object): @@ -25,7 +25,7 @@ class VoiceChanger(object): self.config = config self.model_path = model_path - self.model = model = create_model(config.model) + self.model = model = create_predictor(config.model) chainer.serializers.load_npz(str(model_path), model) self._param = param = config.dataset.param @@ -1,18 +1,18 @@ import argparse +from functools import partial from pathlib import Path -from chainer.iterators import MultiprocessIterator from chainer import optimizers from chainer import training -from chainer.training import extensions from chainer.dataset import convert +from chainer.iterators import MultiprocessIterator +from chainer.training import extensions from become_yukarin.config import create_from_json from become_yukarin.dataset import create as create_dataset -from become_yukarin.model import create as create_model from become_yukarin.loss import Loss - -from functools import partial +from become_yukarin.model import create_aligner +from become_yukarin.model import create_predictor parser = argparse.ArgumentParser() parser.add_argument('config_json_path', type=Path) @@ -24,8 +24,9 @@ arguments.output.mkdir(exist_ok=True) config.save_as_json((arguments.output / 'config.json').absolute()) # model -predictor = create_model(config.model) -model = Loss(config.loss, predictor=predictor) +predictor = create_predictor(config.model) +aligner = create_aligner(config.model) +model = Loss(config.loss, predictor=predictor, aligner=aligner) # dataset dataset = create_dataset(config.dataset) |
