 become_yukarin/acoustic_converter.py |   4
 become_yukarin/config/config.py      |  47
 become_yukarin/config/old_config.py  |  31
 become_yukarin/model/cbhg_model.py   | 292
 become_yukarin/model/model.py        | 313
 become_yukarin/updater/updater.py    | 116
 train.py                             |  14
 7 files changed, 476 insertions(+), 341 deletions(-)
diff --git a/become_yukarin/acoustic_converter.py b/become_yukarin/acoustic_converter.py index 498bdb1..62eacff 100644 --- a/become_yukarin/acoustic_converter.py +++ b/become_yukarin/acoustic_converter.py @@ -73,6 +73,9 @@ class AcousticConverter(object): input = self._feature_normalize(input, test=True) input = self._encode_feature(input, test=True) + pad = 128 - input.shape[1] % 128 + input = numpy.pad(input, [(0, 0), (0, pad)], mode='minimum') + converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0) inputs = converter([input]) @@ -81,6 +84,7 @@ class AcousticConverter(object): if self.gpu is not None: out = chainer.cuda.to_cpu(out) + out = out[:, :-pad] out = self._decode_feature(out, test=True) out = AcousticFeature( diff --git a/become_yukarin/config/config.py b/become_yukarin/config/config.py index ee1d68f..f49b185 100644 --- a/become_yukarin/config/config.py +++ b/become_yukarin/config/config.py @@ -27,32 +27,14 @@ class DatasetConfig(NamedTuple): num_test: int -class DiscriminatorModelConfig(NamedTuple): - in_channels: int - hidden_channels_list: List[int] - - class ModelConfig(NamedTuple): in_channels: int - conv_bank_out_channels: int - conv_bank_k: int - max_pooling_k: int - conv_projections_hidden_channels: int - highway_layers: int out_channels: int - out_size: int - aligner_out_time_length: int - disable_last_rnn: bool - enable_aligner: bool - discriminator: Optional[DiscriminatorModelConfig] class LossConfig(NamedTuple): - l1: float - predictor_fake: float - discriminator_true: float - discriminator_fake: float - discriminator_grad: float + mse: float + adversarial: float class TrainConfig(NamedTuple): @@ -100,14 +82,6 @@ def create_from_json(s: Union[str, Path]): backward_compatible(d) - if d['model']['discriminator'] is not None: - discriminator_model_config = DiscriminatorModelConfig( - in_channels=d['model']['discriminator']['in_channels'], - hidden_channels_list=d['model']['discriminator']['hidden_channels_list'], - ) - else: - discriminator_model_config = None - return Config( dataset=DatasetConfig( param=Param(), @@ -128,24 +102,11 @@ def create_from_json(s: Union[str, Path]): ), model=ModelConfig( in_channels=d['model']['in_channels'], - conv_bank_out_channels=d['model']['conv_bank_out_channels'], - conv_bank_k=d['model']['conv_bank_k'], - max_pooling_k=d['model']['max_pooling_k'], - conv_projections_hidden_channels=d['model']['conv_projections_hidden_channels'], - highway_layers=d['model']['highway_layers'], out_channels=d['model']['out_channels'], - out_size=d['model']['out_size'], - aligner_out_time_length=d['model']['aligner_out_time_length'], - disable_last_rnn=d['model']['disable_last_rnn'], - enable_aligner=d['model']['enable_aligner'], - discriminator=discriminator_model_config, ), loss=LossConfig( - l1=d['loss']['l1'], - predictor_fake=d['loss']['predictor_fake'], - discriminator_true=d['loss']['discriminator_true'], - discriminator_fake=d['loss']['discriminator_fake'], - discriminator_grad=d['loss']['discriminator_grad'], + mse=d['loss']['mse'], + adversarial=d['loss']['adversarial'], ), train=TrainConfig( batchsize=d['train']['batchsize'], diff --git a/become_yukarin/config/old_config.py b/become_yukarin/config/old_config.py new file mode 100644 index 0000000..002e2b2 --- /dev/null +++ b/become_yukarin/config/old_config.py @@ -0,0 +1,31 @@ +from typing import List +from typing import NamedTuple +from typing import Optional + + +class CBHGDiscriminatorModelConfig(NamedTuple): + in_channels: int + hidden_channels_list: 
List[int] + + +class CBHGModelConfig(NamedTuple): + in_channels: int + conv_bank_out_channels: int + conv_bank_k: int + max_pooling_k: int + conv_projections_hidden_channels: int + highway_layers: int + out_channels: int + out_size: int + aligner_out_time_length: int + disable_last_rnn: bool + enable_aligner: bool + discriminator: Optional[CBHGDiscriminatorModelConfig] + + +class CBHGLossConfig(NamedTuple): + l1: float + predictor_fake: float + discriminator_true: float + discriminator_fake: float + discriminator_grad: float diff --git a/become_yukarin/model/cbhg_model.py b/become_yukarin/model/cbhg_model.py new file mode 100644 index 0000000..59c6c71 --- /dev/null +++ b/become_yukarin/model/cbhg_model.py @@ -0,0 +1,292 @@ +from functools import partial +from typing import List + +import chainer + +from become_yukarin.config.old_config import CBHGDiscriminatorModelConfig +from become_yukarin.config.old_config import CBHGModelConfig + + +class Convolution1D(chainer.links.ConvolutionND): + def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, + nobias=False, initialW=None, initial_bias=None, + cover_all=False): + super().__init__( + ndim=1, + in_channels=in_channels, + out_channels=out_channels, + ksize=ksize, + stride=stride, + pad=pad, + nobias=nobias, + initialW=initialW, + initial_bias=initial_bias, + cover_all=cover_all, + ) + + +class LegacyConvolution1D(chainer.links.Convolution2D): + def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0, + nobias=False, initialW=None, initial_bias=None, **kwargs): + assert ksize is None or isinstance(ksize, int) + assert isinstance(stride, int) + assert isinstance(pad, int) + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + ksize=(ksize, 1), + stride=(stride, 1), + pad=(pad, 0), + nobias=nobias, + initialW=initialW, + initial_bias=initial_bias, + **kwargs, + ) + + def __call__(self, x): + assert x.shape[-1] == 1 + return super().__call__(x) + + +class ConvHighway(chainer.link.Chain): + def __init__(self, in_out_size, nobias=False, activate=chainer.functions.relu, + init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1): + super().__init__() + self.activate = activate + + with self.init_scope(): + self.plain = Convolution1D( + in_out_size, in_out_size, 1, nobias=nobias, + initialW=init_Wh, initial_bias=init_bh) + self.transform = Convolution1D( + in_out_size, in_out_size, 1, nobias=nobias, + initialW=init_Wt, initial_bias=init_bt) + + def __call__(self, x): + out_plain = self.activate(self.plain(x)) + out_transform = chainer.functions.sigmoid(self.transform(x)) + y = out_plain * out_transform + x * (1 - out_transform) + return y + + +class PreNet(chainer.link.Chain): + def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: + super().__init__() + with self.init_scope(): + self.conv1 = Convolution1D(in_channels, hidden_channels, 1) + self.conv2 = Convolution1D(hidden_channels, out_channels, 1) + + def __call__(self, x): + h = x + h = chainer.functions.dropout((chainer.functions.relu(self.conv1(h)), 0.5)) + h = chainer.functions.dropout((chainer.functions.relu(self.conv2(h)), 0.5)) + return h + + +class Conv1DBank(chainer.link.Chain): + def __init__(self, in_channels: int, out_channels: int, k: int) -> None: + super().__init__() + self.stacked_channels = out_channels * k + self.pads = [ + partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant') + for i in range(k) + ] + + with self.init_scope(): + self.convs = 
chainer.link.ChainList( + *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k)) + ) + self.bn = chainer.links.BatchNormalization(out_channels * k) + + def __call__(self, x): + h = x + h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)]) + h = chainer.functions.relu(self.bn(h)) + return h + + +class Conv1DProjections(chainer.link.Chain): + def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: + super().__init__() + + with self.init_scope(): + self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True) + self.bn1 = chainer.links.BatchNormalization(hidden_channels) + self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True) + self.bn2 = chainer.links.BatchNormalization(out_channels) + + def __call__(self, x): + h = x + h = chainer.functions.relu(self.bn1(self.conv1(h))) + h = chainer.functions.relu(self.bn2(self.conv2(h))) + return h + + +class CBHG(chainer.link.Chain): + def __init__( + self, + in_channels: int, + conv_bank_out_channels: int, + conv_bank_k: int, + max_pooling_k: int, + conv_projections_hidden_channels: int, + highway_layers: int, + out_channels: int, + disable_last_rnn: bool, + ) -> None: + super().__init__() + self.max_pooling_padding = partial( + chainer.functions.pad, + pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)), + mode='constant', + ) + self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False) + self.out_size = out_channels * (1 if disable_last_rnn else 2) + + with self.init_scope(): + self.conv_bank = Conv1DBank( + in_channels=in_channels, + out_channels=conv_bank_out_channels, + k=conv_bank_k, + ) + self.conv_projectoins = Conv1DProjections( + in_channels=self.conv_bank.stacked_channels, + hidden_channels=conv_projections_hidden_channels, + out_channels=out_channels, + ) + self.highways = chainer.link.ChainList( + *([ConvHighway(out_channels) for _ in range(highway_layers)]) + ) + if not disable_last_rnn: + self.gru = chainer.links.NStepBiGRU( + n_layers=1, + in_size=out_channels, + out_size=out_channels, + dropout=0.0, + ) + + def __call__(self, x): + h = x + h = self.conv_bank(h) + h = self.max_pooling(self.max_pooling_padding(h)) + h = self.conv_projectoins(h) + h = h + x + for highway in self.highways: + h = highway(h) + + if hasattr(self, 'gru'): + h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) + _, h = self.gru(None, h) + h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) + return h + + +class Predictor(chainer.link.Chain): + def __init__(self, network, out_size: int) -> None: + super().__init__() + with self.init_scope(): + self.network = network + self.last = Convolution1D(network.out_size, out_size, 1) + + def __call__(self, x): + h = x + h = self.network(h) + h = self.last(h) + return h + + +class Aligner(chainer.link.Chain): + def __init__(self, in_size: int, out_time_length: int) -> None: + super().__init__() + with self.init_scope(): + self.gru = chainer.links.NStepBiGRU( + n_layers=1, + in_size=in_size, + out_size=in_size // 2, + dropout=0.0, + ) + self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1) + + def __call__(self, x): + """ + :param x: (batch, channel, timeA) + """ + h = x + h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel) + _, h = self.gru(None, h) # h: batch * (timeA, ?) 
+ h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA) + h = chainer.functions.softmax(self.last(h), axis=1) # h: (batch, timeB, timeA) + + h = chainer.functions.matmul(x, h) # h: (batch, channel, time) + return h + + +class Discriminator(chainer.link.Chain): + def __init__(self, in_channels: int, hidden_channels_list: List[int]) -> None: + super().__init__() + with self.init_scope(): + self.convs = chainer.link.ChainList(*( + LegacyConvolution1D(i_c, o_c, ksize=2, stride=2) + for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list) + )) + self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1) + + def __call__(self, x): + """ + :param x: (batch, channel, time) + """ + h = x + h = chainer.functions.reshape(h, h.shape + (1,)) + for conv in self.convs.children(): + h = chainer.functions.relu(conv(h)) + h = self.last_conv(h) + h = chainer.functions.reshape(h, h.shape[:-1]) + return h + + +def create_predictor(config: CBHGModelConfig): + network = CBHG( + in_channels=config.in_channels, + conv_bank_out_channels=config.conv_bank_out_channels, + conv_bank_k=config.conv_bank_k, + max_pooling_k=config.max_pooling_k, + conv_projections_hidden_channels=config.conv_projections_hidden_channels, + highway_layers=config.highway_layers, + out_channels=config.out_channels, + disable_last_rnn=config.disable_last_rnn, + ) + predictor = Predictor( + network=network, + out_size=config.out_size, + ) + return predictor + + +def create_aligner(config: CBHGModelConfig): + assert config.enable_aligner + aligner = Aligner( + in_size=config.in_channels, + out_time_length=config.aligner_out_time_length, + ) + return aligner + + +def create_discriminator(config: CBHGDiscriminatorModelConfig): + discriminator = Discriminator( + in_channels=config.in_channels, + hidden_channels_list=config.hidden_channels_list, + ) + return discriminator + + +def create(config: CBHGModelConfig): + predictor = create_predictor(config) + if config.enable_aligner: + aligner = create_aligner(config) + else: + aligner = None + if config.discriminator is not None: + discriminator = create_discriminator(config.discriminator) + else: + discriminator = None + return predictor, aligner, discriminator diff --git a/become_yukarin/model/model.py b/become_yukarin/model/model.py index 71fb805..56870d9 100644 --- a/become_yukarin/model/model.py +++ b/become_yukarin/model/model.py @@ -1,16 +1,14 @@ -from functools import partial -from typing import List - import chainer +import chainer.functions as F +import chainer.links as L -from become_yukarin.config.config import DiscriminatorModelConfig from become_yukarin.config.config import ModelConfig class Convolution1D(chainer.links.ConvolutionND): def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, nobias=False, initialW=None, initial_bias=None, - cover_all=False): + cover_all=False) -> None: super().__init__( ndim=1, in_channels=in_channels, @@ -25,268 +23,135 @@ class Convolution1D(chainer.links.ConvolutionND): ) -class LegacyConvolution1D(chainer.links.Convolution2D): - def __init__(self, in_channels, out_channels, ksize=None, stride=1, pad=0, - nobias=False, initialW=None, initial_bias=None, **kwargs): - assert ksize is None or isinstance(ksize, int) - assert isinstance(stride, int) - assert isinstance(pad, int) +class Deconvolution1D(chainer.links.DeconvolutionND): + def __init__(self, in_channels, out_channels, ksize, stride=1, pad=0, + nobias=False, outsize=None, + initialW=None, 
initial_bias=None) -> None: super().__init__( + ndim=1, in_channels=in_channels, out_channels=out_channels, - ksize=(ksize, 1), - stride=(stride, 1), - pad=(pad, 0), + ksize=ksize, + stride=stride, + pad=pad, nobias=nobias, + outsize=outsize, initialW=initialW, initial_bias=initial_bias, - **kwargs, ) - def __call__(self, x): - assert x.shape[-1] == 1 - return super().__call__(x) - - -class ConvHighway(chainer.link.Chain): - def __init__(self, in_out_size, nobias=False, activate=chainer.functions.relu, - init_Wh=None, init_Wt=None, init_bh=None, init_bt=-1): - super().__init__() - self.activate = activate - - with self.init_scope(): - self.plain = Convolution1D( - in_out_size, in_out_size, 1, nobias=nobias, - initialW=init_Wh, initial_bias=init_bh) - self.transform = Convolution1D( - in_out_size, in_out_size, 1, nobias=nobias, - initialW=init_Wt, initial_bias=init_bt) - - def __call__(self, x): - out_plain = self.activate(self.plain(x)) - out_transform = chainer.functions.sigmoid(self.transform(x)) - y = out_plain * out_transform + x * (1 - out_transform) - return y - - -class PreNet(chainer.link.Chain): - def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: - super().__init__() - with self.init_scope(): - self.conv1 = Convolution1D(in_channels, hidden_channels, 1) - self.conv2 = Convolution1D(hidden_channels, out_channels, 1) - - def __call__(self, x): - h = x - h = chainer.functions.dropout((chainer.functions.relu(self.conv1(h)), 0.5)) - h = chainer.functions.dropout((chainer.functions.relu(self.conv2(h)), 0.5)) - return h - -class Conv1DBank(chainer.link.Chain): - def __init__(self, in_channels: int, out_channels: int, k: int) -> None: +class CBR(chainer.Chain): + def __init__(self, ch0, ch1, bn=True, sample='down', activation=F.relu, dropout=False) -> None: super().__init__() - self.stacked_channels = out_channels * k - self.pads = [ - partial(chainer.functions.pad, pad_width=((0, 0), (0, 0), (i // 2, (i + 1) // 2)), mode='constant') - for i in range(k) - ] + self.bn = bn + self.activation = activation + self.dropout = dropout + w = chainer.initializers.Normal(0.02) with self.init_scope(): - self.convs = chainer.link.ChainList( - *(Convolution1D(in_channels, out_channels, i + 1, nobias=True) for i in range(k)) - ) - self.bn = chainer.links.BatchNormalization(out_channels * k) + if sample == 'down': + self.c = Convolution1D(ch0, ch1, 4, 2, 1, initialW=w) + else: + self.c = Deconvolution1D(ch0, ch1, 4, 2, 1, initialW=w) + if bn: + self.batchnorm = L.BatchNormalization(ch1) def __call__(self, x): - h = x - h = chainer.functions.concat([conv(pad(h)) for pad, conv in zip(self.pads, self.convs)]) - h = chainer.functions.relu(self.bn(h)) + h = self.c(x) + if self.bn: + h = self.batchnorm(h) + if self.dropout: + h = F.dropout(h) + if self.activation is not None: + h = self.activation(h) return h -class Conv1DProjections(chainer.link.Chain): - def __init__(self, in_channels: int, hidden_channels: int, out_channels: int) -> None: +class Encoder(chainer.Chain): + def __init__(self, in_ch) -> None: super().__init__() - + w = chainer.initializers.Normal(0.02) with self.init_scope(): - self.conv1 = Convolution1D(in_channels, hidden_channels, 3, pad=1, nobias=True) - self.bn1 = chainer.links.BatchNormalization(hidden_channels) - self.conv2 = Convolution1D(hidden_channels, out_channels, 3, pad=1, nobias=True) - self.bn2 = chainer.links.BatchNormalization(out_channels) + self.c0 = Convolution1D(in_ch, 64, 3, 1, 1, initialW=w) + self.c1 = CBR(64, 128, bn=True, 
sample='down', activation=F.leaky_relu, dropout=False) + self.c2 = CBR(128, 256, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c3 = CBR(256, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c4 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c5 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c6 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c7 = CBR(512, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) def __call__(self, x): - h = x - h = chainer.functions.relu(self.bn1(self.conv1(h))) - h = chainer.functions.relu(self.bn2(self.conv2(h))) - return h + hs = [F.leaky_relu(self.c0(x))] + for i in range(1, 8): + hs.append(self['c%d' % i](hs[i - 1])) + return hs -class CBHG(chainer.link.Chain): - def __init__( - self, - in_channels: int, - conv_bank_out_channels: int, - conv_bank_k: int, - max_pooling_k: int, - conv_projections_hidden_channels: int, - highway_layers: int, - out_channels: int, - disable_last_rnn: bool, - ) -> None: +class Decoder(chainer.Chain): + def __init__(self, out_ch) -> None: super().__init__() - self.max_pooling_padding = partial( - chainer.functions.pad, - pad_width=((0, 0), (0, 0), ((max_pooling_k - 1) // 2, max_pooling_k // 2)), - mode='constant', - ) - self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False) - self.out_size = out_channels * (1 if disable_last_rnn else 2) - + w = chainer.initializers.Normal(0.02) with self.init_scope(): - self.conv_bank = Conv1DBank( - in_channels=in_channels, - out_channels=conv_bank_out_channels, - k=conv_bank_k, - ) - self.conv_projectoins = Conv1DProjections( - in_channels=self.conv_bank.stacked_channels, - hidden_channels=conv_projections_hidden_channels, - out_channels=out_channels, - ) - self.highways = chainer.link.ChainList( - *([ConvHighway(out_channels) for _ in range(highway_layers)]) - ) - if not disable_last_rnn: - self.gru = chainer.links.NStepBiGRU( - n_layers=1, - in_size=out_channels, - out_size=out_channels, - dropout=0.0, - ) + self.c0 = CBR(512, 512, bn=True, sample='up', activation=F.relu, dropout=True) + self.c1 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=True) + self.c2 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=True) + self.c3 = CBR(1024, 512, bn=True, sample='up', activation=F.relu, dropout=False) + self.c4 = CBR(1024, 256, bn=True, sample='up', activation=F.relu, dropout=False) + self.c5 = CBR(512, 128, bn=True, sample='up', activation=F.relu, dropout=False) + self.c6 = CBR(256, 64, bn=True, sample='up', activation=F.relu, dropout=False) + self.c7 = Convolution1D(128, out_ch, 3, 1, 1, initialW=w) - def __call__(self, x): - h = x - h = self.conv_bank(h) - h = self.max_pooling(self.max_pooling_padding(h)) - h = self.conv_projectoins(h) - h = h + x - for highway in self.highways: - h = highway(h) - - if hasattr(self, 'gru'): - h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) - _, h = self.gru(None, h) - h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) + def __call__(self, hs): + h = self.c0(hs[-1]) + for i in range(1, 8): + h = F.concat([h, hs[-i - 1]]) + if i < 7: + h = self['c%d' % i](h) + else: + h = self.c7(h) return h -class Predictor(chainer.link.Chain): - def __init__(self, network, out_size: int) -> None: +class Predictor(chainer.Chain): + def __init__(self, in_ch, out_ch) -> None: 
super().__init__() with self.init_scope(): - self.network = network - self.last = Convolution1D(network.out_size, out_size, 1) + self.encoder = Encoder(in_ch) + self.decoder = Decoder(out_ch) def __call__(self, x): - h = x - h = self.network(h) - h = self.last(h) - return h + return self.decoder(self.encoder(x)) -class Aligner(chainer.link.Chain): - def __init__(self, in_size: int, out_time_length: int) -> None: +class Discriminator(chainer.Chain): + def __init__(self, in_ch, out_ch) -> None: super().__init__() + w = chainer.initializers.Normal(0.02) with self.init_scope(): - self.gru = chainer.links.NStepBiGRU( - n_layers=1, - in_size=in_size, - out_size=in_size // 2, - dropout=0.0, - ) - self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1) + self.c0_0 = CBR(in_ch, 32, bn=False, sample='down', activation=F.leaky_relu, dropout=False) + self.c0_1 = CBR(out_ch, 32, bn=False, sample='down', activation=F.leaky_relu, dropout=False) + self.c1 = CBR(64, 128, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c2 = CBR(128, 256, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c3 = CBR(256, 512, bn=True, sample='down', activation=F.leaky_relu, dropout=False) + self.c4 = Convolution1D(512, 1, 3, 1, 1, initialW=w) - def __call__(self, x): - """ - :param x: (batch, channel, timeA) - """ - h = x - h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel) - _, h = self.gru(None, h) # h: batch * (timeA, ?) - h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA) - h = chainer.functions.softmax(self.last(h), axis=1) # h: (batch, timeB, timeA) - - h = chainer.functions.matmul(x, h) # h: (batch, channel, time) - return h - - -class Discriminator(chainer.link.Chain): - def __init__(self, in_channels: int, hidden_channels_list: List[int]) -> None: - super().__init__() - with self.init_scope(): - self.convs = chainer.link.ChainList(*( - LegacyConvolution1D(i_c, o_c, ksize=2, stride=2) - for i_c, o_c in zip([in_channels] + hidden_channels_list[:-1], hidden_channels_list) - )) - self.last_conv = LegacyConvolution1D(hidden_channels_list[-1], 1, ksize=1) - - def __call__(self, x): - """ - :param x: (batch, channel, time) - """ - h = x - h = chainer.functions.reshape(h, h.shape + (1,)) - for conv in self.convs.children(): - h = chainer.functions.relu(conv(h)) - h = self.last_conv(h) - h = chainer.functions.reshape(h, h.shape[:-1]) + def __call__(self, x_0, x_1): + h = F.concat([self.c0_0(x_0), self.c0_1(x_1)]) + h = self.c1(h) + h = self.c2(h) + h = self.c3(h) + h = self.c4(h) + # h = F.average_pooling_2d(h, h.data.shape[2], 1, 0) return h def create_predictor(config: ModelConfig): - network = CBHG( - in_channels=config.in_channels, - conv_bank_out_channels=config.conv_bank_out_channels, - conv_bank_k=config.conv_bank_k, - max_pooling_k=config.max_pooling_k, - conv_projections_hidden_channels=config.conv_projections_hidden_channels, - highway_layers=config.highway_layers, - out_channels=config.out_channels, - disable_last_rnn=config.disable_last_rnn, - ) - predictor = Predictor( - network=network, - out_size=config.out_size, - ) - return predictor - - -def create_aligner(config: ModelConfig): - assert config.enable_aligner - aligner = Aligner( - in_size=config.in_channels, - out_time_length=config.aligner_out_time_length, - ) - return aligner - - -def create_discriminator(config: DiscriminatorModelConfig): - discriminator = Discriminator( - 
in_channels=config.in_channels, - hidden_channels_list=config.hidden_channels_list, - ) - return discriminator + return Predictor(in_ch=config.in_channels, out_ch=config.out_channels) def create(config: ModelConfig): predictor = create_predictor(config) - if config.enable_aligner: - aligner = create_aligner(config) - else: - aligner = None - if config.discriminator is not None: - discriminator = create_discriminator(config.discriminator) - else: - discriminator = None - return predictor, aligner, discriminator + discriminator = Discriminator(in_ch=config.in_channels, out_ch=config.out_channels) + return predictor, discriminator diff --git a/become_yukarin/updater/updater.py b/become_yukarin/updater/updater.py index 8dcb215..eb51068 100644 --- a/become_yukarin/updater/updater.py +++ b/become_yukarin/updater/updater.py @@ -1,9 +1,7 @@ import chainer -import numpy -from chainer import reporter +import chainer.functions as F from become_yukarin.config.config import LossConfig -from become_yukarin.model.model import Aligner from become_yukarin.model.model import Discriminator from become_yukarin.model.model import Predictor @@ -13,91 +11,77 @@ class Updater(chainer.training.StandardUpdater): self, loss_config: LossConfig, predictor: Predictor, - aligner: Aligner = None, - discriminator: Discriminator = None, + discriminator: Discriminator, *args, **kwargs, ) -> None: super().__init__(*args, **kwargs) self.loss_config = loss_config self.predictor = predictor - self.aligner = aligner self.discriminator = discriminator - def forward(self, input, target, mask): - xp = self.predictor.xp + def _loss_predictor(self, predictor, output, target, d_fake): + b, _, t = d_fake.data.shape - input = chainer.as_variable(input) - target = chainer.as_variable(target) - mask = chainer.as_variable(mask) + loss_mse = (F.mean_absolute_error(output, target)) + chainer.report({'mse': loss_mse}, predictor) - if self.aligner is not None: - input = self.aligner(input) - y = self.predictor(input) + loss_adv = F.sum(F.softplus(-d_fake)) / (b * t) + chainer.report({'adversarial': loss_adv}, predictor) - loss_l1 = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask) - loss_l1 = loss_l1 / chainer.functions.sum(mask) - reporter.report({'l1': loss_l1}, self.predictor) + loss = self.loss_config.mse * loss_mse + self.loss_config.adversarial * loss_adv + chainer.report({'loss': loss}, predictor) + return loss - if self.discriminator is not None: - pair_fake = chainer.functions.concat([y * mask, input]) - pair_true = chainer.functions.concat([target * mask, input]) + def _loss_discriminator(self, discriminator, d_real, d_fake): + b, _, t = d_real.data.shape - # DRAGAN - if chainer.config.train: # grad is not available on test - std = xp.std(pair_true.data, axis=0, keepdims=True) - rand = xp.random.uniform(0, 1, pair_true.shape).astype(xp.float32) - perturb = chainer.Variable(pair_true.data + 0.5 * rand * std) - grad, = chainer.grad([self.discriminator(perturb)], [perturb], enable_double_backprop=True) - grad = chainer.functions.sqrt(chainer.functions.batch_l2_norm_squared(grad)) - loss_grad = chainer.functions.mean_squared_error(grad, xp.ones_like(grad.data, numpy.float32)) - reporter.report({'grad': loss_grad}, self.discriminator) + loss_real = F.sum(F.softplus(-d_real)) / (b * t) + chainer.report({'real': loss_real}, discriminator) - if xp.any(xp.isnan(loss_grad.data)): - import code - code.interact(local=locals()) + loss_fake = F.sum(F.softplus(d_fake)) / (b * t) + chainer.report({'fake': loss_fake}, 
discriminator) - # GAN - d_fake = self.discriminator(pair_fake) - d_true = self.discriminator(pair_true) - loss_dis_f = chainer.functions.average(chainer.functions.softplus(d_fake)) - loss_dis_t = chainer.functions.average(chainer.functions.softplus(-d_true)) - loss_gen_f = chainer.functions.average(chainer.functions.softplus(-d_fake)) - reporter.report({'fake': loss_dis_f}, self.discriminator) - reporter.report({'true': loss_dis_t}, self.discriminator) + loss = loss_real + loss_fake + chainer.report({'loss': loss}, discriminator) - tp = (d_true.data > 0.5).sum() - fp = (d_fake.data > 0.5).sum() - fn = (d_true.data <= 0.5).sum() - tn = (d_fake.data <= 0.5).sum() - accuracy = (tp + tn) / (tp + fp + fn + tn) - precision = tp / (tp + fp) - recall = tp / (tp + fn) - reporter.report({'accuracy': accuracy}, self.discriminator) - reporter.report({'precision': precision}, self.discriminator) - reporter.report({'recall': recall}, self.discriminator) + tp = (d_real.data > 0.5).sum() + fp = (d_fake.data > 0.5).sum() + fn = (d_real.data <= 0.5).sum() + tn = (d_fake.data <= 0.5).sum() + accuracy = (tp + tn) / (tp + fp + fn + tn) + precision = tp / (tp + fp) + recall = tp / (tp + fn) + chainer.report({'accuracy': accuracy}, self.discriminator) + chainer.report({'precision': precision}, self.discriminator) + chainer.report({'recall': recall}, self.discriminator) + return loss - loss = {'predictor': loss_l1 * self.loss_config.l1} + def forward(self, input, target, mask): + input = chainer.as_variable(input) + target = chainer.as_variable(target) + mask = chainer.as_variable(mask) - if self.aligner is not None: - loss['aligner'] = loss_l1 * self.loss_config.l1 - reporter.report({'loss': loss['aligner']}, self.aligner) + output = self.predictor(input) + output = output * mask + target = target * mask - if self.discriminator is not None: - loss['discriminator'] = \ - loss_dis_f * self.loss_config.discriminator_fake + \ - loss_dis_t * self.loss_config.discriminator_true - if chainer.config.train: # grad is not available on test - loss['discriminator'] += loss_grad * self.loss_config.discriminator_grad - reporter.report({'loss': loss['discriminator']}, self.discriminator) - loss['predictor'] += loss_gen_f * self.loss_config.predictor_fake + d_fake = self.discriminator(input, output) + d_real = self.discriminator(input, target) - reporter.report({'loss': loss['predictor']}, self.predictor) + loss = { + 'predictor': self._loss_predictor(self.predictor, output, target, d_fake), + 'discriminator': self._loss_discriminator(self.discriminator, d_real, d_fake), + } return loss def update_core(self): + opt_predictor = self.get_optimizer('predictor') + opt_discriminator = self.get_optimizer('discriminator') + batch = self.get_iterator('main').next() - loss = self.forward(**self.converter(batch, self.device)) + batch = self.converter(batch, self.device) + loss = self.forward(**batch) - for k, opt in self.get_all_optimizers().items(): - opt.update(loss.get, k) + opt_predictor.update(loss.get, 'predictor') + opt_discriminator.update(loss.get, 'discriminator') @@ -27,12 +27,11 @@ config.save_as_json((arguments.output / 'config.json').absolute()) # model if config.train.gpu >= 0: cuda.get_device_from_id(config.train.gpu).use() -predictor, aligner, discriminator = create(config.model) -models = {'predictor': predictor} -if aligner is not None: - models['aligner'] = aligner -if discriminator is not None: - models['discriminator'] = discriminator +predictor, discriminator = create(config.model) +models = { + 'predictor': 
predictor, + 'discriminator': discriminator, +} # dataset dataset = create_dataset(config.dataset) @@ -43,7 +42,7 @@ train_eval_iter = MultiprocessIterator(dataset['train_eval'], config.train.batch # optimizer def create_optimizer(model): - optimizer = optimizers.Adam() + optimizer = optimizers.Adam(alpha=0.0002, beta1=0.5, beta2=0.999) optimizer.setup(model) return optimizer @@ -55,7 +54,6 @@ converter = partial(convert.concat_examples, padding=0) updater = Updater( loss_config=config.loss, predictor=predictor, - aligner=aligner, discriminator=discriminator, device=config.train.gpu, iterator=train_iter, |
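
The two hunks in become_yukarin/acoustic_converter.py pad the encoded feature up to a multiple of 128 frames before conversion and crop the padding off afterwards; 128 matches the total downsampling of the new encoder in model.py (seven stride-2 convolutions, 2**7 = 128). Below is a minimal standalone sketch of that pad-then-crop round trip using only numpy; the 60x300 feature shape is an illustrative assumption, not a value from the repository.

```python
import numpy

# Illustrative encoded feature with shape (feature_dim, time); the time
# length 300 is deliberately not a multiple of 128.
feature = numpy.random.rand(60, 300).astype(numpy.float32)

# Same arithmetic as the added lines in AcousticConverter: pad the time
# axis up to the next multiple of 128 with the per-axis minimum value.
pad = 128 - feature.shape[1] % 128          # 128 - 300 % 128 = 84
padded = numpy.pad(feature, [(0, 0), (0, pad)], mode='minimum')
assert padded.shape[1] % 128 == 0           # 300 -> 384

# After the network runs, the extra frames are cropped again (out[:, :-pad]).
cropped = padded[:, :-pad]
assert cropped.shape == feature.shape
```

Note that when the length is already a multiple of 128, `128 - length % 128` evaluates to 128, so a full extra block is padded and then cropped again; the result is still correct, just slightly wasteful.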
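
Similarly, a minimal shape check of the new Predictor/Discriminator pair defined in become_yukarin/model/model.py. The channel count (60) and the time length (512, a multiple of 128) are illustrative assumptions; real values come from config.json via ModelConfig.

```python
import numpy
import chainer

from become_yukarin.config.config import ModelConfig
from become_yukarin.model.model import create

predictor, discriminator = create(ModelConfig(in_channels=60, out_channels=60))

# (batch, channel, time); time must be a multiple of 2**7 = 128.
x = numpy.zeros((1, 60, 512), dtype=numpy.float32)

with chainer.using_config('train', False):
    y = predictor(x)         # (1, 60, 512): the encoder halves the time axis
                             # seven times, the decoder restores it
    d = discriminator(x, y)  # (1, 1, 32): four stride-2 blocks, then a final
                             # 1-channel convolution

print(y.shape, d.shape)
```

The new Updater feeds (input, output) and (input, target) pairs to this discriminator and averages the softplus losses over the (batch, time) grid it returns.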
