author    Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>    2017-11-20 03:06:39 +0900
committer Hiroshiba Kazuyuki <kazuyuki_hiroshiba@dwango.co.jp>    2017-11-20 03:06:39 +0900
commit    16b4e72fe6728e2e64d4c6357b7c73ac06868c1c (patch)
tree      657f0398b9a237ab46327d08f58a230b9581669b
parent    437a869590c989c184d33990b1d788149d073ee9 (diff)
aligner
-rw-r--r--   become_yukarin/config.py             4
-rw-r--r--   become_yukarin/dataset/dataset.py  102
-rw-r--r--   become_yukarin/loss.py              12
-rw-r--r--   become_yukarin/model.py             66
-rw-r--r--   become_yukarin/voice_changer.py      4
-rw-r--r--   train.py                            15
6 files changed, 156 insertions, 47 deletions
diff --git a/become_yukarin/config.py b/become_yukarin/config.py
index 05b0790..07f35fd 100644
--- a/become_yukarin/config.py
+++ b/become_yukarin/config.py
@@ -32,6 +32,8 @@ class ModelConfig(NamedTuple):
highway_layers: int
out_channels: int
out_size: int
+ aligner_out_time_length: int
+ disable_last_rnn: bool
class LossConfig(NamedTuple):
@@ -100,6 +102,8 @@ def create_from_json(s: Union[str, Path]):
highway_layers=d['model']['highway_layers'],
out_channels=d['model']['out_channels'],
out_size=d['model']['out_size'],
+ aligner_out_time_length=d['model']['aligner_out_time_length'],
+ disable_last_rnn=d['model']['disable_last_rnn'],
),
loss=LossConfig(
l1=d['loss']['l1'],
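
The two new fields flow straight from the JSON into ModelConfig. A hypothetical fragment of the dict that create_from_json parses; only the two new keys come from this diff, the surrounding values are illustrative:

import json

# hypothetical input to create_from_json; values other than the two new keys are illustrative
d = json.loads("""
{
    "model": {
        "aligner_out_time_length": 512,
        "disable_last_rnn": false
    }
}
""")
print(d['model']['aligner_out_time_length'])  # timeB: fixed output length of the Aligner
print(d['model']['disable_last_rnn'])         # True drops CBHG's final bidirectional GRU
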
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 83936b1..329226c 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -252,7 +252,41 @@ class ShapeAlignProcess(BaseDataProcess):
return data
-class CropProcess(BaseDataProcess):
+class RandomPaddingProcess(BaseDataProcess):
+ def __init__(self, min_size: int, time_axis: int = 1):
+ assert time_axis == 1
+ self._min_size = min_size
+ self._time_axis = time_axis
+
+ def __call__(self, datas: Dict[str, any], test=True):
+ assert not test
+
+ data, seed = datas['data'], datas['seed']
+ random = numpy.random.RandomState(seed)
+
+ if data.shape[self._time_axis] >= self._min_size:
+ return data
+
+ pre = random.randint(self._min_size - data.shape[self._time_axis] + 1)
+ post = self._min_size - pre
+ return numpy.pad(data, ((0, 0), (pre, post)), mode='constant')
+
+
+class LastPaddingProcess(BaseDataProcess):
+ def __init__(self, min_size: int, time_axis: int = 1):
+ assert time_axis == 1
+ self._min_size = min_size
+ self._time_axis = time_axis
+
+ def __call__(self, data: numpy.ndarray, test=None):
+ if data.shape[self._time_axis] >= self._min_size:
+ return data
+
+ pre = self._min_size - data.shape[self._time_axis]
+ return numpy.pad(data, ((0, 0), (pre, 0)), mode='constant')
+
+
+class RandomCropProcess(BaseDataProcess):
def __init__(self, crop_size: int, time_axis: int = 1):
self._crop_size = crop_size
self._time_axis = time_axis
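
Both padding classes only act when the sequence is shorter than min_size. A minimal numpy sketch of what RandomPaddingProcess does to a (channel, time) array; note that the padded length works out to data.shape[1] + min_size, comfortably past the minimum, and the RandomCropProcess applied afterwards trims back down to crop_size:

import numpy

data = numpy.ones((2, 3))                    # 2 channels, 3 frames; shorter than min_size
min_size = 5
random = numpy.random.RandomState(0)         # seed shared across input/target/mask

pre = random.randint(min_size - data.shape[1] + 1)   # pre drawn from {0, 1, 2}
post = min_size - pre
padded = numpy.pad(data, ((0, 0), (pre, post)), mode='constant')
print(padded.shape)                          # (2, 8): 3 + min_size frames, zeros around the data
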
@@ -270,6 +304,15 @@ class CropProcess(BaseDataProcess):
return numpy.split(data, [start, start + self._crop_size], axis=self._time_axis)[1]
+class FirstCropProcess(BaseDataProcess):
+ def __init__(self, crop_size: int, time_axis: int = 1):
+ self._crop_size = crop_size
+ self._time_axis = time_axis
+
+ def __call__(self, data: numpy.ndarray, test=None):
+ return numpy.split(data, [0, self._crop_size], axis=self._time_axis)[1]
+
+
class AddNoiseProcess(BaseDataProcess):
def __init__(self, p_global: float = None, p_local: float = None):
assert p_global is None or 0 <= p_global
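
Both crop classes lean on numpy.split, where split(data, [a, b], axis)[1] is just the a:b slice along that axis; FirstCropProcess is the degenerate start=0 case. A quick sketch:

import numpy

data = numpy.arange(12).reshape(2, 6)        # (channel, time)
crop_size, start = 4, 1

print(numpy.split(data, [start, start + crop_size], axis=1)[1])  # RandomCropProcess: frames 1..4
print(numpy.split(data, [0, crop_size], axis=1)[1])              # FirstCropProcess: frames 0..3
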
@@ -338,24 +381,28 @@ def create(config: DatasetConfig):
])
data_process_train = copy.deepcopy(data_process_base)
- if config.train_crop_size is not None:
- data_process_train.append(ChainProcess([
- LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 32), **d)),
- SplitProcess(dict(
- input=ChainProcess([
- LambdaProcess(lambda d, test: dict(data=d['input'], seed=d['seed'])),
- CropProcess(crop_size=config.train_crop_size),
- ]),
- target=ChainProcess([
- LambdaProcess(lambda d, test: dict(data=d['target'], seed=d['seed'])),
- CropProcess(crop_size=config.train_crop_size),
- ]),
- mask=ChainProcess([
- LambdaProcess(lambda d, test: dict(data=d['mask'], seed=d['seed'])),
- CropProcess(crop_size=config.train_crop_size),
- ]),
- )),
- ]))
+
+ def add_seed():
+ return LambdaProcess(lambda d, test: dict(seed=numpy.random.randint(2 ** 32), **d))
+
+ def padding(s):
+ return ChainProcess([
+ LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])),
+ RandomPaddingProcess(min_size=config.train_crop_size),
+ ])
+
+ def crop(s):
+ return ChainProcess([
+ LambdaProcess(lambda d, test: dict(data=d[s], seed=d['seed'])),
+ RandomCropProcess(crop_size=config.train_crop_size),
+ ])
+
+ data_process_train.append(ChainProcess([
+ add_seed(),
+ SplitProcess(dict(input=padding('input'), target=padding('target'), mask=padding('mask'))),
+ add_seed(),
+ SplitProcess(dict(input=crop('input'), target=crop('target'), mask=crop('mask'))),
+ ]))
# add noise
data_process_train.append(SplitProcess(dict(
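
The refactored training pipeline draws one seed, pads input/target/mask with it, draws a second seed, then crops all three with that one. The point of threading the same seed through every SplitProcess branch is that each branch replays the same random choices, so the three arrays stay frame-aligned. A hedged sketch of the mechanism:

import numpy

seed = numpy.random.randint(2 ** 32)
for name in ('input', 'target', 'mask'):
    random = numpy.random.RandomState(seed)  # fresh RandomState per branch, same seed
    print(name, random.randint(10))          # identical draw in every branch
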
@@ -373,6 +420,23 @@ def create(config: DatasetConfig):
)))
data_process_test = data_process_base
+ data_process_test.append(SplitProcess(dict(
+ input=ChainProcess([
+ LambdaProcess(lambda d, test: d['input']),
+ LastPaddingProcess(min_size=config.train_crop_size),
+ FirstCropProcess(crop_size=config.train_crop_size),
+ ]),
+ target=ChainProcess([
+ LambdaProcess(lambda d, test: d['target']),
+ LastPaddingProcess(min_size=config.train_crop_size),
+ FirstCropProcess(crop_size=config.train_crop_size),
+ ]),
+ mask=ChainProcess([
+ LambdaProcess(lambda d, test: d['mask']),
+ LastPaddingProcess(min_size=config.train_crop_size),
+ FirstCropProcess(crop_size=config.train_crop_size),
+ ]),
+ )))
num_test = config.num_test
pairs = [
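
The test pipeline is the deterministic counterpart, so no seed is involved: LastPaddingProcess pads with zeros (in front of the sequence, despite the name) up to train_crop_size, and FirstCropProcess keeps the leading train_crop_size frames, so every test example comes out exactly that long. A small sketch:

import numpy

data = numpy.ones((2, 3))                    # shorter than train_crop_size
crop_size = 5

pre = crop_size - data.shape[1]
padded = numpy.pad(data, ((0, 0), (pre, 0)), mode='constant')   # zeros prepended
cropped = numpy.split(padded, [0, crop_size], axis=1)[1]        # leading crop_size frames
print(cropped.shape)                         # (2, 5)
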
diff --git a/become_yukarin/loss.py b/become_yukarin/loss.py
index 3d89908..c59747a 100644
--- a/become_yukarin/loss.py
+++ b/become_yukarin/loss.py
@@ -1,18 +1,19 @@
-from .config import LossConfig
-from .model import Model
-
import chainer
-
from chainer import reporter
+from .config import LossConfig
+from .model import Aligner
+from .model import Predictor
+
class Loss(chainer.link.Chain):
- def __init__(self, config: LossConfig, predictor: Model):
+ def __init__(self, config: LossConfig, predictor: Predictor, aligner: Aligner):
super().__init__()
self.config = config
with self.init_scope():
self.predictor = predictor
+ self.aligner = aligner
def __call__(self, input, target, mask):
input = chainer.as_variable(input)
@@ -20,6 +21,7 @@ class Loss(chainer.link.Chain):
mask = chainer.as_variable(mask)
h = input
+ h = self.aligner(h)
y = self.predictor(h)
loss = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask)
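
After this change the forward pass is input → aligner → predictor, and the objective stays the masked L1 shown above. A toy sketch of that final loss computation, with random stand-ins for the predictor output (shapes hypothetical):

import chainer
import numpy

y = chainer.as_variable(numpy.random.randn(1, 4, 8).astype(numpy.float32))       # predictor output
target = chainer.as_variable(numpy.random.randn(1, 4, 8).astype(numpy.float32))
mask = chainer.as_variable(numpy.ones((1, 4, 8), dtype=numpy.float32))

loss = chainer.functions.sum(chainer.functions.absolute_error(y, target) * mask)
print(loss.shape)                            # (): a scalar, reported as 'loss' during training
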
diff --git a/become_yukarin/model.py b/become_yukarin/model.py
index 9d69378..6dfb2c1 100644
--- a/become_yukarin/model.py
+++ b/become_yukarin/model.py
@@ -105,6 +105,7 @@ class CBHG(chainer.link.Chain):
conv_projections_hidden_channels: int,
highway_layers: int,
out_channels: int,
+ disable_last_rnn: bool,
):
super().__init__()
self.max_pooling_padding = chainer.functions.Pad(
@@ -112,7 +113,7 @@ class CBHG(chainer.link.Chain):
mode='constant',
)
self.max_pooling = chainer.functions.MaxPoolingND(1, max_pooling_k, 1, cover_all=False)
- self.out_size = out_channels * 2
+ self.out_size = out_channels * (1 if disable_last_rnn else 2)
with self.init_scope():
self.conv_bank = Conv1DBank(
@@ -128,12 +129,13 @@ class CBHG(chainer.link.Chain):
self.highways = chainer.link.ChainList(
*([ConvHighway(out_channels) for _ in range(highway_layers)])
)
- self.gru = chainer.links.NStepBiGRU(
- n_layers=1,
- in_size=out_channels,
- out_size=out_channels,
- dropout=0.0,
- )
+ if not disable_last_rnn:
+ self.gru = chainer.links.NStepBiGRU(
+ n_layers=1,
+ in_size=out_channels,
+ out_size=out_channels,
+ dropout=0.0,
+ )
def __call__(self, x):
h = x
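
The out_size bookkeeping above follows from NStepBiGRU concatenating the forward and backward hidden states, so each timestep comes out at 2 * out_size; with the GRU disabled, the highway output's out_channels passes through unchanged. A quick check of the doubling:

import chainer
import numpy

gru = chainer.links.NStepBiGRU(n_layers=1, in_size=4, out_size=4, dropout=0.0)
xs = [numpy.zeros((10, 4), dtype=numpy.float32)]   # one sequence: (time, channel)
_, ys = gru(None, xs)
print(ys[0].shape)                           # (10, 8): forward and backward halves concatenated
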
@@ -144,13 +146,14 @@ class CBHG(chainer.link.Chain):
for highway in self.highways:
h = highway(h)
- h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))
- _, h = self.gru(None, h)
- h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))
+ if hasattr(self, 'gru'):
+ h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1)))
+ _, h = self.gru(None, h)
+ h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1))
return h
-class Model(chainer.link.Chain):
+class Predictor(chainer.link.Chain):
def __init__(self, network, out_size: int):
super().__init__()
with self.init_scope():
@@ -164,7 +167,33 @@ class Model(chainer.link.Chain):
return h
-def create(config: ModelConfig):
+class Aligner(chainer.link.Chain):
+ def __init__(self, in_size: int, out_time_length: int):
+ super().__init__()
+ with self.init_scope():
+ self.gru = chainer.links.NStepBiGRU(
+ n_layers=1,
+ in_size=in_size,
+ out_size=in_size // 2,
+ dropout=0.0,
+ )
+ self.last = Convolution1D(in_size // 2 * 2, out_time_length, 1)
+
+ def __call__(self, x):
+ """
+ :param x: (batch, channel, timeA)
+ """
+ h = x
+ h = chainer.functions.separate(chainer.functions.transpose(h, axes=(0, 2, 1))) # h: batch * (timeA, channel)
+ _, h = self.gru(None, h) # h: batch * (timeA, ?)
+ h = chainer.functions.transpose(chainer.functions.stack(h), axes=(0, 2, 1)) # h: (batch, ?, timeA)
+ h = chainer.functions.softmax(self.last(h), axis=2) # h: (batch, timeB, timeA)
+
+ h = chainer.functions.matmul(x, h, transb=True) # h: (batch, channel, timeB)
+ return h
+
+
+def create_predictor(config: ModelConfig):
network = CBHG(
in_channels=config.in_channels,
conv_bank_out_channels=config.conv_bank_out_channels,
@@ -173,9 +202,18 @@ def create(config: ModelConfig):
conv_projections_hidden_channels=config.conv_projections_hidden_channels,
highway_layers=config.highway_layers,
out_channels=config.out_channels,
+ disable_last_rnn=config.disable_last_rnn,
)
- model = Model(
+ predictor = Predictor(
network=network,
out_size=config.out_size,
)
- return model
+ return predictor
+
+
+def create_aligner(config: ModelConfig):
+ aligner = Aligner(
+ in_size=config.in_channels,
+ out_time_length=config.aligner_out_time_length,
+ )
+ return aligner
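
The Aligner is effectively a soft attention over input frames: self.last maps the BiGRU features to out_time_length logits per input frame, the softmax over axis 2 (timeA) turns each output frame's row into a distribution over input frames, and the final matmul takes the corresponding weighted average of the input. A numpy-only sketch of that last step, with toy shapes:

import numpy

batch, channel, time_a, time_b = 1, 3, 5, 4
x = numpy.random.randn(batch, channel, time_a)        # input features
logits = numpy.random.randn(batch, time_b, time_a)    # stand-in for self.last(...)

w = numpy.exp(logits) / numpy.exp(logits).sum(axis=2, keepdims=True)  # softmax over timeA
aligned = x @ w.transpose(0, 2, 1)                    # matmul(x, w, transb=True)
print(aligned.shape)                                  # (1, 3, 4): (batch, channel, timeB)
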
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 935b95c..268de24 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -17,7 +17,7 @@ from become_yukarin.dataset.dataset import AcousticFeatureProcess
from become_yukarin.dataset.dataset import DecodeFeatureProcess
from become_yukarin.dataset.dataset import EncodeFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess
-from become_yukarin.model import create as create_model
+from become_yukarin.model import create_predictor
class VoiceChanger(object):
@@ -25,7 +25,7 @@ class VoiceChanger(object):
self.config = config
self.model_path = model_path
- self.model = model = create_model(config.model)
+ self.model = model = create_predictor(config.model)
chainer.serializers.load_npz(str(model_path), model)
self._param = param = config.dataset.param
diff --git a/train.py b/train.py
index 27fd1fb..08ef2d9 100644
--- a/train.py
+++ b/train.py
@@ -1,18 +1,18 @@
import argparse
+from functools import partial
from pathlib import Path
-from chainer.iterators import MultiprocessIterator
from chainer import optimizers
from chainer import training
-from chainer.training import extensions
from chainer.dataset import convert
+from chainer.iterators import MultiprocessIterator
+from chainer.training import extensions
from become_yukarin.config import create_from_json
from become_yukarin.dataset import create as create_dataset
-from become_yukarin.model import create as create_model
from become_yukarin.loss import Loss
-
-from functools import partial
+from become_yukarin.model import create_aligner
+from become_yukarin.model import create_predictor
parser = argparse.ArgumentParser()
parser.add_argument('config_json_path', type=Path)
@@ -24,8 +24,9 @@ arguments.output.mkdir(exist_ok=True)
config.save_as_json((arguments.output / 'config.json').absolute())
# model
-predictor = create_model(config.model)
-model = Loss(config.loss, predictor=predictor)
+predictor = create_predictor(config.model)
+aligner = create_aligner(config.model)
+model = Loss(config.loss, predictor=predictor, aligner=aligner)
# dataset
dataset = create_dataset(config.dataset)
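
With both links inside one Loss chain, a single optimizer updates the aligner and the predictor jointly; a hedged sketch of the optimizer setup that a script like this would do next:

from chainer import optimizers

optimizer = optimizers.Adam()
optimizer.setup(model)   # model = Loss(...): gradients flow through aligner and predictor
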