author    Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2017-11-06 02:37:05 +0900
committer Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2017-11-06 02:37:05 +0900
commit    be9104a1019104751ff9352a896df0f55946fd05 (patch)
tree      6a028d061ee30dd6dbf1cb4333bb97c8731f13f3
add extract acoustic feature script
-rw-r--r--   become_yukarin/__init__.py             2
-rw-r--r--   become_yukarin/dataset/__init__.py     2
-rw-r--r--   become_yukarin/dataset/dataset.py      150
-rw-r--r--   become_yukarin/dataset/utility.py      46
-rw-r--r--   become_yukarin/model.py                24
-rw-r--r--   become_yukarin/param.py                17
-rw-r--r--   scripts/extract_acoustic_feature.py    100
7 files changed, 341 insertions, 0 deletions
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
new file mode 100644
index 0000000..b54083d
--- /dev/null
+++ b/become_yukarin/__init__.py
@@ -0,0 +1,2 @@
+from . import dataset
+from . import param
diff --git a/become_yukarin/dataset/__init__.py b/become_yukarin/dataset/__init__.py
new file mode 100644
index 0000000..cdd8cf4
--- /dev/null
+++ b/become_yukarin/dataset/__init__.py
@@ -0,0 +1,2 @@
+from . import dataset
+from . import utility
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
new file mode 100644
index 0000000..781dbec
--- /dev/null
+++ b/become_yukarin/dataset/dataset.py
@@ -0,0 +1,150 @@
+import json
+import os
+import typing
+from abc import ABCMeta, abstractmethod
+from typing import NamedTuple
+
+import chainer
+import librosa
+import nnmnkwii.preprocessing
+import numpy
+import pysptk
+import pyworld
+
+
+class Wave(NamedTuple):
+    wave: numpy.ndarray
+    sampling_rate: int
+
+
+class AcousticFeature(NamedTuple):
+    f0: numpy.ndarray
+    spectrogram: numpy.ndarray
+    aperiodicity: numpy.ndarray
+    mfcc: numpy.ndarray
+
+
+class BaseDataProcess(metaclass=ABCMeta):
+    @abstractmethod
+    def __call__(self, data, test):
+        pass
+
+
+class ChainProcess(BaseDataProcess):
+    def __init__(self, process: typing.Iterable[BaseDataProcess]):
+        self._process = process
+
+    def __call__(self, data, test):
+        for p in self._process:
+            data = p(data, test)
+        return data
+
+
+class SplitProcess(BaseDataProcess):
+    def __init__(self, process: typing.Dict[str, typing.Optional[BaseDataProcess]]):
+        self._process = process
+
+    def __call__(self, data, test):
+        data = {
+            k: p(data, test) if p is not None else data
+            for k, p in self._process.items()
+        }
+        return data
+
+
+class DataProcessDataset(chainer.dataset.DatasetMixin):
+    def __init__(self, data: typing.List, data_process: BaseDataProcess, test):
+        self._data = data
+        self._data_process = data_process
+        self._test = test
+
+    def __len__(self):
+        return len(self._data)
+
+    def get_example(self, i):
+        return self._data_process(data=self._data[i], test=self._test)
+
+
+class WaveFileLoadProcess(BaseDataProcess):
+    def __init__(self, sample_rate: int, top_db: float):
+        self._sample_rate = sample_rate
+        self._top_db = top_db
+
+    def __call__(self, data: str, test):
+        wave = librosa.core.load(data, sr=self._sample_rate)[0]
+        wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db))
+        return Wave(wave, self._sample_rate)
+
+
+class AcousticFeatureProcess(BaseDataProcess):
+    def __init__(self, frame_period, order, alpha):
+        self._frame_period = frame_period
+        self._order = order
+        self._alpha = alpha
+
+    def __call__(self, data: Wave, test):
+        x = data.wave.astype(numpy.float64)
+        fs = data.sampling_rate
+
+        _f0, t = pyworld.dio(x, fs, frame_period=self._frame_period)
+        f0 = pyworld.stonemask(x, _f0, t, fs)
+        spectrogram = pyworld.cheaptrick(x, f0, t, fs)
+        aperiodicity = pyworld.d4c(x, f0, t, fs)
+        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
+        return AcousticFeature(
+            f0=f0,
+            spectrogram=spectrogram,
+            aperiodicity=aperiodicity,
+            mfcc=mfcc,
+        )
+
+
+# data_process = ChainProcess([
+#     SplitProcess(dict(
+#         input=ChainProcess([
+#             WaveFileLoadProcess(),
+#             AcousticFeatureProcess(),
+#         ]),
+#         target=ChainProcess([
+#             WaveFileLoadProcess(),
+#             AcousticFeatureProcess(),
+#         ]),
+#     )),
+#
+#     PILImageProcess(mode='RGB'),
+#     RandomFlipImageProcess(p_flip_horizontal=0.5, p_flip_vertical=0),
+#     RandomResizeImageProcess(min_short=128, max_short=160),
+#     RandomCropImageProcess(crop_width=128, crop_height=128),
+#     RgbImageArrayProcess(),
+#     SplitProcess({
+#         'target': None,
+#         'raw_line': RawLineImageArrayProcess(),
+#     })
+# ])
+#
+#
+# def choose(config: DatasetConfig):
+#     if config.images_glob is not None:
+#         import glob
+#         paths = glob.glob(config.images_glob)
+#         paths = data_filter(
+#             datas=paths,
+#             keys=list(map(lambda p: os.path.basename(p), paths)),
+#             filter_func=filter_image,
+#             num_process=None,
+#             cache_path=config.cache_path,
+#         )
+#         paths = list(paths)
+#     else:
+#         paths = json.load(open(config.images_list))
+#
+#     num_test = config.num_test
+#     train_paths = paths[num_test:]
+#     test_paths = paths[:num_test]
+#     train_for_evaluate_paths = train_paths[:num_test]
+#
+#     return {
+#         'train': DataProcessDataset(train_paths, data_process, test=False),
+#         'test': DataProcessDataset(test_paths, data_process, test=True),
+#         'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process, test=True),
+#     }
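
As a rough sketch of how these pieces compose (illustrative only: the file
names, parameter values, and the input/target keys below are assumptions,
not part of this commit):

    from become_yukarin.dataset.dataset import (
        AcousticFeatureProcess, ChainProcess, DataProcessDataset,
        SplitProcess, WaveFileLoadProcess,
    )

    # path -> Wave -> AcousticFeature
    feature_process = ChainProcess([
        WaveFileLoadProcess(sample_rate=24000, top_db=20),
        AcousticFeatureProcess(frame_period=5, order=25, alpha=0.466),
    ])

    # SplitProcess fans the same input out to several keys; here both keys
    # would receive the same path, so a real pipeline would route paired
    # input/target paths instead.
    data_process = SplitProcess(dict(
        input=feature_process,
        target=feature_process,
    ))

    dataset = DataProcessDataset(['a.wav', 'b.wav'], data_process, test=False)
    example = dataset.get_example(0)  # {'input': ..., 'target': ...}
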
diff --git a/become_yukarin/dataset/utility.py b/become_yukarin/dataset/utility.py
new file mode 100644
index 0000000..b2f5480
--- /dev/null
+++ b/become_yukarin/dataset/utility.py
@@ -0,0 +1,46 @@
+import fastdtw
+import nnmnkwii.metrics
+import numpy
+import scipy.interpolate
+
+
+class DTWAligner(object):
+    """
+    from https://github.com/r9y9/nnmnkwii/blob/4cade86b5c35b4e35615a2a8162ddc638018af0e/nnmnkwii/preprocessing/alignment.py#L14
+    """
+
+    def __init__(self, x, y, dist=lambda x, y: numpy.linalg.norm(x - y), radius=1):
+        assert x.ndim == 2 and y.ndim == 2
+
+        _, path = fastdtw.fastdtw(x, y, radius=radius, dist=dist)
+        self.normed_path_x = numpy.array(list(map(lambda l: l[0], path))) / len(x)
+        self.normed_path_y = numpy.array(list(map(lambda l: l[1], path))) / len(y)
+
+    def align_x(self, x):
+        path = self._interp_path(self.normed_path_x, len(x))
+        return x[path]
+
+    def align_y(self, y):
+        path = self._interp_path(self.normed_path_y, len(y))
+        return y[path]
+
+    def align(self, x, y):
+        return self.align_x(x), self.align_y(y)
+
+    @staticmethod
+    def align_and_transform(x, y, *args, **kwargs):
+        aligner = DTWAligner(*args, x=x, y=y, **kwargs)
+        return aligner.align(x, y)
+
+    @staticmethod
+    def _interp_path(normed_path: numpy.ndarray, target_length: int):
+        base = numpy.linspace(0, 1, len(normed_path))
+        target = numpy.linspace(0, 1, target_length)
+        path = scipy.interpolate.interp1d(base, normed_path)(target)
+        path = numpy.floor(path * target_length).astype(int)
+        return path
+
+
+class MFCCAligner(DTWAligner):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, dist=nnmnkwii.metrics.melcd, **kwargs)
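
A minimal sketch of the aligner on synthetic data (shapes and values are
illustrative; the extraction script below pads both waves to a common length,
which is why equal-length feature sequences are assumed here):

    import numpy
    from become_yukarin.dataset.utility import DTWAligner

    x = numpy.random.rand(100, 26)  # 100 frames, 26-dim features
    y = numpy.random.rand(100, 26)

    aligner = DTWAligner(x, y)
    x_aligned, y_aligned = aligner.align(x, y)

    # The warping paths are stored normalized, so the same aligner can warp
    # any other per-frame feature of x or y (as the script does with f0,
    # spectrogram, and aperiodicity).
    assert x_aligned.shape == y_aligned.shape
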
diff --git a/become_yukarin/model.py b/become_yukarin/model.py
new file mode 100644
index 0000000..087afcd
--- /dev/null
+++ b/become_yukarin/model.py
@@ -0,0 +1,24 @@
+import chainer
+
+
+class DeepConvolution(chainer.link.Chain):
+    def __init__(self, num_scale: int, base_num_z: int, **kwargs):
+        super().__init__(**kwargs)
+        self.num_scale = num_scale
+
+        for i in range(num_scale):
+            l = base_num_z * 2 ** i
+            self.add_link('conv{}'.format(i + 1),
+                          chainer.links.Convolution2D(None, l, 4, 2, 1, nobias=True))
+            self.add_link('bn{}'.format(i + 1), chainer.links.BatchNormalization(l))
+
+    def get_scaled_width(self, base_width):
+        return base_width // (2 ** self.num_scale)
+
+    def __call__(self, x):
+        h = x
+        for i in range(self.num_scale):
+            conv = getattr(self, 'conv{}'.format(i + 1))
+            bn = getattr(self, 'bn{}'.format(i + 1))
+            h = chainer.functions.relu(bn(conv(h)))
+        return h
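
A quick smoke test of the convolution stack (a sketch; the batch shape and
scale count are illustrative, not from this commit):

    import numpy
    from become_yukarin.model import DeepConvolution

    model = DeepConvolution(num_scale=3, base_num_z=16)

    # One single-channel 64x64 input; each stride-2 convolution halves the
    # spatial size, so 64 -> 32 -> 16 -> 8.
    x = numpy.zeros((1, 1, 64, 64), dtype=numpy.float32)
    h = model(x)

    assert model.get_scaled_width(64) == 8
    print(h.shape)  # (1, 64, 8, 8): channels grow as base_num_z * 2**i
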
diff --git a/become_yukarin/param.py b/become_yukarin/param.py
new file mode 100644
index 0000000..a1b8843
--- /dev/null
+++ b/become_yukarin/param.py
@@ -0,0 +1,17 @@
+from typing import NamedTuple
+
+
+class VoiceParam(NamedTuple):
+    sample_rate: int = 24000
+    top_db: float = 20
+
+
+class AcousticFeatureParam(NamedTuple):
+    frame_period: int = 5
+    order: int = 25
+    alpha: float = 0.466
+
+
+class Param(NamedTuple):
+    voice_param: VoiceParam = VoiceParam()
+    acoustic_feature_param: AcousticFeatureParam = AcousticFeatureParam()
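
Because these are NamedTuples, overriding a single default just means
constructing with keyword arguments (the values below are illustrative):

    from become_yukarin.param import Param, VoiceParam

    param = Param(voice_param=VoiceParam(sample_rate=16000))
    print(param.voice_param.top_db)            # 20, still the default
    print(param.acoustic_feature_param.alpha)  # 0.466
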
diff --git a/scripts/extract_acoustic_feature.py b/scripts/extract_acoustic_feature.py
new file mode 100644
index 0000000..a9d229f
--- /dev/null
+++ b/scripts/extract_acoustic_feature.py
@@ -0,0 +1,100 @@
+"""
+extract alignments voices.
+"""
+
+import argparse
+import multiprocessing
+from pathlib import Path
+
+import numpy
+
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.dataset.dataset import Wave
+from become_yukarin.dataset.dataset import WaveFileLoadProcess
+from become_yukarin.dataset.utility import MFCCAligner
+from become_yukarin.param import AcousticFeatureParam
+from become_yukarin.param import VoiceParam
+
+base_voice_param = VoiceParam()
+base_acoustic_feature_param = AcousticFeatureParam()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input1_directory', '-i1', type=Path)
+parser.add_argument('--input2_directory', '-i2', type=Path)
+parser.add_argument('--output1_directory', '-o1', type=Path)
+parser.add_argument('--output2_directory', '-o2', type=Path)
+parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate)
+parser.add_argument('--top_db', type=float, default=base_voice_param.top_db)
+parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period)
+parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order)
+parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha)
+arguments = parser.parse_args()
+
+
+def make_feature(
+ path,
+ sample_rate,
+ top_db,
+ frame_period,
+ order,
+ alpha,
+):
+ wave = WaveFileLoadProcess(sample_rate=sample_rate, top_db=top_db)(path, test=True)
+ feature = AcousticFeatureProcess(frame_period=frame_period, order=order, alpha=alpha)(wave, test=True)
+ return feature
+
+
+def process(path1, path2):
+ # load wave and padding
+ wave_file_load_process = WaveFileLoadProcess(
+ sample_rate=arguments.sample_rate,
+ top_db=arguments.top_db,
+ )
+ wave1 = wave_file_load_process(path1, test=True)
+ wave2 = wave_file_load_process(path2, test=True)
+
+ m = max(len(wave1.wave), len(wave2.wave))
+ wave1 = Wave(wave=numpy.pad(wave1.wave, (0, m - len(wave1.wave)), mode='mean'), sampling_rate=wave1.sampling_rate)
+ wave2 = Wave(wave=numpy.pad(wave2.wave, (0, m - len(wave2.wave)), mode='mean'), sampling_rate=wave2.sampling_rate)
+
+ # make acoustic feature
+ acoustic_feature_process = AcousticFeatureProcess(
+ frame_period=arguments.frame_period,
+ order=arguments.order,
+ alpha=arguments.alpha,
+ )
+ f1 = acoustic_feature_process(wave1, test=True)
+ f2 = acoustic_feature_process(wave2, test=True)
+
+ # alignment
+ aligner = MFCCAligner(f1.mfcc, f2.mfcc)
+
+ f0_1, f0_2 = aligner.align(f1.f0, f2.f0)
+ spectrogram_1, spectrogram_2 = aligner.align(f1.spectrogram, f2.spectrogram)
+ aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity)
+ mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc)
+
+ # save
+ path = Path(arguments.output1_directory, path1.stem + '.npy')
+ numpy.save(path.absolute(), dict(f0=f0_1, spectrogram=spectrogram_1, aperiodicity=aperiodicity_1, mfcc=mfcc_1))
+ print('saved!', path)
+
+ path = Path(arguments.output2_directory, path2.stem + '.npy')
+ numpy.save(path.absolute(), dict(f0=f0_2, spectrogram=spectrogram_2, aperiodicity=aperiodicity_2, mfcc=mfcc_2))
+ print('saved!', path)
+
+
+def main():
+ paths1 = list(sorted(arguments.input1_directory.glob('*')))
+ paths2 = list(sorted(arguments.input2_directory.glob('*')))
+ assert len(paths1) == len(paths2)
+
+ arguments.output1_directory.mkdir(exist_ok=True)
+ arguments.output2_directory.mkdir(exist_ok=True)
+
+ pool = multiprocessing.Pool()
+ pool.starmap(process, zip(paths1, paths2))
+
+
+if __name__ == '__main__':
+ main()
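
Assuming two directories of paired, identically ordered voice files (the
directory names here are placeholders), a typical invocation might look like
python scripts/extract_acoustic_feature.py -i1 voices_a -o1 features_a -i2 voices_b -o2 features_b.
For each pair, the script pads both waves to a common length, extracts WORLD
features, warps both sides with the MFCC-based DTW path, and saves each side
as a .npy dict keyed by f0, spectrogram, aperiodicity, and mfcc.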