-rw-r--r-- | become_yukarin/__init__.py          |   2
-rw-r--r-- | become_yukarin/dataset/__init__.py  |   2
-rw-r--r-- | become_yukarin/dataset/dataset.py   | 150
-rw-r--r-- | become_yukarin/dataset/utility.py   |  46
-rw-r--r-- | become_yukarin/model.py             |  24
-rw-r--r-- | become_yukarin/param.py             |  17
-rw-r--r-- | scripts/extract_acoustic_feature.py | 100
7 files changed, 341 insertions(+), 0 deletions(-)
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
new file mode 100644
index 0000000..b54083d
--- /dev/null
+++ b/become_yukarin/__init__.py
@@ -0,0 +1,2 @@
+from . import dataset
+from . import param
diff --git a/become_yukarin/dataset/__init__.py b/become_yukarin/dataset/__init__.py
new file mode 100644
index 0000000..cdd8cf4
--- /dev/null
+++ b/become_yukarin/dataset/__init__.py
@@ -0,0 +1,2 @@
+from . import dataset
+from . import utility
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
new file mode 100644
index 0000000..781dbec
--- /dev/null
+++ b/become_yukarin/dataset/dataset.py
@@ -0,0 +1,150 @@
+import json
+import os
+import typing
+from abc import ABCMeta, abstractmethod
+from typing import NamedTuple
+
+import nnmnkwii.preprocessing
+import chainer
+import librosa
+import numpy
+import pysptk
+import pyworld
+
+
+class Wave(NamedTuple):
+    wave: numpy.ndarray
+    sampling_rate: int
+
+
+class AcousticFeature(NamedTuple):
+    f0: numpy.ndarray
+    spectrogram: numpy.ndarray
+    aperiodicity: numpy.ndarray
+    mfcc: numpy.ndarray
+
+
+class BaseDataProcess(metaclass=ABCMeta):
+    @abstractmethod
+    def __call__(self, data, test):
+        pass
+
+
+class ChainProcess(BaseDataProcess):
+    def __init__(self, process: typing.Iterable[BaseDataProcess]):
+        self._process = process
+
+    def __call__(self, data, test):
+        for p in self._process:
+            data = p(data, test)
+        return data
+
+
+class SplitProcess(BaseDataProcess):
+    def __init__(self, process: typing.Dict[str, typing.Optional[BaseDataProcess]]):
+        self._process = process
+
+    def __call__(self, data, test):
+        data = {
+            k: p(data, test) if p is not None else data
+            for k, p in self._process.items()
+        }
+        return data
+
+
+class DataProcessDataset(chainer.dataset.DatasetMixin):
+    def __init__(self, data: typing.List, data_process: BaseDataProcess, test):
+        self._data = data
+        self._data_process = data_process
+        self._test = test
+
+    def __len__(self):
+        return len(self._data)
+
+    def get_example(self, i):
+        return self._data_process(data=self._data[i], test=self._test)
+
+
+class WaveFileLoadProcess(BaseDataProcess):
+    def __init__(self, sample_rate: int, top_db: float):
+        self._sample_rate = sample_rate
+        self._top_db = top_db
+
+    def __call__(self, data: str, test):
+        wave = librosa.core.load(data, sr=self._sample_rate)[0]
+        wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db))
+        return Wave(wave, self._sample_rate)
+
+
+class AcousticFeatureProcess(BaseDataProcess):
+    def __init__(self, frame_period, order, alpha):
+        self._frame_period = frame_period
+        self._order = order
+        self._alpha = alpha
+
+    def __call__(self, data: Wave, test):
+        x = data.wave.astype(numpy.float64)
+        fs = data.sampling_rate
+
+        _f0, t = pyworld.dio(x, fs, frame_period=self._frame_period)
+        f0 = pyworld.stonemask(x, _f0, t, fs)
+        spectrogram = pyworld.cheaptrick(x, f0, t, fs)
+        aperiodicity = pyworld.d4c(x, f0, t, fs)
+        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
+        return AcousticFeature(
+            f0=f0,
+            spectrogram=spectrogram,
+            aperiodicity=aperiodicity,
+            mfcc=mfcc,
+        )
+
+
+# data_process = ChainProcess([
+#     SplitProcess(dict(
+#         input=ChainProcess([
+#             WaveFileLoadProcess(),
+#             AcousticFeatureProcess(),
+#         ]),
+#         target=ChainProcess([
+#             WaveFileLoadProcess(),
+#             AcousticFeatureProcess(),
+#         ]),
+#     )),
+#
+#     PILImageProcess(mode='RGB'),
+#     RandomFlipImageProcess(p_flip_horizontal=0.5, p_flip_vertical=0),
+#     RandomResizeImageProcess(min_short=128, max_short=160),
+#     RandomCropImageProcess(crop_width=128, crop_height=128),
+#     RgbImageArrayProcess(),
+#     SplitProcess({
+#         'target': None,
+#         'raw_line': RawLineImageArrayProcess(),
+#     })
+# ])
+#
+#
+# def choose(config: DatasetConfig):
+#     if config.images_glob is not None:
+#         import glob
+#         paths = glob.glob(config.images_glob)
+#         paths = data_filter(
+#             datas=paths,
+#             keys=list(map(lambda p: os.path.basename(p), paths)),
+#             filter_func=filter_image,
+#             num_process=None,
+#             cache_path=config.cache_path,
+#         )
+#         paths = list(paths)
+#     else:
+#         paths = json.load(open(config.images_list))
+#
+#     num_test = config.num_test
+#     train_paths = paths[num_test:]
+#     test_paths = paths[:num_test]
+#     train_for_evaluate_paths = train_paths[:num_test]
+#
+#     return {
+#         'train': DataProcessDataset(train_paths, data_process, test=False),
+#         'test': DataProcessDataset(test_paths, data_process, test=True),
+#         'train_eval': DataProcessDataset(train_for_evaluate_paths, data_process, test=True),
+#     }
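The classes above compose into a preprocessing pipeline: ChainProcess runs steps in sequence, SplitProcess fans one input out to named branches, and DataProcessDataset exposes the result to Chainer iterators. A minimal sketch of the intended wiring, using the default parameter values that param.py (below) defines; the wave file names are placeholders:

    from become_yukarin.dataset.dataset import (
        AcousticFeatureProcess,
        ChainProcess,
        DataProcessDataset,
        WaveFileLoadProcess,
    )

    # wave file path -> trimmed waveform -> WORLD features + MFCC
    load_and_extract = ChainProcess([
        WaveFileLoadProcess(sample_rate=24000, top_db=20),
        AcousticFeatureProcess(frame_period=5, order=25, alpha=0.466),
    ])

    # wraps a list of file paths; each example is processed lazily on access
    dataset = DataProcessDataset(['a.wav', 'b.wav'], load_and_extract, test=False)
    feature = dataset.get_example(0)  # AcousticFeature(f0, spectrogram, aperiodicity, mfcc)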
diff --git a/become_yukarin/dataset/utility.py b/become_yukarin/dataset/utility.py
new file mode 100644
index 0000000..b2f5480
--- /dev/null
+++ b/become_yukarin/dataset/utility.py
@@ -0,0 +1,46 @@
+import fastdtw
+import nnmnkwii.metrics
+import numpy
+import scipy.interpolate
+
+
+class DTWAligner(object):
+    """
+    from https://github.com/r9y9/nnmnkwii/blob/4cade86b5c35b4e35615a2a8162ddc638018af0e/nnmnkwii/preprocessing/alignment.py#L14
+    """
+
+    def __init__(self, x, y, dist=lambda x, y: numpy.linalg.norm(x - y), radius=1):
+        assert x.ndim == 2 and y.ndim == 2
+
+        _, path = fastdtw.fastdtw(x, y, radius=radius, dist=dist)
+        self.normed_path_x = numpy.array(list(map(lambda l: l[0], path))) / len(x)
+        self.normed_path_y = numpy.array(list(map(lambda l: l[1], path))) / len(y)
+
+    def align_x(self, x):
+        path = self._interp_path(self.normed_path_x, len(x))
+        return x[path]
+
+    def align_y(self, y):
+        path = self._interp_path(self.normed_path_y, len(y))
+        return y[path]
+
+    def align(self, x, y):
+        return self.align_x(x), self.align_y(y)
+
+    @staticmethod
+    def align_and_transform(x, y, *args, **kwargs):
+        aligner = DTWAligner(*args, x=x, y=y, **kwargs)
+        return aligner.align(x, y)
+
+    @staticmethod
+    def _interp_path(normed_path: numpy.ndarray, target_length: int):
+        base = numpy.linspace(0, 1, len(normed_path))
+        target = numpy.linspace(0, 1, target_length)
+        path = scipy.interpolate.interp1d(base, normed_path)(target)
+        path = numpy.floor(path * target_length).astype(int)
+        return path
+
+
+class MFCCAligner(DTWAligner):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, dist=nnmnkwii.metrics.melcd, **kwargs)
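DTWAligner computes one fastdtw path between two 2-D feature sequences and then re-indexes each sequence along that path. Note that align_x and align_y each preserve their own input's length, which is why the extraction script below pads both waves to equal duration before aligning. A small sketch with synthetic data; the shapes are placeholders (26 matches the order=25 MFCC default, which yields order + 1 coefficients):

    import numpy
    from become_yukarin.dataset.utility import DTWAligner

    x = numpy.random.rand(100, 26)  # 100 frames of 26-dim features
    y = numpy.random.rand(100, 26)  # same utterance, different timing

    # one-shot helper: builds the aligner, then warps both inputs
    aligned_x, aligned_y = DTWAligner.align_and_transform(x, y)
    assert aligned_x.shape == x.shape and aligned_y.shape == y.shape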
diff --git a/become_yukarin/model.py b/become_yukarin/model.py
new file mode 100644
index 0000000..087afcd
--- /dev/null
+++ b/become_yukarin/model.py
@@ -0,0 +1,24 @@
+import chainer
+
+
+class DeepConvolution(chainer.link.Chain):
+    def __init__(self, num_scale: int, base_num_z: int, **kwargs):
+        super().__init__(**kwargs)
+        self.num_scale = num_scale
+
+        for i in range(num_scale):
+            l = base_num_z * 2 ** i
+            self.add_link('conv{}'.format(i + 1),
+                          chainer.links.Convolution2D(None, l, 4, 2, 1, nobias=True))
+            self.add_link('bn{}'.format(i + 1), chainer.links.BatchNormalization(l))
+
+    def get_scaled_width(self, base_width):
+        return base_width // (2 ** self.num_scale)
+
+    def __call__(self, x):
+        h = x
+        for i in range(self.num_scale):
+            conv = getattr(self, 'conv{}'.format(i + 1))
+            bn = getattr(self, 'bn{}'.format(i + 1))
+            h = chainer.functions.relu(bn(conv(h)))
+        return h
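DeepConvolution stacks num_scale blocks of Convolution2D(ksize=4, stride=2, pad=1), BatchNormalization, and ReLU, so each scale halves the spatial extent and doubles the channel count from base_num_z. A quick shape check; the batch size and input resolution are placeholders:

    import numpy
    import chainer
    from become_yukarin.model import DeepConvolution

    net = DeepConvolution(num_scale=3, base_num_z=16)
    x = chainer.Variable(numpy.zeros((1, 1, 64, 64), dtype=numpy.float32))
    h = net(x)  # width 64 -> 32 -> 16 -> 8; channels 16 -> 32 -> 64
    assert h.shape[2] == net.get_scaled_width(64)  # 64 // 2**3 == 8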
diff --git a/become_yukarin/param.py b/become_yukarin/param.py
new file mode 100644
index 0000000..a1b8843
--- /dev/null
+++ b/become_yukarin/param.py
@@ -0,0 +1,17 @@
+from typing import NamedTuple
+
+
+class VoiceParam(NamedTuple):
+    sample_rate: int = 24000
+    top_db: float = 20
+
+
+class AcousticFeatureParam(NamedTuple):
+    frame_period: int = 5
+    order: int = 25
+    alpha: float = 0.466
+
+
+class Param(NamedTuple):
+    voice_param: VoiceParam = VoiceParam()
+    acoustic_feature_param: AcousticFeatureParam = AcousticFeatureParam()
diff --git a/scripts/extract_acoustic_feature.py b/scripts/extract_acoustic_feature.py
new file mode 100644
index 0000000..a9d229f
--- /dev/null
+++ b/scripts/extract_acoustic_feature.py
@@ -0,0 +1,100 @@
+"""
+Extract aligned acoustic features from paired voice recordings.
+"""
+
+import argparse
+import multiprocessing
+from pathlib import Path
+
+import numpy
+
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.dataset.dataset import Wave
+from become_yukarin.dataset.dataset import WaveFileLoadProcess
+from become_yukarin.dataset.utility import MFCCAligner
+from become_yukarin.param import AcousticFeatureParam
+from become_yukarin.param import VoiceParam
+
+base_voice_param = VoiceParam()
+base_acoustic_feature_param = AcousticFeatureParam()
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input1_directory', '-i1', type=Path)
+parser.add_argument('--input2_directory', '-i2', type=Path)
+parser.add_argument('--output1_directory', '-o1', type=Path)
+parser.add_argument('--output2_directory', '-o2', type=Path)
+parser.add_argument('--sample_rate', type=int, default=base_voice_param.sample_rate)
+parser.add_argument('--top_db', type=float, default=base_voice_param.top_db)
+parser.add_argument('--frame_period', type=int, default=base_acoustic_feature_param.frame_period)
+parser.add_argument('--order', type=int, default=base_acoustic_feature_param.order)
+parser.add_argument('--alpha', type=float, default=base_acoustic_feature_param.alpha)
+arguments = parser.parse_args()
+
+
+def make_feature(
+        path,
+        sample_rate,
+        top_db,
+        frame_period,
+        order,
+        alpha,
+):
+    wave = WaveFileLoadProcess(sample_rate=sample_rate, top_db=top_db)(path, test=True)
+    feature = AcousticFeatureProcess(frame_period=frame_period, order=order, alpha=alpha)(wave, test=True)
+    return feature
+
+
+def process(path1, path2):
+    # load both waves and pad them to a common length
+    wave_file_load_process = WaveFileLoadProcess(
+        sample_rate=arguments.sample_rate,
+        top_db=arguments.top_db,
+    )
+    wave1 = wave_file_load_process(path1, test=True)
+    wave2 = wave_file_load_process(path2, test=True)
+
+    m = max(len(wave1.wave), len(wave2.wave))
+    wave1 = Wave(wave=numpy.pad(wave1.wave, (0, m - len(wave1.wave)), mode='mean'), sampling_rate=wave1.sampling_rate)
+    wave2 = Wave(wave=numpy.pad(wave2.wave, (0, m - len(wave2.wave)), mode='mean'), sampling_rate=wave2.sampling_rate)
+
+    # make acoustic features
+    acoustic_feature_process = AcousticFeatureProcess(
+        frame_period=arguments.frame_period,
+        order=arguments.order,
+        alpha=arguments.alpha,
+    )
+    f1 = acoustic_feature_process(wave1, test=True)
+    f2 = acoustic_feature_process(wave2, test=True)
+
+    # alignment
+    aligner = MFCCAligner(f1.mfcc, f2.mfcc)
+
+    f0_1, f0_2 = aligner.align(f1.f0, f2.f0)
+    spectrogram_1, spectrogram_2 = aligner.align(f1.spectrogram, f2.spectrogram)
+    aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity)
+    mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc)
+
+    # save
+    path = Path(arguments.output1_directory, path1.stem + '.npy')
+    numpy.save(path.absolute(), dict(f0=f0_1, spectrogram=spectrogram_1, aperiodicity=aperiodicity_1, mfcc=mfcc_1))
+    print('saved!', path)
+
+    path = Path(arguments.output2_directory, path2.stem + '.npy')
+    numpy.save(path.absolute(), dict(f0=f0_2, spectrogram=spectrogram_2, aperiodicity=aperiodicity_2, mfcc=mfcc_2))
+    print('saved!', path)
+
+
+def main():
+    paths1 = list(sorted(arguments.input1_directory.glob('*')))
+    paths2 = list(sorted(arguments.input2_directory.glob('*')))
+    assert len(paths1) == len(paths2)
+
+    arguments.output1_directory.mkdir(exist_ok=True)
+    arguments.output2_directory.mkdir(exist_ok=True)
+
+    pool = multiprocessing.Pool()
+    pool.starmap(process, zip(paths1, paths2))
+
+
+if __name__ == '__main__':
+    main()
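The script pairs files from the two input directories by sorted order, so both directories must contain the same number of recordings of the same utterances. An example invocation relying on the defaults from param.py; the directory names are placeholders:

    python scripts/extract_acoustic_feature.py \
        -i1 voices/source -i2 voices/target \
        -o1 features/source -o2 features/target

Each output .npy file stores a dict holding the aligned f0, spectrogram, aperiodicity, and mfcc arrays for one utterance.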
