summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- become_yukarin/__init__.py 2
-rw-r--r-- become_yukarin/config.py 10
-rw-r--r-- become_yukarin/data_struct.py 8
-rw-r--r-- become_yukarin/dataset/dataset.py 51
-rw-r--r-- become_yukarin/voice_changer.py 97
-rw-r--r-- scripts/extract_acoustic_feature.py 10
6 files changed, 153 insertions, 25 deletions
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
index b54083d..81bea87 100644
--- a/become_yukarin/__init__.py
+++ b/become_yukarin/__init__.py
@@ -1,2 +1,4 @@
+from . import config
from . import dataset
from . import param
+from .voice_changer import VoiceChanger
diff --git a/become_yukarin/config.py b/become_yukarin/config.py
index d00f179..ecfcfab 100644
--- a/become_yukarin/config.py
+++ b/become_yukarin/config.py
@@ -72,10 +72,10 @@ def create_from_json(s: Union[str, Path]):
param=Param(),
input_glob=d['dataset']['input_glob'],
target_glob=d['dataset']['target_glob'],
- input_mean_path=Path(d['dataset']['input_mean']),
- input_var_path=Path(d['dataset']['input_var']),
- target_mean_path=Path(d['dataset']['target_mean']),
- target_var_path=Path(d['dataset']['target_var']),
+ input_mean_path=Path(d['dataset']['input_mean_path']).expanduser(),
+ input_var_path=Path(d['dataset']['input_var_path']).expanduser(),
+ target_mean_path=Path(d['dataset']['target_mean_path']).expanduser(),
+ target_var_path=Path(d['dataset']['target_var_path']).expanduser(),
seed=d['dataset']['seed'],
num_test=d['dataset']['num_test'],
),
@@ -93,6 +93,6 @@ def create_from_json(s: Union[str, Path]):
gpu=d['train']['gpu'],
log_iteration=d['train']['log_iteration'],
snapshot_iteration=d['train']['snapshot_iteration'],
- output=Path(d['train']['output']),
+ output=Path(d['train']['output']).expanduser(),
),
)
diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py
index c215ecc..f0601a5 100644
--- a/become_yukarin/data_struct.py
+++ b/become_yukarin/data_struct.py
@@ -13,3 +13,11 @@ class AcousticFeature(NamedTuple):
spectrogram: numpy.ndarray
aperiodicity: numpy.ndarray
mfcc: numpy.ndarray
+
+ def astype(self, dtype):
+ return AcousticFeature(
+ f0=self.f0.astype(dtype),
+ spectrogram=self.spectrogram.astype(dtype),
+ aperiodicity=self.aperiodicity.astype(dtype),
+ mfcc=self.mfcc.astype(dtype),
+ )
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 39331c3..7a6ce08 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -61,21 +61,23 @@ class SplitProcess(BaseDataProcess):
class WaveFileLoadProcess(BaseDataProcess):
- def __init__(self, sample_rate: int, top_db: float):
+ def __init__(self, sample_rate: int, top_db: float, dtype=numpy.float32):
self._sample_rate = sample_rate
self._top_db = top_db
+ self._dtype = dtype
def __call__(self, data: str, test):
- wave = librosa.core.load(data, sr=self._sample_rate)[0]
+ wave = librosa.core.load(data, sr=self._sample_rate, dtype=self._dtype)[0]
wave = librosa.effects.remix(wave, intervals=librosa.effects.split(wave, top_db=self._top_db))
return Wave(wave, self._sample_rate)
class AcousticFeatureProcess(BaseDataProcess):
- def __init__(self, frame_period, order, alpha):
+ def __init__(self, frame_period, order, alpha, dtype=numpy.float32):
self._frame_period = frame_period
self._order = order
self._alpha = alpha
+ self._dtype = dtype
def __call__(self, data: Wave, test):
x = data.wave.astype(numpy.float64)
@@ -87,10 +89,10 @@ class AcousticFeatureProcess(BaseDataProcess):
aperiodicity = pyworld.d4c(x, f0, t, fs)
mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
return AcousticFeature(
- f0=f0,
- spectrogram=spectrogram,
- aperiodicity=aperiodicity,
- mfcc=mfcc,
+ f0=f0.astype(self._dtype),
+ spectrogram=spectrogram.astype(self._dtype),
+ aperiodicity=aperiodicity.astype(self._dtype),
+ mfcc=mfcc.astype(self._dtype),
)
@@ -122,7 +124,21 @@ class AcousticFeatureNormalizeProcess(BaseDataProcess):
)
-class ReshapeFeatureProcess(BaseDataProcess):
+class AcousticFeatureDenormalizeProcess(BaseDataProcess):
+ def __init__(self, mean: AcousticFeature, var: AcousticFeature):
+ self._mean = mean
+ self._var = var
+
+ def __call__(self, data: AcousticFeature, test):
+ return AcousticFeature(
+ f0=data.f0 * numpy.sqrt(self._var.f0) + self._mean.f0,
+ spectrogram=data.spectrogram * numpy.sqrt(self._var.spectrogram) + self._mean.spectrogram,
+ aperiodicity=data.aperiodicity * numpy.sqrt(self._var.aperiodicity) + self._mean.aperiodicity,
+ mfcc=data.mfcc * numpy.sqrt(self._var.mfcc) + self._mean.mfcc,
+ )
+
+
+class EncodeFeatureProcess(BaseDataProcess):
def __init__(self, targets: List[str]):
self._targets = targets
@@ -132,6 +148,21 @@ class ReshapeFeatureProcess(BaseDataProcess):
return feature
+class DecodeFeatureProcess(BaseDataProcess):
+ def __init__(self, targets: List[str]):
+ self._targets = targets
+
+ def __call__(self, data: numpy.ndarray, test):
+ # TODO: implement for other features
+ data = data.T
+ return AcousticFeature(
+ f0=numpy.nan,
+ spectrogram=numpy.nan,
+ aperiodicity=numpy.nan,
+ mfcc=data,
+ )
+
+
class ShapeAlignProcess(BaseDataProcess):
def __call__(self, data, test):
data1, data2 = data['input'], data['target']
@@ -173,13 +204,13 @@ def create(config: DatasetConfig):
LambdaProcess(lambda d, test: d['input_path']),
acoustic_feature_load_process,
AcousticFeatureNormalizeProcess(mean=input_mean, var=input_var),
- ReshapeFeatureProcess(['mfcc']),
+ EncodeFeatureProcess(['mfcc']),
]),
target=ChainProcess([
LambdaProcess(lambda d, test: d['target_path']),
acoustic_feature_load_process,
AcousticFeatureNormalizeProcess(mean=target_mean, var=target_var),
- ReshapeFeatureProcess(['mfcc']),
+ EncodeFeatureProcess(['mfcc']),
]),
)),
ShapeAlignProcess(),
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
new file mode 100644
index 0000000..2ff1b8e
--- /dev/null
+++ b/become_yukarin/voice_changer.py
@@ -0,0 +1,97 @@
+from functools import partial
+from pathlib import Path
+from typing import Optional
+
+import chainer
+import numpy
+import pysptk
+import pyworld
+
+from become_yukarin.config import Config
+from become_yukarin.data_struct import AcousticFeature
+from become_yukarin.data_struct import Wave
+from become_yukarin.dataset.dataset import AcousticFeatureDenormalizeProcess
+from become_yukarin.dataset.dataset import AcousticFeatureLoadProcess
+from become_yukarin.dataset.dataset import AcousticFeatureNormalizeProcess
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.dataset.dataset import EncodeFeatureProcess
+from become_yukarin.dataset.dataset import DecodeFeatureProcess
+from become_yukarin.dataset.dataset import WaveFileLoadProcess
+from become_yukarin.model import create as create_model
+
+
+class VoiceChanger(object):
+ def __init__(self, config: Config, model_path: Path):
+ self.config = config
+ self.model_path = model_path
+
+ self.model = model = create_model(config.model)
+ chainer.serializers.load_npz(str(model_path), model)
+
+ self._param = param = config.dataset.param
+ self._wave_process = WaveFileLoadProcess(
+ sample_rate=param.voice_param.sample_rate,
+ top_db=param.voice_param.top_db,
+ )
+ self._feature_process = AcousticFeatureProcess(
+ frame_period=param.acoustic_feature_param.frame_period,
+ order=param.acoustic_feature_param.order,
+ alpha=param.acoustic_feature_param.alpha,
+ )
+
+ _acoustic_feature_load_process = AcousticFeatureLoadProcess()
+
+ input_mean = _acoustic_feature_load_process(config.dataset.input_mean_path, test=True)
+ input_var = _acoustic_feature_load_process(config.dataset.input_var_path, test=True)
+ target_mean = _acoustic_feature_load_process(config.dataset.target_mean_path, test=True)
+ target_var = _acoustic_feature_load_process(config.dataset.target_var_path, test=True)
+ self._feature_normalize = AcousticFeatureNormalizeProcess(
+ mean=input_mean,
+ var=input_var,
+ )
+ self._feature_denormalize = AcousticFeatureDenormalizeProcess(
+ mean=target_mean,
+ var=target_var,
+ )
+
+ self._encode_feature = EncodeFeatureProcess(['mfcc'])
+ self._decode_feature = DecodeFeatureProcess(['mfcc'])
+
+ def __call__(self, voice_path: Path, out_sampling_rate: Optional[int] = None):
+ input = input_wave = self._wave_process(str(voice_path), test=True)
+ if out_sampling_rate is None:
+ out_sampling_rate = input_wave.sampling_rate
+
+ input = input_feature = self._feature_process(input, test=True)
+ input = self._feature_normalize(input, test=True)
+ input = self._encode_feature(input, test=True)
+
+ converter = partial(chainer.dataset.convert.concat_examples, padding=0)
+ inputs = converter([input])
+
+ out = self.model(inputs).data[0]
+ out = self._decode_feature(out, test=True)
+ out = self._feature_denormalize(out, test=True)
+
+ fftlen = pyworld.get_cheaptrick_fft_size(input_wave.sampling_rate)
+ spectrogram = pysptk.mc2sp(
+ out.mfcc,
+ alpha=self._param.acoustic_feature_param.alpha,
+ fftlen=fftlen,
+ )
+
+ out = AcousticFeature(
+ f0=input_feature.f0,
+ spectrogram=spectrogram,
+ aperiodicity=input_feature.aperiodicity,
+ mfcc=out.mfcc,
+ ).astype(numpy.float64)
+ out = pyworld.synthesize(
+ f0=out.f0,
+ spectrogram=out.spectrogram,
+ aperiodicity=out.aperiodicity,
+ fs=out_sampling_rate,
+ frame_period=self._param.acoustic_feature_param.frame_period,
+ )
+
+ return Wave(out, sampling_rate=out_sampling_rate)
diff --git a/scripts/extract_acoustic_feature.py b/scripts/extract_acoustic_feature.py
index b280db6..ccc8d66 100644
--- a/scripts/extract_acoustic_feature.py
+++ b/scripts/extract_acoustic_feature.py
@@ -73,16 +73,6 @@ def generate_feature(path1, path2):
aperiodicity_1, aperiodicity_2 = aligner.align(f1.aperiodicity, f2.aperiodicity)
mfcc_1, mfcc_2 = aligner.align(f1.mfcc, f2.mfcc)
- # convert type
- f0_1 = f0_1.astype(numpy.float32)
- f0_2 = f0_2.astype(numpy.float32)
- spectrogram_1 = spectrogram_1.astype(numpy.float32)
- spectrogram_2 = spectrogram_2.astype(numpy.float32)
- aperiodicity_1 = aperiodicity_1.astype(numpy.float32)
- aperiodicity_2 = aperiodicity_2.astype(numpy.float32)
- mfcc_1 = mfcc_1.astype(numpy.float32)
- mfcc_2 = mfcc_2.astype(numpy.float32)
-
# save
path = Path(arguments.output1_directory, path1.stem + '.npy')
numpy.save(path.absolute(), dict(f0=f0_1, spectrogram=spectrogram_1, aperiodicity=aperiodicity_1, mfcc=mfcc_1))