1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
from functools import partial
from pathlib import Path
import chainer
import numpy
import pyworld
from become_yukarin.config.sr_config import SRConfig
from become_yukarin.data_struct import AcousticFeature
from become_yukarin.data_struct import Wave
from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureLoadProcess
from become_yukarin.dataset.dataset import LowHighSpectrogramFeatureProcess
from become_yukarin.dataset.dataset import WaveFileLoadProcess
from become_yukarin.model.sr_model import create_predictor_sr
class SuperResolution(object):
    """Spectrogram super-resolution inference wrapper.

    Predicts a high-resolution spectrogram from a low-resolution one using a
    trained chainer predictor, and can render the result to a waveform with
    the WORLD vocoder (pyworld).
    """

    def __init__(self, config: SRConfig, model_path: Path, gpu: int = None):
        """Load the trained predictor and build the preprocessing pipeline.

        :param config: super-resolution config (model + dataset parameters).
        :param model_path: path to the serialized chainer model (npz format).
        :param gpu: chainer GPU device id; ``None`` keeps the model on CPU.
        """
        self.config = config
        self.model_path = model_path
        self.gpu = gpu

        # Build the predictor network and restore its trained weights.
        self.model = model = create_predictor_sr(config.model)
        chainer.serializers.load_npz(str(model_path), model)
        if self.gpu is not None:
            model.to_gpu(self.gpu)

        self._param = param = config.dataset.param

        # Wave loading at the configured sample rate; top_db=None presumably
        # disables silence trimming — confirm against WaveFileLoadProcess.
        self._wave_process = WaveFileLoadProcess(
            sample_rate=param.voice_param.sample_rate,
            top_db=None,
        )
        # Extracts the (low, high) spectrogram pair from a loaded wave.
        self._low_high_spectrogram_process = LowHighSpectrogramFeatureProcess(
            frame_period=param.acoustic_feature_param.frame_period,
            order=param.acoustic_feature_param.order,
            alpha=param.acoustic_feature_param.alpha,
        )
        # Loads precomputed (low, high) spectrogram features from disk.
        self._low_high_spectrogram_load_process = LowHighSpectrogramFeatureLoadProcess(
            validate=True,
        )

    def convert(self, input: numpy.ndarray) -> numpy.ndarray:
        """Run the model on a (frames, bins) linear-scale spectrogram.

        The input is padded along the time axis to a multiple of 128 frames
        (presumably a network size/stride requirement — confirm against the
        model definition), moved to the log domain with the last frequency
        bin dropped, run through the predictor in inference mode, and then
        inverted: the last bin is restored by edge padding, ``exp`` undoes
        the log, and the time padding is stripped.

        :param input: linear-scale spectrogram, shape (frames, bins).
        :return: converted linear-scale spectrogram with the same shape.
        """
        converter = partial(chainer.dataset.convert.concat_examples, device=self.gpu, padding=0)

        # pad is always in [1, 128]: an exact multiple of 128 still gains a
        # full 128 extra frames, which are removed again at the end, so the
        # final `out[:-pad]` slice is always safe (pad is never 0).
        pad = 128 - len(input) % 128
        input = numpy.pad(input, [(0, pad), (0, 0)], mode='minimum')

        # Log magnitude; the last frequency bin is dropped before inference.
        input = numpy.log(input)[:, :-1]
        input = input[numpy.newaxis]  # add leading batch axis

        inputs = converter([input])

        with chainer.using_config('train', False):  # inference mode
            out = self.model(inputs).data[0]

        if self.gpu is not None:
            out = chainer.cuda.to_cpu(out)

        out = out[0]  # drop the leading axis added above

        # Restore the dropped frequency bin, undo the log, strip padding.
        out = numpy.pad(out, [(0, 0), (0, 1)], mode='edge')
        out = numpy.exp(out)
        out = out[:-pad]
        return out

    def convert_to_audio(
            self,
            input: numpy.ndarray,
            acoustic_feature: AcousticFeature,
            sampling_rate: int,
    ) -> Wave:
        """Synthesize a waveform from a spectrogram with the WORLD vocoder.

        :param input: spectrogram used as WORLD's spectral envelope.
        :param acoustic_feature: supplies the f0 contour and aperiodicity.
        :param sampling_rate: output sample rate in Hz.
        :return: the synthesized waveform wrapped in a :class:`Wave`.
        """
        # pyworld.synthesize requires float64 arrays.
        acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
        out = pyworld.synthesize(
            f0=acoustic_feature.f0.ravel(),
            spectrogram=input.astype(numpy.float64),
            aperiodicity=acoustic_feature.aperiodicity,
            fs=sampling_rate,
            frame_period=self._param.acoustic_feature_param.frame_period,
        )
        return Wave(out, sampling_rate=sampling_rate)

    def convert_from_audio_path(self, input: Path) -> numpy.ndarray:
        """Load a wave file, extract its low spectrogram, and convert it.

        :param input: path to an audio file readable by the wave loader.
        :return: the super-resolved spectrogram (see :meth:`convert`).
        """
        input = self._wave_process(str(input), test=True)
        input = self._low_high_spectrogram_process(input, test=True)
        return self.convert(input.low)

    def convert_from_feature_path(self, input: Path) -> numpy.ndarray:
        """Load a precomputed feature file and convert its low spectrogram.

        :param input: path to a saved low/high spectrogram feature file.
        :return: the super-resolved spectrogram (see :meth:`convert`).
        """
        input = self._low_high_spectrogram_load_process(input, test=True)
        return self.convert(input.low)

    def __call__(
            self,
            input: numpy.ndarray,
            acoustic_feature: AcousticFeature,
            sampling_rate: int,
    ) -> Wave:
        """Convert a low-resolution spectrogram straight to audio.

        Equivalent to :meth:`convert` followed by :meth:`convert_to_audio`.

        :param input: low-resolution linear-scale spectrogram.
        :param acoustic_feature: supplies the f0 contour and aperiodicity.
        :param sampling_rate: output sample rate in Hz.
        :return: the synthesized waveform wrapped in a :class:`Wave`.
        """
        high = self.convert(input)
        return self.convert_to_audio(high, acoustic_feature=acoustic_feature, sampling_rate=sampling_rate)
|