# become_yukarin/dataset/utility.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

import math

import fastdtw
import numpy

# Scale factor (10 / ln(10)) * sqrt(2) that ``melcd`` multiplies the
# Euclidean cepstral distance by in order to report the distortion in dB.
_logdb_const = 10.0 / numpy.log(10.0) * numpy.sqrt(2.0)


# should work on torch and numpy arrays
def _sqrt(x):
    """Return the square root of ``x``, dispatching on its type.

    numpy arrays go through ``numpy.sqrt``, scalars (as judged by
    ``numpy.isscalar``) through ``math.sqrt``, and any other object is
    expected to provide its own ``sqrt()`` method (e.g. a torch tensor).
    """
    if isinstance(x, numpy.ndarray):
        return numpy.sqrt(x)
    if numpy.isscalar(x):
        return math.sqrt(x)
    # Fall back to the object's own implementation.
    return x.sqrt()


def _exp(x):
    """Return ``e ** x``, dispatching on the type of ``x``.

    numpy arrays go through ``numpy.exp``, scalars (as judged by
    ``numpy.isscalar``) through ``math.exp``, and any other object is
    expected to provide its own ``exp()`` method (e.g. a torch tensor).
    """
    if isinstance(x, numpy.ndarray):
        return numpy.exp(x)
    if numpy.isscalar(x):
        return math.exp(x)
    # Fall back to the object's own implementation.
    return x.exp()


def _sum(x):
    """Return the sum of all elements of ``x``.

    Args:
        x: A list, tuple or numpy array, or any other object exposing a
            ``sum()`` method (e.g. a torch tensor).

    Returns:
        The ``numpy.sum`` result for lists, tuples and numpy arrays;
        otherwise ``x.sum()`` converted to a Python ``float``.
    """
    # Tuples are plain Python sequences too and have no ``sum()`` method,
    # so they must take the numpy path just like lists.
    if isinstance(x, (list, tuple, numpy.ndarray)):
        return numpy.sum(x)
    return float(x.sum())


def melcd(X, Y, lengths=None):
    """Compute the mel-cepstrum distortion (MCD) between two sequences.

    The inputs are expected to be time-aligned already; no alignment is
    performed here.

    Args:
        X (ndarray): Input mel-cepstrum of shape (``D``,), (``T x D``) or
          (``B x T x D``). Numpy and torch arrays are both supported.
        Y (ndarray): Target mel-cepstrum with the same layout as ``X``.
        lengths (list): Valid (unpadded) length of every sequence in a
          padded mini-batch. Only pass this for mini-batch inputs.

    Returns:
        float: Mean mel-cepstrum distortion in dB.

    .. note::

        Nothing verifies that the inputs really are mel-cepstra.
    """
    # The squared difference is summed over the feature axis and the
    # resulting distances are averaged over time, following Eq. (1a) of
    # https://www.cs.cmu.edu/~awb/papers/sltu2008/kominek_black.sltu_2008.pdf
    if lengths is not None:
        if len(X.shape) == 2:
            # 1-dim features: append an explicit feature axis.
            X, Y = X[:, :, None], Y[:, :, None]

        accumulated = 0.0
        total_frames = _sum(lengths)
        for seq_x, seq_y, n_frames in zip(X, Y, lengths):
            # Ignore the padded tail of each sequence.
            diff = seq_x[:n_frames] - seq_y[:n_frames]
            accumulated += _sqrt((diff * diff).sum(-1)).sum()

        return _logdb_const * accumulated / total_frames

    diff = X - Y
    distance = _sqrt((diff * diff).sum(-1))
    if numpy.isscalar(distance):
        # A single frame yields a scalar; there is nothing to average.
        return _logdb_const * distance
    return _logdb_const * distance.mean()


class DTWAligner(object):
    """Align two sequences in time using FastDTW.

    The warping path is computed once, at construction time, and stored
    normalised by the length of the sequences it was computed on. It can
    then be rescaled to index sequences of another length.

    from https://github.com/r9y9/nnmnkwii/blob/4cade86b5c35b4e35615a2a8162ddc638018af0e/nnmnkwii/preprocessing/alignment.py#L14
    """

    def __init__(self, x, y, dist=lambda x, y: numpy.linalg.norm(x - y), radius=1) -> None:
        """Compute the warping path between ``x`` and ``y``.

        Args:
            x: 2-dimensional array whose first axis is time.
            y: 2-dimensional array whose first axis is time.
            dist: Frame-to-frame distance callable handed to ``fastdtw``.
            radius: Search radius handed to ``fastdtw``.
        """
        assert x.ndim == 2 and y.ndim == 2

        _, path = fastdtw.fastdtw(x, y, radius=radius, dist=dist)
        path = numpy.array(path)
        # Store the path as fractions of the sequence length so it can be
        # rescaled to other lengths later on.
        self.normed_path_x = path[:, 0] / len(x)
        self.normed_path_y = path[:, 1] / len(y)

    def align_x(self, x):
        """Return ``x`` re-indexed along the stored path of the first sequence."""
        path = self._interp_path(self.normed_path_x, len(x))
        return x[path]

    def align_y(self, y):
        """Return ``y`` re-indexed along the stored path of the second sequence."""
        path = self._interp_path(self.normed_path_y, len(y))
        return y[path]

    def align(self, x, y):
        """Return the tuple ``(align_x(x), align_y(y))``."""
        return self.align_x(x), self.align_y(y)

    @staticmethod
    def align_and_transform(x, y, *args, **kwargs):
        """Build a ``DTWAligner`` from ``x`` and ``y`` and align them at once.

        Extra positional and keyword arguments are forwarded to the
        constructor after ``x`` and ``y``.
        """
        # ``x`` and ``y`` go first: passing them as keywords after ``*args``
        # made any extra positional argument collide with ``x``.
        aligner = DTWAligner(x, y, *args, **kwargs)
        return aligner.align(x, y)

    @staticmethod
    def _interp_path(normed_path: numpy.ndarray, target_length: int):
        """Scale a normalised path to integer indices for ``target_length``."""
        # ``numpy.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy, so the builtin is used directly.
        return numpy.floor(normed_path * target_length).astype(int)


class MelCepstrumAligner(DTWAligner):
    """``DTWAligner`` whose path is computed on mel-cepstrum features.

    The warping path is computed with ``melcd`` as the frame distance, on
    features made of the input frames plus their frame-to-frame deltas.
    """

    def __init__(self, x, y, *args, **kwargs) -> None:
        """Compute the warping path between ``x`` and ``y``.

        Args:
            x: 2-dimensional array (time x feature) of the first sequence.
            y: 2-dimensional array (time x feature) of the second sequence.
            *args: Forwarded to ``DTWAligner.__init__``.
            **kwargs: Forwarded to ``DTWAligner.__init__``; any ``dist``
                entry is replaced by ``melcd``.
        """
        x = self._calc_aligner_feature(x)
        y = self._calc_aligner_feature(y)
        kwargs.update(dist=melcd)
        super().__init__(x, y, *args, **kwargs)

    @classmethod
    def _calc_delta(cls, x):
        """Return the forward difference of ``x`` along the first axis.

        Row ``t`` of the result is ``x[t + 1] - x[t]``; the last row is zero.
        The result has the same shape and dtype as ``x``.
        """
        # Write into a separate array: rebinding ``x`` to the zero array
        # before differencing would make every delta zero.
        delta = numpy.zeros_like(x)
        delta[:-1] = x[1:] - x[:-1]
        return delta

    @classmethod
    def _calc_aligner_feature(cls, x):
        """Return ``x`` joined with its deltas, minus the first column."""
        d = cls._calc_delta(x)
        # The leading column of ``x`` is dropped -- presumably the 0th
        # (power) coefficient; confirm against the feature extractor.
        feature = numpy.concatenate((x, d), axis=1)[:, 1:]
        return feature