author     Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2018-01-29 07:11:40 +0900
committer  Hiroshiba Kazuyuki <hihokaruta@gmail.com>    2018-01-29 07:11:40 +0900
commit     b432502ccc924bb10bee0cf8fe11afd0a5f4757d (patch)
tree       983c5c2d8bf953a7b4a728afe3cf537bdaef119b
parent     c44e1ec9b24a70cc30de5682bf1855afe5eb0485 (diff)
WIP real time voice conversion
-rw-r--r--  become_yukarin/__init__.py            2
-rw-r--r--  become_yukarin/dataset/dataset.py    20
-rw-r--r--  become_yukarin/super_resolution.py   15
-rw-r--r--  become_yukarin/vocoder.py           105
-rw-r--r--  become_yukarin/voice_changer.py     155
-rw-r--r--  scripts/realtime_voice_changer.py   142
6 files changed, 432 insertions(+), 7 deletions(-)
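
In outline, the new real-time path analyses the input audio with WORLD, converts the features with the existing models, and re-synthesizes with a streaming WORLD synthesizer. Below is a minimal sketch assembled from the classes in this patch, not part of the commit itself; acoustic_converter, super_resolution, acoustic_feature_param, sampling_rate and in_wave are assumed to be set up as in scripts/realtime_voice_changer.py further down.

# --- overview sketch, not part of this commit ---
from become_yukarin import RealtimeVocoder, VoiceChanger

vocoder = RealtimeVocoder(
    acoustic_feature_param=acoustic_feature_param,   # assumption: from the training config
    out_sampling_rate=sampling_rate,                 # assumption
    buffer_size=sampling_rate // 16,
    number_of_pointers=16,
)
changer = VoiceChanger(
    acoustic_converter=acoustic_converter,
    super_resolution=super_resolution,
    vocoder=vocoder,
)
f_in = vocoder.encode(in_wave)                        # WORLD analysis: f0, spectrogram, aperiodicity
f_high = changer.convert_from_acoustic_feature(f_in)  # acoustic conversion + spectrogram super-resolution
out_wave = vocoder.decode(f_high)                     # streaming WORLD synthesis -> Wave
# --- end sketch ---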
diff --git a/become_yukarin/__init__.py b/become_yukarin/__init__.py
index 7513f36..3c376cd 100644
--- a/become_yukarin/__init__.py
+++ b/become_yukarin/__init__.py
@@ -3,4 +3,6 @@ from . import dataset
from . import param
from .acoustic_converter import AcousticConverter
from .super_resolution import SuperResolution
+from .vocoder import RealtimeVocoder
+from .vocoder import Vocoder
from .voice_changer import VoiceChanger
diff --git a/become_yukarin/dataset/dataset.py b/become_yukarin/dataset/dataset.py
index 1a1438a..178844a 100644
--- a/become_yukarin/dataset/dataset.py
+++ b/become_yukarin/dataset/dataset.py
@@ -87,7 +87,8 @@ class WaveFileLoadProcess(BaseDataProcess):
class AcousticFeatureProcess(BaseDataProcess):
- def __init__(self, frame_period, order, alpha, f0_estimating_method, f0_floor=71, f0_ceil=800, dtype=numpy.float32) -> None:
+ def __init__(self, frame_period, order, alpha, f0_estimating_method, f0_floor=71, f0_ceil=800,
+ dtype=numpy.float32) -> None:
self._frame_period = frame_period
self._order = order
self._alpha = alpha
@@ -101,10 +102,21 @@ class AcousticFeatureProcess(BaseDataProcess):
fs = data.sampling_rate
if self._f0_estimating_method == 'dio':
- _f0, t = pyworld.dio(x, fs, frame_period=self._frame_period, f0_floor=self._f0_floor, f0_ceil=self._f0_ceil)
+ _f0, t = pyworld.dio(
+ x,
+ fs,
+ frame_period=self._frame_period,
+ f0_floor=self._f0_floor,
+ f0_ceil=self._f0_ceil,
+ )
else:
- _f0, t = pyworld.harvest(x, fs, frame_period=self._frame_period, f0_floor=self._f0_floor,
- f0_ceil=self._f0_ceil)
+ _f0, t = pyworld.harvest(
+ x,
+ fs,
+ frame_period=self._frame_period,
+ f0_floor=self._f0_floor,
+ f0_ceil=self._f0_ceil,
+ )
f0 = pyworld.stonemask(x, _f0, t, fs)
spectrogram = pyworld.cheaptrick(x, f0, t, fs)
aperiodicity = pyworld.d4c(x, f0, t, fs)
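
For reference, the WORLD analysis chain that this hunk reformats can be exercised standalone with the public pyworld API; a self-contained sketch with arbitrary placeholder audio:

# --- standalone pyworld sketch, not part of this commit ---
import numpy
import pyworld

fs = 24000
x = numpy.random.randn(fs)  # one second of placeholder audio, float64

# F0 estimation: dio (fast) or harvest (slower, more robust), refined by stonemask
_f0, t = pyworld.dio(x, fs, frame_period=5.0, f0_floor=71, f0_ceil=800)
f0 = pyworld.stonemask(x, _f0, t, fs)

spectrogram = pyworld.cheaptrick(x, f0, t, fs)  # spectral envelope
aperiodicity = pyworld.d4c(x, f0, t, fs)        # aperiodicity envelope
y = pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period=5.0)
# --- end sketch ---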
diff --git a/become_yukarin/super_resolution.py b/become_yukarin/super_resolution.py
index 7c53b9d..fc91488 100644
--- a/become_yukarin/super_resolution.py
+++ b/become_yukarin/super_resolution.py
@@ -60,6 +60,21 @@ class SuperResolution(object):
out = out[:-pad]
return out
+ def convert_to_feature(
+ self,
+ spectrogram: numpy.ndarray,
+ acoustic_feature: AcousticFeature,
+ ):
+ acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
+ f_out = AcousticFeature(
+ f0=acoustic_feature.f0,
+ spectrogram=spectrogram.astype(numpy.float64),
+ aperiodicity=acoustic_feature.aperiodicity,
+ mfcc=acoustic_feature.mfcc,
+ voiced=acoustic_feature.voiced,
+ )
+ return f_out
+
def convert_to_audio(
self,
input: numpy.ndarray,
diff --git a/become_yukarin/vocoder.py b/become_yukarin/vocoder.py
new file mode 100644
index 0000000..a49e8f2
--- /dev/null
+++ b/become_yukarin/vocoder.py
@@ -0,0 +1,105 @@
+import numpy
+import pyworld
+from world4py.native import structures, apidefinitions, utils
+
+from become_yukarin.data_struct import AcousticFeature
+from become_yukarin.data_struct import Wave
+from become_yukarin.dataset.dataset import AcousticFeatureProcess
+from become_yukarin.param import AcousticFeatureParam
+
+
+class Vocoder(object):
+ def __init__(
+ self,
+ acoustic_feature_param: AcousticFeatureParam,
+ out_sampling_rate: int,
+ ):
+ self.acoustic_feature_param = acoustic_feature_param
+ self.out_sampling_rate = out_sampling_rate
+ self._encoder = AcousticFeatureProcess(
+ frame_period=acoustic_feature_param.frame_period,
+ order=acoustic_feature_param.order,
+ alpha=acoustic_feature_param.alpha,
+ )
+
+ def encode(self, wave: Wave):
+ return self._encoder(wave)
+
+ def decode(
+ self,
+ acoustic_feature: AcousticFeature,
+ ):
+ acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
+ out = pyworld.synthesize(
+ f0=acoustic_feature.f0.ravel(),
+ spectrogram=acoustic_feature.spectrogram,
+ aperiodicity=acoustic_feature.aperiodicity,
+ fs=self.out_sampling_rate,
+ frame_period=self.acoustic_feature_param.frame_period
+ )
+ return Wave(out, sampling_rate=self.out_sampling_rate)
+
+
+class RealtimeVocoder(Vocoder):
+ def __init__(
+ self,
+ acoustic_feature_param: AcousticFeatureParam,
+ out_sampling_rate: int,
+ buffer_size: int,
+ number_of_pointers: int,
+ ):
+ super().__init__(
+ acoustic_feature_param=acoustic_feature_param,
+ out_sampling_rate=out_sampling_rate,
+ )
+
+ self.buffer_size = buffer_size
+
+ self._synthesizer = structures.WorldSynthesizer()
+ apidefinitions._InitializeSynthesizer(
+ self.out_sampling_rate, # sampling rate
+ self.acoustic_feature_param.frame_period, # frame period
+ pyworld.get_cheaptrick_fft_size(out_sampling_rate), # fft size
+ buffer_size, # buffer size
+ number_of_pointers, # number of pointers
+ self._synthesizer,
+ )
+ self._before_buffer = None  # holds references to native buffers so they are not garbage collected
+
+ def decode(
+ self,
+ acoustic_feature: AcousticFeature,
+ ):
+ length = len(acoustic_feature.f0)
+ f0_buffer = utils.cast_1d_list_to_1d_pointer(acoustic_feature.f0.flatten().tolist())
+ sp_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.spectrogram.tolist())
+ ap_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.aperiodicity.tolist())
+ apidefinitions._AddParameters(f0_buffer, length, sp_buffer, ap_buffer, self._synthesizer)
+
+ ys = []
+ while apidefinitions._Synthesis2(self._synthesizer) != 0:
+ y = numpy.array([self._synthesizer.buffer[i] for i in range(self.buffer_size)])
+ ys.append(y)
+
+ if len(ys) > 0:
+ out_wave = Wave(
+ wave=numpy.concatenate(ys),
+ sampling_rate=self.out_sampling_rate,
+ )
+ else:
+ out_wave = Wave(
+ wave=numpy.empty(0),
+ sampling_rate=self.out_sampling_rate,
+ )
+
+ self._before_buffer = (f0_buffer, sp_buffer, ap_buffer)  # keep references alive so WORLD's pointers stay valid
+ return out_wave
+
+ def warm_up(self, time_length: float):
+ y = numpy.zeros(int(time_length * self.out_sampling_rate))
+ w = Wave(wave=y, sampling_rate=self.out_sampling_rate)
+ f = self.encode(w)
+ self.decode(f)
+
+ def __del__(self):
+ apidefinitions._DestroySynthesizer(self._synthesizer)
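
A usage sketch for the new streaming vocoder, not part of this commit. RealtimeVocoder.decode only returns audio once the internal WORLD synthesizer has buffered enough frames, so warm_up is called first with a short stretch of silence; acoustic_feature_param and wave are assumptions (e.g. taken from a training config and a loaded Wave), and the numeric values below are illustrative.

# --- usage sketch, not part of this commit ---
from become_yukarin import RealtimeVocoder

vocoder = RealtimeVocoder(
    acoustic_feature_param=acoustic_feature_param,  # assumption
    out_sampling_rate=24000,                        # assumption
    buffer_size=24000 // 16,                        # ~62.5 ms of audio per synthesis step
    number_of_pointers=16,
)
vocoder.warm_up(time_length=0.3)   # push silence through analysis + synthesis once

feature = vocoder.encode(wave)     # wave: become_yukarin.data_struct.Wave (assumption)
out = vocoder.decode(feature)      # Wave; may be empty until enough frames are buffered
# --- end sketch ---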
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 7269053..05f5a96 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,7 +1,13 @@
+from typing import List
+from typing import NamedTuple
+
import numpy
from .acoustic_converter import AcousticConverter
+from .data_struct import AcousticFeature
+from .data_struct import Wave
from .super_resolution import SuperResolution
+from .vocoder import Vocoder
class VoiceChanger(object):
@@ -9,6 +15,7 @@ class VoiceChanger(object):
self,
acoustic_converter: AcousticConverter,
super_resolution: SuperResolution,
+ vocoder: Vocoder,
output_sampling_rate: int = None,
) -> None:
if output_sampling_rate is None:
@@ -16,12 +23,154 @@ class VoiceChanger(object):
self.acoustic_converter = acoustic_converter
self.super_resolution = super_resolution
+ self.vocoder = vocoder
self.output_sampling_rate = output_sampling_rate
def convert_from_wave_path(self, wave_path: str):
w_in = self.acoustic_converter._wave_process(wave_path)
- f_in = self.acoustic_converter._feature_process(w_in)
+ return self.convert_from_wave(w_in)
+
+ def convert_from_wave(self, wave: Wave):
+ f_in = self.acoustic_converter._feature_process(wave)
+ f_high = self.convert_from_acoustic_feature(f_in)
+ wave = self.vocoder.decode(f_high)
+ return wave
+
+ def convert_from_acoustic_feature(self, f_in: AcousticFeature):
f_low = self.acoustic_converter.convert_to_feature(f_in)
s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
- wave = self.super_resolution(s_high, acoustic_feature=f_low, sampling_rate=self.output_sampling_rate)
- return wave
+ f_high = self.super_resolution.convert_to_feature(s_high, f_low)
+ return f_high
+
+
+class Segment(NamedTuple):
+ start_time: float
+ wave: Wave
+
+ @property
+ def time_length(self):
+ return len(self.wave.wave) / self.wave.sampling_rate
+
+ @property
+ def end_time(self):
+ return self.time_length + self.start_time
+
+
+class VoiceChangerStream(object):
+ def __init__(
+ self,
+ voice_changer: VoiceChanger,
+ sampling_rate: int,
+ in_dtype=numpy.float32,
+ ):
+ self.voice_changer = voice_changer
+ self.sampling_rate = sampling_rate
+ self.in_dtype = in_dtype
+ self._data_stream = [] # type: List[Segment]
+
+ @property
+ def vocoder(self):
+ return self.voice_changer.vocoder
+
+ def add_wave(self, start_time: float, wave: Wave):
+ # validation
+ assert wave.sampling_rate == self.sampling_rate
+ assert wave.wave.dtype == self.in_dtype
+
+ segment = Segment(start_time=start_time, wave=wave)
+ self._data_stream.append(segment)
+
+ def remove_wave(self, end_time: float):
+ self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
+
+ def convert(self, start_time: float, time_length: float):
+ end_time = start_time + time_length
+ buffer_list = []
+ stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
+
+ start_time_buffer = start_time
+ remaining_time = time_length
+ for segment in stream:
+ # padding
+ if segment.start_time > start_time_buffer:
+ pad = numpy.zeros(
+ shape=int((segment.start_time - start_time_buffer) * self.sampling_rate),
+ dtype=self.in_dtype,
+ )
+ buffer_list.append(pad)
+ start_time_buffer = segment.start_time
+
+ if remaining_time > segment.end_time - start_time_buffer:
+ one_time_length = segment.end_time - start_time_buffer
+ else:
+ one_time_length = remaining_time
+
+ first_index = int((start_time_buffer - segment.start_time) * self.sampling_rate)
+ last_index = int(first_index + one_time_length * self.sampling_rate)
+ one_buffer = segment.wave.wave[first_index:last_index]
+ buffer_list.append(one_buffer)
+
+ start_time_buffer += one_time_length
+ remaining_time -= one_time_length
+
+ if start_time_buffer >= end_time:
+ break
+ else:
+ # last padding
+ pad = numpy.zeros(shape=int((end_time - start_time_buffer) * self.sampling_rate), dtype=self.in_dtype)
+ buffer_list.append(pad)
+
+ buffer = numpy.concatenate(buffer_list)
+ print('buffer', len(buffer), flush=True)
+ in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
+ in_feature = self.vocoder.encode(in_wave)
+ out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
+ return out_feature
+
+ def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
+ """
+ :param extra_time: Length of the extra audio used on either side during conversion. Prevents zero padding at the edges.
+ """
+ frame_period = self.vocoder.acoustic_feature_param.frame_period
+
+ start_time -= extra_time
+ time_length += extra_time * 2
+
+ extra_feature = self.convert(start_time=start_time, time_length=time_length)
+
+ pad = int(extra_time / (frame_period / 1000))
+ feature = AcousticFeature(
+ f0=extra_feature.f0[pad:-pad],
+ spectrogram=extra_feature.spectrogram[pad:-pad],
+ aperiodicity=extra_feature.aperiodicity[pad:-pad],
+ mfcc=extra_feature.mfcc[pad:-pad],
+ voiced=extra_feature.voiced[pad:-pad],
+ )
+
+ out_wave = self.vocoder.decode(
+ acoustic_feature=feature,
+ )
+ return out_wave
+
+
+class VoiceChangerStreamWrapper(object):
+ def __init__(
+ self,
+ voice_changer_stream: VoiceChangerStream,
+ extra_time: float = 0.0
+ ):
+ self.voice_changer_stream = voice_changer_stream
+ self.extra_time = extra_time
+ self._current_time = 0
+
+ def convert_next(self, time_length: float):
+ out_wave = self.voice_changer_stream.convert_with_extra_time(
+ start_time=self._current_time,
+ time_length=time_length,
+ extra_time=self.extra_time,
+ )
+ self._current_time += time_length
+ return out_wave
+
+ def remove_previous_wave(self):
+ self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time)
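
The streaming classes are intended to be fed fixed-size chunks and drained one chunk at a time; a compressed sketch of that loop, mirroring convert_worker in the script added below (wrapper and audio_config as set up there; next_input_chunk and play are hypothetical stand-ins for the PyAudio callbacks):

# --- usage sketch, not part of this commit ---
from become_yukarin.data_struct import Wave

start_time = 0
time_length = audio_config.chunk / audio_config.rate
while True:
    chunk = next_input_chunk()                            # hypothetical: float32 samples from the microphone
    wave = Wave(wave=chunk, sampling_rate=audio_config.rate)
    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
    start_time += len(wave.wave) / wave.sampling_rate

    out = wrapper.convert_next(time_length=time_length)   # converts [current, current + time_length)
    wrapper.remove_previous_wave()                         # drop segments no longer needed for extra_time
    play(out.wave)                                         # hypothetical audio sink
# --- end sketch ---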
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
new file mode 100644
index 0000000..4c50963
--- /dev/null
+++ b/scripts/realtime_voice_changer.py
@@ -0,0 +1,142 @@
+import queue
+from functools import partial
+from pathlib import Path
+from typing import NamedTuple
+
+import numpy
+import pyaudio
+
+from become_yukarin import AcousticConverter
+from become_yukarin import RealtimeVocoder
+from become_yukarin import SuperResolution
+from become_yukarin import VoiceChanger
+from become_yukarin.config.config import create_from_json as create_config
+from become_yukarin.config.sr_config import create_from_json as create_sr_config
+from become_yukarin.data_struct import Wave
+from become_yukarin.voice_changer import VoiceChangerStream
+from become_yukarin.voice_changer import VoiceChangerStreamWrapper
+
+
+class AudioConfig(NamedTuple):
+ rate: int
+ chunk: int
+ vocoder_buffer_size: int
+ out_norm: float
+
+
+queue_input_wave = queue.Queue()
+queue_output_wave = queue.Queue()
+queue_output_fragment_wave = queue.Queue(maxsize=1)
+
+
+def convert_worker(audio_config: AudioConfig, wrapper: VoiceChangerStreamWrapper):
+ start_time = 0
+ time_length = audio_config.chunk / audio_config.rate
+ while True:
+ wave = queue_input_wave.get()
+ wave = Wave(wave=wave, sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
+ start_time += len(wave.wave) / wave.sampling_rate
+
+ wave = wrapper.convert_next(time_length=time_length)
+ queue_output_wave.put(wave.wave)
+ wrapper.remove_previous_wave()
+
+
+def input_callback(in_data, frame_count, time_info, status_flags, audio_config: AudioConfig):
+ print('input', status_flags, flush=True)
+ wave = numpy.frombuffer(in_data, dtype=numpy.float32)
+ queue_input_wave.put(wave)
+ return None, pyaudio.paContinue
+
+
+def output_callback(_, frame_count, time_info, status_flags, audio_config: AudioConfig):
+ print('output', status_flags, flush=True)
+ try:
+ wave = queue_output_fragment_wave.get_nowait()
+ except queue.Empty:
+ wave = numpy.empty(0)
+
+ while len(wave) < audio_config.chunk:
+ wave_next = queue_output_wave.get()
+ wave = numpy.concatenate([wave, wave_next])
+
+ wave, wave_fragment = wave[:audio_config.chunk], wave[audio_config.chunk:]
+ queue_output_fragment_wave.put(wave_fragment)
+
+ wave *= audio_config.out_norm
+ b = wave.astype(numpy.float32).tobytes()
+ return b, pyaudio.paContinue
+
+
+def main():
+ print('model loading...', flush=True)
+
+ model_path = Path('./trained/mfcc8-preconvert-innoise03/predictor_350000.npz')
+ config_path = Path('./trained/mfcc8-preconvert-innoise03/config.json')
+ config = create_config(config_path)
+ acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+ print('model 1 loaded!', flush=True)
+
+ model_path = Path('./trained/sr-noise3/predictor_70000.npz')
+ config_path = Path('./trained/sr-noise3/config.json')
+ sr_config = create_sr_config(config_path)
+ super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+ print('model 2 loaded!', flush=True)
+
+ audio_instance = pyaudio.PyAudio()
+ audio_config = AudioConfig(
+ rate=config.dataset.param.voice_param.sample_rate,
+ chunk=config.dataset.param.voice_param.sample_rate,
+ vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
+ out_norm=4.5,
+ )
+
+ vocoder = RealtimeVocoder(
+ acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+ out_sampling_rate=audio_config.rate,
+ buffer_size=audio_config.vocoder_buffer_size,
+ number_of_pointers=16,
+ )
+ vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
+
+ voice_changer = VoiceChanger(
+ super_resolution=super_resolution,
+ acoustic_converter=acoustic_converter,
+ vocoder=vocoder,
+ )
+
+ voice_changer_stream = VoiceChangerStream(
+ voice_changer=voice_changer,
+ sampling_rate=audio_config.rate,
+ in_dtype=numpy.float32,
+ )
+
+ wrapper = VoiceChangerStreamWrapper(
+ voice_changer_stream=voice_changer_stream,
+ extra_time=0.2,
+ )
+
+ input_audio_stream = audio_instance.open(
+ format=pyaudio.paFloat32,
+ channels=1,
+ rate=audio_config.rate,
+ frames_per_buffer=audio_config.chunk,
+ input=True,
+ stream_callback=partial(input_callback, audio_config=audio_config)
+ )
+
+ output_audio_stream = audio_instance.open(
+ format=pyaudio.paFloat32,
+ channels=1,
+ rate=audio_config.rate,
+ frames_per_buffer=audio_config.chunk,
+ output=True,
+ stream_callback=partial(output_callback, audio_config=audio_config)
+ )
+
+ convert_worker(audio_config, wrapper)
+
+
+if __name__ == '__main__':
+ main()