| author | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-03-12 00:08:43 +0900 |
|---|---|---|
| committer | Hiroshiba Kazuyuki <hihokaruta@gmail.com> | 2018-03-12 00:08:43 +0900 |
| commit | 210e8225e4f7c95d6d0c89309b5b1eb20e41e744 | |
| tree | 2bd9f737040217ffb1eefda7ce61f1a3198390d8 | |
| parent | f8823b1913c29ce2710f92d51b74cb84b74323b0 | |
Split out the realtime functionality
| mode | file | deletions |
|---|---|---|
| -rw-r--r-- | become_yukarin/voice_changer.py | 252 |
| -rw-r--r-- | scripts/realtime_voice_changer.py | 241 |
| -rw-r--r-- | tests/test_voice_changer.py | 114 |
3 files changed, 0 insertions, 607 deletions
```diff
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 7f7bbe4..698244c 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,15 +1,8 @@
-from abc import ABCMeta, abstractproperty, abstractmethod
-from typing import List, Callable, Any
-from typing import NamedTuple
-
 import numpy
 
-from become_yukarin.param import Param
 from .acoustic_converter import AcousticConverter
 from .data_struct import AcousticFeature
-from .data_struct import Wave
 from .super_resolution import SuperResolution
-from .vocoder import Vocoder
 
 
 class VoiceChanger(object):
@@ -31,248 +24,3 @@ class VoiceChanger(object):
         s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
         f_high = self.super_resolution.convert_to_feature(s_high, f_low)
         return f_high
-
-
-class BaseSegment(ABCMeta):
-    start_time: float
-
-    @property
-    @abstractmethod
-    def time_length(self) -> float:
-        pass
-
-    @property
-    @abstractmethod
-    def end_time(self) -> float:
-        pass
-
-
-class FeatureSegment(NamedTuple, BaseSegment):
-    start_time: float
-    feature: AcousticFeature
-    frame_period: float
-
-    @property
-    def time_length(self):
-        return len(self.feature.f0) * self.frame_period / 1000
-
-    @property
-    def end_time(self):
-        return self.time_length + self.start_time
-
-
-class WaveSegment(NamedTuple, BaseSegment):
-    start_time: float
-    wave: Wave
-
-    @property
-    def time_length(self):
-        return len(self.wave.wave) / self.wave.sampling_rate
-
-    @property
-    def end_time(self):
-        return self.time_length + self.start_time
-
-
-class VoiceChangerStream(object):
-    def __init__(
-            self,
-            sampling_rate: int,
-            frame_period: float,
-            order: int,
-            in_dtype=numpy.float32,
-    ):
-        self.sampling_rate = sampling_rate
-        self.frame_period = frame_period
-        self.order = order
-        self.in_dtype = in_dtype
-
-        self.voice_changer: VoiceChanger = None
-        self.vocoder: Vocoder = None
-        self._data_stream = []  # type: List[WaveSegment]
-        self._in_feature_stream = []  # type: List[FeatureSegment]
-        self._out_feature_stream = []  # type: List[FeatureSegment]
-
-    def add_wave(self, start_time: float, wave: Wave):
-        # validation
-        assert wave.sampling_rate == self.sampling_rate
-        assert wave.wave.dtype == self.in_dtype
-
-        segment = WaveSegment(start_time=start_time, wave=wave)
-        self._data_stream.append(segment)
-
-    def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
-        # validation
-        assert frame_period == self.frame_period
-        assert feature.f0.dtype == self.in_dtype
-
-        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
-        self._in_feature_stream.append(segment)
-
-    def add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
-        # validation
-        assert frame_period == self.frame_period
-
-        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
-        self._out_feature_stream.append(segment)
-
-    def remove(self, end_time: float):
-        self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
-        self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream))
-        self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream))
-
-    @staticmethod
-    def fetch(
-            start_time: float,
-            time_length: float,
-            data_stream: List[BaseSegment],
-            rate: float,
-            pad_function: Callable[[int], Any],
-            pick_function: Callable[[Any, int, int], Any],
-            concat_function: Callable[[List], Any],
-            extra_time: float = 0,
-    ):
-        start_time -= extra_time
-        time_length += extra_time * 2
-
-        end_time = start_time + time_length
-        buffer_list = []
-        stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), data_stream)
-
-        start_time_buffer = start_time
-        remaining_time = time_length
-        for segment in stream:
-            # padding
-            if segment.start_time > start_time_buffer:
-                length = int((segment.start_time - start_time_buffer) * rate)
-                pad = pad_function(length)
-                buffer_list.append(pad)
-                start_time_buffer = segment.start_time
-
-            if remaining_time > segment.end_time - start_time_buffer:
-                one_time_length = segment.end_time - start_time_buffer
-            else:
-                one_time_length = remaining_time
-
-            first_index = int((start_time_buffer - segment.start_time) * rate)
-            last_index = int(first_index + one_time_length * rate)
-            one_buffer = pick_function(segment, first_index, last_index)
-            buffer_list.append(one_buffer)
-
-            start_time_buffer += one_time_length
-            remaining_time -= one_time_length
-
-            if start_time_buffer >= end_time:
-                break
-        else:
-            # last padding
-            length = int((end_time - start_time_buffer) * rate)
-            pad = pad_function(length)
-            buffer_list.append(pad)
-
-        buffer = concat_function(buffer_list)
-        return buffer
-
-    def pre_convert(self, start_time: float, time_length: float, extra_time: float):
-        wave = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            extra_time=extra_time,
-            data_stream=self._data_stream,
-            rate=self.sampling_rate,
-            pad_function=lambda length: numpy.zeros(shape=length, dtype=self.in_dtype),
-            pick_function=lambda segment, first, last: segment.wave.wave[first:last],
-            concat_function=numpy.concatenate,
-        )
-        in_wave = Wave(wave=wave, sampling_rate=self.sampling_rate)
-        in_feature = self.vocoder.encode(in_wave)
-
-        pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000))
-        in_feature = in_feature.pick(pad, -pad)
-        return in_feature
-
-    def convert(self, start_time: float, time_length: float, extra_time: float):
-        sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
-        keys = ['f0', 'aperiodicity', 'mfcc', 'voiced']
-        in_feature = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            extra_time=extra_time,
-            data_stream=self._in_feature_stream,
-            rate=1000 / self.frame_period,
-            pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
-            pick_function=lambda segment, first, last: segment.feature.pick(first, last),
-            concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
-        )
-        out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
-
-        pad = int(extra_time * 1000 / self.frame_period)
-        out_feature = out_feature.pick(pad, -pad)
-        return out_feature
-
-    def post_convert(self, start_time: float, time_length: float):
-        sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
-        keys = ['f0', 'aperiodicity', 'spectrogram', 'voiced']
-        out_feature = self.fetch(
-            start_time=start_time,
-            time_length=time_length,
-            data_stream=self._out_feature_stream,
-            rate=1000 / self.frame_period,
-            pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
-            pick_function=lambda segment, first, last: segment.feature.pick(first, last),
-            concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
-        )
-
-        out_wave = self.vocoder.decode(
-            acoustic_feature=out_feature,
-        )
-        return out_wave
-
-
-class VoiceChangerStreamWrapper(object):
-    def __init__(
-            self,
-            voice_changer_stream: VoiceChangerStream,
-            extra_time_pre: float = 0.0,
-            extra_time: float = 0.0,
-    ):
-        self.voice_changer_stream = voice_changer_stream
-        self.extra_time_pre = extra_time_pre
-        self.extra_time = extra_time
-        self._current_time_pre = 0
-        self._current_time = 0
-        self._current_time_post = 0
-
-    def pre_convert_next(self, time_length: float):
-        in_feature = self.voice_changer_stream.pre_convert(
-            start_time=self._current_time_pre,
-            time_length=time_length,
-            extra_time=self.extra_time_pre,
-        )
-        self._current_time_pre += time_length
-        return in_feature
-
-    def convert_next(self, time_length: float):
-        out_feature = self.voice_changer_stream.convert(
-            start_time=self._current_time,
-            time_length=time_length,
-            extra_time=self.extra_time,
-        )
-        self._current_time += time_length
-        return out_feature
-
-    def post_convert_next(self, time_length: float):
-        out_wave = self.voice_changer_stream.post_convert(
-            start_time=self._current_time_post,
-            time_length=time_length,
-        )
-        self._current_time_post += time_length
-        return out_wave
-
-    def remove_previous(self):
-        end_time = min(
-            self._current_time_pre - self.extra_time_pre,
-            self._current_time - self.extra_time,
-            self._current_time_post,
-        )
-        self.voice_changer_stream.remove(end_time=end_time)
```
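For reference, the core of the removed `VoiceChangerStream` is its `fetch` routine: given a list of time-stamped segments, it assembles an arbitrary `[start_time, start_time + time_length)` window, padding with silence before, between, and after segments, with optional `extra_time` margins that the callers later trim off. Below is a minimal, self-contained sketch of that windowing idea on plain NumPy arrays; the `Segment` type and the simplified `fetch` signature are illustrative stand-ins, not the project's actual API.

```python
from typing import List, NamedTuple

import numpy


class Segment(NamedTuple):
    start_time: float    # seconds
    data: numpy.ndarray  # samples at `rate`


def fetch(segments: List[Segment], start_time: float, time_length: float, rate: float) -> numpy.ndarray:
    """Assemble the window [start_time, start_time + time_length), zero-padding any gaps."""
    end_time = start_time + time_length
    out: List[numpy.ndarray] = []
    cursor = start_time
    for seg in segments:
        seg_end = seg.start_time + len(seg.data) / rate
        if seg_end <= cursor or seg.start_time >= end_time:
            continue  # segment does not overlap the requested window
        if seg.start_time > cursor:  # gap before this segment: pad with silence
            out.append(numpy.zeros(int((seg.start_time - cursor) * rate), dtype=seg.data.dtype))
            cursor = seg.start_time
        first = int((cursor - seg.start_time) * rate)
        last = int((min(seg_end, end_time) - seg.start_time) * rate)
        out.append(seg.data[first:last])
        cursor = min(seg_end, end_time)
        if cursor >= end_time:
            break
    if cursor < end_time:  # trailing gap: pad out to the full window length
        out.append(numpy.zeros(int((end_time - cursor) * rate), dtype=numpy.float32))
    return numpy.concatenate(out)


# e.g. two 1-second chunks with a 0.5 s hole between them:
rate = 8000
a = Segment(0.0, numpy.ones(rate, dtype=numpy.float32))
b = Segment(1.5, numpy.ones(rate, dtype=numpy.float32))
window = fetch([a, b], start_time=0.5, time_length=2.0, rate=rate)
assert len(window) == int(2.0 * rate)
```

The real class generalizes this with `pad_function`/`pick_function`/`concat_function` callbacks so the same walk works for raw waves and for `AcousticFeature` frame streams.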
```diff
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
deleted file mode 100644
index bda64dd..0000000
--- a/scripts/realtime_voice_changer.py
+++ /dev/null
@@ -1,241 +0,0 @@
-import librosa
-import world4py
-
-world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
-
-from pathlib import Path
-from typing import NamedTuple
-from multiprocessing import Queue
-from multiprocessing import Process
-
-import numpy
-import pyaudio
-
-from become_yukarin import AcousticConverter
-from become_yukarin import Vocoder
-from become_yukarin import RealtimeVocoder
-from become_yukarin import SuperResolution
-from become_yukarin import VoiceChanger
-from become_yukarin.config.config import Config
-from become_yukarin.config.config import create_from_json as create_config
-from become_yukarin.config.sr_config import create_from_json as create_sr_config
-from become_yukarin.data_struct import Wave
-from become_yukarin.data_struct import AcousticFeature
-from become_yukarin.voice_changer import VoiceChangerStream
-from become_yukarin.voice_changer import VoiceChangerStreamWrapper
-
-
-class AudioConfig(NamedTuple):
-    rate: int
-    frame_period: float
-    audio_chunk: int
-    convert_chunk: int
-    vocoder_buffer_size: int
-    out_norm: float
-    silent_threshold: float
-
-
-def encode_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.vocoder = Vocoder(
-        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-        out_sampling_rate=audio_config.rate,
-    )
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-
-    while True:
-        wave = queue_input.get()
-
-        w = Wave(wave=wave, sampling_rate=audio_config.rate)
-        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
-        start_time += time_length
-
-        feature = wrapper.pre_convert_next(time_length=time_length)
-        queue_output.put(feature)
-
-
-def convert_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        acoustic_converter: AcousticConverter,
-        super_resolution: SuperResolution,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
-        super_resolution=super_resolution,
-        acoustic_converter=acoustic_converter,
-    )
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-    while True:
-        in_feature: AcousticFeature = queue_input.get()
-        wrapper.voice_changer_stream.add_in_feature(
-            start_time=start_time,
-            feature=in_feature,
-            frame_period=audio_config.frame_period,
-        )
-        start_time += time_length
-
-        out_feature = wrapper.convert_next(time_length=time_length)
-        queue_output.put(out_feature)
-
-
-def decode_worker(
-        config: Config,
-        wrapper: VoiceChangerStreamWrapper,
-        audio_config: AudioConfig,
-        queue_input: Queue,
-        queue_output: Queue,
-):
-    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
-        acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-        out_sampling_rate=audio_config.rate,
-        buffer_size=audio_config.vocoder_buffer_size,
-        number_of_pointers=16,
-    )
-    # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
-
-    start_time = 0
-    time_length = audio_config.convert_chunk / audio_config.rate
-    wave_fragment = numpy.empty(0)
-    while True:
-        feature: AcousticFeature = queue_input.get()
-        wrapper.voice_changer_stream.add_out_feature(
-            start_time=start_time,
-            feature=feature,
-            frame_period=audio_config.frame_period,
-        )
-        start_time += time_length
-
-        wave = wrapper.post_convert_next(time_length=time_length).wave
-
-        wave_fragment = numpy.concatenate([wave_fragment, wave])
-        if len(wave_fragment) >= audio_config.audio_chunk:
-            wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
-
-            power = librosa.core.power_to_db(numpy.abs(librosa.stft(wave)) ** 2).mean()
-            if power >= audio_config.silent_threshold:
-                queue_output.put(wave)
-
-
-def main():
-    print('model loading...', flush=True)
-
-    queue_input_wave = Queue()
-    queue_input_feature = Queue()
-    queue_output_feature = Queue()
-    queue_output_wave = Queue()
-
-    model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
-    config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
-    config = create_config(config_path)
-    acoustic_converter = AcousticConverter(config, model_path, gpu=0)
-    print('model 1 loaded!', flush=True)
-
-    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
-    config_path = Path('./trained/sr-noise3/config.json')
-    sr_config = create_sr_config(config_path)
-    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
-    print('model 2 loaded!', flush=True)
-
-    audio_instance = pyaudio.PyAudio()
-    audio_config = AudioConfig(
-        rate=config.dataset.param.voice_param.sample_rate,
-        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
-        audio_chunk=config.dataset.param.voice_param.sample_rate,
-        convert_chunk=config.dataset.param.voice_param.sample_rate,
-        vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
-        out_norm=2.5,
-        silent_threshold=-99.0,
-    )
-
-    voice_changer_stream = VoiceChangerStream(
-        sampling_rate=audio_config.rate,
-        frame_period=config.dataset.param.acoustic_feature_param.frame_period,
-        order=config.dataset.param.acoustic_feature_param.order,
-        in_dtype=numpy.float32,
-    )
-
-    wrapper = VoiceChangerStreamWrapper(
-        voice_changer_stream=voice_changer_stream,
-        extra_time_pre=0.2,
-        extra_time=0.1,
-    )
-
-    process_encoder = Process(target=encode_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        audio_config=audio_config,
-        queue_input=queue_input_wave,
-        queue_output=queue_input_feature,
-    ))
-    process_encoder.start()
-
-    process_converter = Process(target=convert_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        acoustic_converter=acoustic_converter,
-        super_resolution=super_resolution,
-        audio_config=audio_config,
-        queue_input=queue_input_feature,
-        queue_output=queue_output_feature,
-    ))
-    process_converter.start()
-
-    process_decoder = Process(target=decode_worker, kwargs=dict(
-        config=config,
-        wrapper=wrapper,
-        audio_config=audio_config,
-        queue_input=queue_output_feature,
-        queue_output=queue_output_wave,
-    ))
-    process_decoder.start()
-
-    audio_stream = audio_instance.open(
-        format=pyaudio.paFloat32,
-        channels=1,
-        rate=audio_config.rate,
-        frames_per_buffer=audio_config.audio_chunk,
-        input=True,
-        output=True,
-    )
-
-    # process_converter.join()
-
-    while True:
-        # input audio
-        in_data = audio_stream.read(audio_config.audio_chunk)
-        wave = numpy.fromstring(in_data, dtype=numpy.float32)
-        print('input', len(wave), flush=True)
-        queue_input_wave.put(wave)
-
-        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
-        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
-        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
-        print('queue_output_wave', queue_output_wave.qsize(), flush=True)
-
-        # output
-        try:
-            wave = queue_output_wave.get_nowait()
-        except:
-            wave = None
-
-        if wave is not None:
-            print('output', len(wave), flush=True)
-            wave *= audio_config.out_norm
-            b = wave.astype(numpy.float32).tobytes()
-            audio_stream.write(b)
-
-
-if __name__ == '__main__':
-    main()
```
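The deleted script splits the realtime path into three processes — encode (wave → acoustic feature), convert (source feature → target feature), and decode (target feature → wave) — connected by `multiprocessing.Queue`s so the stages overlap in time, while the main process does blocking PyAudio I/O. (As an aside, `numpy.fromstring` used above is deprecated in modern NumPy; `numpy.frombuffer` is the current equivalent.) Here is a stripped-down sketch of that queue-pipeline pattern; the worker bodies are placeholder arithmetic, not the project's models.

```python
from multiprocessing import Process, Queue

import numpy


def encode_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # wave chunk -> acoustic feature
        wave = q_in.get()
        q_out.put(wave.mean())  # placeholder for vocoder feature extraction


def convert_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # source feature -> target feature
        feature = q_in.get()
        q_out.put(feature * 2.0)  # placeholder for the learned conversion


def decode_worker(q_in: Queue, q_out: Queue) -> None:
    while True:  # target feature -> wave chunk
        feature = q_in.get()
        q_out.put(numpy.full(1024, feature, dtype=numpy.float32))  # placeholder synthesis


if __name__ == '__main__':
    q_wave, q_feat_in, q_feat_out, q_out = Queue(), Queue(), Queue(), Queue()
    stages = [
        Process(target=encode_worker, args=(q_wave, q_feat_in), daemon=True),
        Process(target=convert_worker, args=(q_feat_in, q_feat_out), daemon=True),
        Process(target=decode_worker, args=(q_feat_out, q_out), daemon=True),
    ]
    for p in stages:
        p.start()

    q_wave.put(numpy.zeros(1024, dtype=numpy.float32))
    print(len(q_out.get()))  # 1024: one converted chunk came out the far end
```

Each worker owns the state for its own stage (in the original, the vocoder or the converter models), so the heavyweight neural conversion never blocks audio capture or playback.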
```diff
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
deleted file mode 100644
index 9772bbe..0000000
--- a/tests/test_voice_changer.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import world4py
-world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
-
-
-from pathlib import Path
-from typing import NamedTuple
-
-import librosa
-import numpy
-
-from become_yukarin import AcousticConverter
-from become_yukarin import RealtimeVocoder
-from become_yukarin import SuperResolution
-from become_yukarin import VoiceChanger
-from become_yukarin.config.config import create_from_json as create_config
-from become_yukarin.config.sr_config import create_from_json as create_sr_config
-from become_yukarin.data_struct import Wave
-from become_yukarin.voice_changer import VoiceChangerStream
-from become_yukarin.voice_changer import VoiceChangerStreamWrapper
-
-
-class AudioConfig(NamedTuple):
-    rate: int
-    chunk: int
-    vocoder_buffer_size: int
-    out_norm: float
-
-
-model_base_path = Path('~/Github/become-yukarin/trained/').expanduser()
-test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
-test_output_path = Path('output.wav')
-
-print('model loading...', flush=True)
-
-model_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
-config_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/config.json')
-config = create_config(config_path)
-acoustic_converter = AcousticConverter(config, model_path)
-print('model 1 loaded!', flush=True)
-
-model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
-config_path = model_base_path / Path('sr-noise3/config.json')
-sr_config = create_sr_config(config_path)
-super_resolution = SuperResolution(sr_config, model_path)
-print('model 2 loaded!', flush=True)
-
-audio_config = AudioConfig(
-    rate=config.dataset.param.voice_param.sample_rate,
-    chunk=config.dataset.param.voice_param.sample_rate,
-    vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
-    out_norm=4.5,
-)
-frame_period = config.dataset.param.acoustic_feature_param.frame_period
-
-vocoder = RealtimeVocoder(
-    acoustic_feature_param=config.dataset.param.acoustic_feature_param,
-    out_sampling_rate=audio_config.rate,
-    buffer_size=audio_config.vocoder_buffer_size,
-    number_of_pointers=16,
-)
-
-voice_changer = VoiceChanger(
-    super_resolution=super_resolution,
-    acoustic_converter=acoustic_converter,
-)
-
-voice_changer_stream = VoiceChangerStream(
-    sampling_rate=audio_config.rate,
-    frame_period=acoustic_converter._param.acoustic_feature_param.frame_period,
-    order=acoustic_converter._param.acoustic_feature_param.order,
-    in_dtype=numpy.float32,
-)
-
-voice_changer_stream.voice_changer = voice_changer
-voice_changer_stream.vocoder = vocoder
-
-wrapper = VoiceChangerStreamWrapper(
-    voice_changer_stream=voice_changer_stream,
-    extra_time_pre=1,
-    extra_time=0.2,
-)
-
-raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
-wave_out_list = []
-
-start_time = 0
-for i in range(0, len(raw_wave), audio_config.chunk):
-    wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
-    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
-    start_time += len(wave_in.wave) / wave_in.sampling_rate
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wrapper.voice_changer_stream.add_in_feature(start_time=start_time, feature=feature_in, frame_period=frame_period)
-    start_time += audio_config.chunk / audio_config.rate
-    print('pre', i, flush=True)
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
-    start_time += audio_config.chunk / audio_config.rate
-    print('cent', i, flush=True)
-
-start_time = 0
-for i in range(len(raw_wave) // audio_config.chunk + 1):
-    wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.rate)
-    wave_out_list.append(wave_out)
-    start_time += audio_config.chunk / audio_config.rate
-    print('post', i, flush=True)
-
-out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
-librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)
```
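The deleted test drives the same machinery offline: load a wav, feed it to the stream in one-second chunks, then run the pre-convert, convert, and post-convert passes chunk by chunk and concatenate the output. (Note that `librosa.output.write_wav` used at the end was removed in librosa 0.8; `soundfile.write` is the usual replacement today.) A generic sketch of that chunk-and-stitch pattern, with a trivial identity conversion standing in for the removed wrapper:

```python
import numpy


def process_in_chunks(wave: numpy.ndarray, chunk: int, convert) -> numpy.ndarray:
    """Run `convert` over fixed-size chunks and stitch the results back together."""
    out = []
    for i in range(0, len(wave), chunk):
        piece = wave[i:i + chunk]
        if len(piece) < chunk:  # zero-pad the tail so every chunk has the same length
            piece = numpy.pad(piece, (0, chunk - len(piece)))
        out.append(convert(piece))
    return numpy.concatenate(out)[:len(wave)]


# an identity "conversion" round-trips the signal exactly:
rate = 24000
wave = numpy.random.default_rng(0).standard_normal(rate * 3 + 123).astype(numpy.float32)
restored = process_in_chunks(wave, chunk=rate, convert=lambda w: w)
assert numpy.allclose(restored, wave)
```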
