-rw-r--r--  become_yukarin/data_struct.py      |  64
-rw-r--r--  become_yukarin/voice_changer.py    | 219
-rw-r--r--  scripts/realtime_voice_changer.py  | 153
-rw-r--r--  tests/test_voice_changer.py        |  38
4 files changed, 355 insertions(+), 119 deletions(-)
diff --git a/become_yukarin/data_struct.py b/become_yukarin/data_struct.py
index 78c8cf3..4474331 100644
--- a/become_yukarin/data_struct.py
+++ b/become_yukarin/data_struct.py
@@ -1,8 +1,9 @@
-from typing import NamedTuple
+from typing import NamedTuple, Dict, List
+import numpy
import pyworld
-import numpy
+_min_mc = -18.3
class Wave(NamedTuple):
@@ -11,11 +12,21 @@ class Wave(NamedTuple):
class AcousticFeature(NamedTuple):
- f0: numpy.ndarray
- spectrogram: numpy.ndarray
- aperiodicity: numpy.ndarray
- mfcc: numpy.ndarray
- voiced: numpy.ndarray
+ f0: numpy.ndarray = numpy.nan
+ spectrogram: numpy.ndarray = numpy.nan
+ aperiodicity: numpy.ndarray = numpy.nan
+ mfcc: numpy.ndarray = numpy.nan
+ voiced: numpy.ndarray = numpy.nan
+
+ @staticmethod
+ def dtypes():
+ return dict(
+ f0=numpy.float32,
+ spectrogram=numpy.float32,
+ aperiodicity=numpy.float32,
+ mfcc=numpy.float32,
+ voiced=numpy.bool,
+ )
def astype(self, dtype):
return AcousticFeature(
@@ -51,6 +62,45 @@ class AcousticFeature(NamedTuple):
assert self.voiced.dtype == numpy.bool
@staticmethod
+ def silent(length: int, sizes: Dict[str, int], keys: List[str]):
+ d = {}
+ if 'f0' in keys:
+ d['f0'] = numpy.zeros((length, sizes['f0']), dtype=AcousticFeature.dtypes()['f0'])
+ if 'spectrogram' in keys:
+ d['spectrogram'] = numpy.zeros((length, sizes['spectrogram']),
+ dtype=AcousticFeature.dtypes()['spectrogram'])
+ if 'aperiodicity' in keys:
+ d['aperiodicity'] = numpy.zeros((length, sizes['aperiodicity']),
+ dtype=AcousticFeature.dtypes()['aperiodicity'])
+ if 'mfcc' in keys:
+ d['mfcc'] = numpy.hstack((
+ numpy.ones((length, 1), dtype=AcousticFeature.dtypes()['mfcc']) * _min_mc,
+ numpy.zeros((length, sizes['mfcc'] - 1), dtype=AcousticFeature.dtypes()['mfcc'])
+ ))
+ if 'voiced' in keys:
+ d['voiced'] = numpy.zeros((length, sizes['voiced']), dtype=AcousticFeature.dtypes()['voiced'])
+ feature = AcousticFeature(**d)
+ return feature
+
+ @staticmethod
+ def concatenate(fs: List['AcousticFeature'], keys: List[str]):
+ is_target = lambda a: not numpy.any(numpy.isnan(a))
+ return AcousticFeature(**{
+ key: numpy.concatenate([getattr(f, key) for f in fs]) if is_target(getattr(fs[0], key)) else numpy.nan
+ for key in keys
+ })
+
+ def pick(self, first: int, last: int):
+ is_target = lambda a: not numpy.any(numpy.isnan(a))
+ return AcousticFeature(
+ f0=self.f0[first:last] if is_target(self.f0) else numpy.nan,
+ spectrogram=self.spectrogram[first:last] if is_target(self.spectrogram) else numpy.nan,
+ aperiodicity=self.aperiodicity[first:last] if is_target(self.aperiodicity) else numpy.nan,
+ mfcc=self.mfcc[first:last] if is_target(self.mfcc) else numpy.nan,
+ voiced=self.voiced[first:last] if is_target(self.voiced) else numpy.nan,
+ )
+
+ @staticmethod
def get_sizes(sampling_rate: int, order: int):
fft_size = pyworld.get_cheaptrick_fft_size(fs=sampling_rate)
return dict(
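
The new AcousticFeature helpers (dtypes, silent, concatenate, pick) let the streaming code build padded, sliced feature buffers without handling each field by hand. A minimal sketch of how they compose, with an assumed sampling rate, order, and frame counts used purely for illustration:

    import numpy
    from become_yukarin.data_struct import AcousticFeature

    sizes = AcousticFeature.get_sizes(sampling_rate=24000, order=8)  # assumed example values
    keys = ['f0', 'aperiodicity', 'mfcc', 'voiced']

    pad = AcousticFeature.silent(10, sizes=sizes, keys=keys)      # 10 frames of silence
    merged = AcousticFeature.concatenate([pad, pad], keys=keys)   # 20 frames
    trimmed = merged.pick(5, -5)                                  # drop 5 frames per side
    print(trimmed.f0.shape)                                       # (10, sizes['f0'])

Fields not listed in keys (here spectrogram) stay at their numpy.nan default, and pick/concatenate skip them via the is_target check.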
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 5e0eac0..bed155f 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -1,4 +1,5 @@
-from typing import List
+from abc import ABCMeta, abstractproperty, abstractmethod
+from typing import List, Callable, Any
from typing import NamedTuple
import numpy
@@ -15,7 +16,6 @@ class VoiceChanger(object):
self,
acoustic_converter: AcousticConverter,
super_resolution: SuperResolution,
- vocoder: Vocoder,
output_sampling_rate: int = None,
) -> None:
if output_sampling_rate is None:
@@ -23,19 +23,8 @@ class VoiceChanger(object):
self.acoustic_converter = acoustic_converter
self.super_resolution = super_resolution
- self.vocoder = vocoder
self.output_sampling_rate = output_sampling_rate
- def convert_from_wave_path(self, wave_path: str):
- w_in = self.acoustic_converter._wave_process(wave_path)
- return self.convert_from_wave(w_in)
-
- def convert_from_wave(self, wave: Wave):
- f_in = self.acoustic_converter._feature_process(wave)
- f_high = self.convert_from_acoustic_feature(f_in)
- wave = self.vocoder.decode(f_high)
- return wave
-
def convert_from_acoustic_feature(self, f_in: AcousticFeature):
f_low = self.acoustic_converter.convert_to_feature(f_in)
s_high = self.super_resolution.convert(f_low.spectrogram.astype(numpy.float32))
@@ -43,7 +32,35 @@ class VoiceChanger(object):
return f_high
-class Segment(NamedTuple):
+class BaseSegment(ABCMeta):
+ start_time: float
+
+ @property
+ @abstractmethod
+ def time_length(self) -> float:
+ pass
+
+ @property
+ @abstractmethod
+ def end_time(self) -> float:
+ pass
+
+
+class FeatureSegment(NamedTuple, BaseSegment):
+ start_time: float
+ feature: AcousticFeature
+ frame_period: float
+
+ @property
+ def time_length(self):
+ return len(self.feature.f0) * self.frame_period / 1000
+
+ @property
+ def end_time(self):
+ return self.time_length + self.start_time
+
+
+class WaveSegment(NamedTuple, BaseSegment):
start_time: float
wave: Wave
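
A note on the abstract base: BaseSegment is declared with ABCMeta as a base class, which makes BaseSegment itself a metaclass (a subclass of type) rather than an ordinary abstract class, and typing.NamedTuple ignores or rejects additional bases depending on the Python version, so the inheritance in FeatureSegment/WaveSegment is best read as documentation of the shared interface. The conventional idiom for that interface, shown here only as a sketch independent of this commit, is:

    from abc import ABC, abstractmethod

    class BaseSegment(ABC):
        start_time: float

        @property
        @abstractmethod
        def time_length(self) -> float:
            ...

        @property
        @abstractmethod
        def end_time(self) -> float:
            ...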
@@ -59,44 +76,73 @@ class Segment(NamedTuple):
class VoiceChangerStream(object):
def __init__(
self,
- voice_changer: VoiceChanger,
sampling_rate: int,
+ frame_period: float,
in_dtype=numpy.float32,
):
- self.voice_changer = voice_changer
self.sampling_rate = sampling_rate
+ self.frame_period = frame_period
self.in_dtype = in_dtype
- self._data_stream = [] # type: List[Segment]
- @property
- def vocoder(self):
- return self.voice_changer.vocoder
+ self.voice_changer: VoiceChanger = None
+ self.vocoder: Vocoder = None
+ self._data_stream = [] # type: List[WaveSegment]
+ self._in_feature_stream = [] # type: List[FeatureSegment]
+ self._out_feature_stream = [] # type: List[FeatureSegment]
def add_wave(self, start_time: float, wave: Wave):
# validation
assert wave.sampling_rate == self.sampling_rate
assert wave.wave.dtype == self.in_dtype
- segment = Segment(start_time=start_time, wave=wave)
+ segment = WaveSegment(start_time=start_time, wave=wave)
self._data_stream.append(segment)
- def remove_wave(self, end_time: float):
+ def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+ # validation
+ assert frame_period == self.frame_period
+ assert feature.f0.dtype == self.in_dtype
+
+ segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+ self._in_feature_stream.append(segment)
+
+ def add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+ # validation
+ assert frame_period == self.frame_period
+
+ segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+ self._out_feature_stream.append(segment)
+
+ def remove(self, end_time: float):
self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
+ self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream))
+ self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream))
+
+ @staticmethod
+ def fetch(
+ start_time: float,
+ time_length: float,
+ data_stream: List[BaseSegment],
+ rate: float,
+ pad_function: Callable[[int], Any],
+ pick_function: Callable[[Any, int, int], Any],
+ concat_function: Callable[[List], Any],
+ extra_time: float = 0,
+ ):
+ start_time -= extra_time
+ time_length += extra_time * 2
- def convert_to_feature(self, start_time: float, time_length: float):
end_time = start_time + time_length
buffer_list = []
- stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
+ stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), data_stream)
start_time_buffer = start_time
remaining_time = time_length
for segment in stream:
# padding
if segment.start_time > start_time_buffer:
- pad = numpy.zeros(
- shape=int((segment.start_time - start_time_buffer) * self.sampling_rate),
- dtype=self.in_dtype,
- )
+ length = int((segment.start_time - start_time_buffer) * rate)
+ pad = pad_function(length)
buffer_list.append(pad)
start_time_buffer = segment.start_time
@@ -105,9 +151,9 @@ class VoiceChangerStream(object):
else:
one_time_length = remaining_time
- first_index = int((start_time_buffer - segment.start_time) * self.sampling_rate)
- last_index = int(first_index + one_time_length * self.sampling_rate)
- one_buffer = segment.wave.wave[first_index:last_index]
+ first_index = int((start_time_buffer - segment.start_time) * rate)
+ last_index = int(first_index + one_time_length * rate)
+ one_buffer = pick_function(segment, first_index, last_index)
buffer_list.append(one_buffer)
start_time_buffer += one_time_length
@@ -117,44 +163,67 @@ class VoiceChangerStream(object):
break
else:
# last padding
- pad = numpy.zeros(shape=int((end_time - start_time_buffer) * self.sampling_rate), dtype=self.in_dtype)
+ length = int((end_time - start_time_buffer) * rate)
+ pad = pad_function(length)
buffer_list.append(pad)
- buffer = numpy.concatenate(buffer_list)
- in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
- in_feature = self.vocoder.encode(in_wave)
- out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
- return out_feature
+ buffer = concat_function(buffer_list)
+ return buffer
- def convert(self, start_time: float, time_length: float):
- feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
- out_wave = self.vocoder.decode(
- acoustic_feature=feature,
+ def pre_convert(self, start_time: float, time_length: float, extra_time: float):
+ wave = self.fetch(
+ start_time=start_time,
+ time_length=time_length,
+ extra_time=extra_time,
+ data_stream=self._data_stream,
+ rate=self.sampling_rate,
+ pad_function=lambda length: numpy.zeros(shape=length, dtype=self.in_dtype),
+ pick_function=lambda segment, first, last: segment.wave.wave[first:last],
+ concat_function=numpy.concatenate,
)
- return out_wave
+ in_wave = Wave(wave=wave, sampling_rate=self.sampling_rate)
+ in_feature = self.vocoder.encode(in_wave)
- def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
- """
-        :param extra_time: Time length of the extra data used during voice conversion. Prevents zero padding at the edges.
- """
- frame_period = self.vocoder.acoustic_feature_param.frame_period
+ pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000))
+ in_feature = in_feature.pick(pad, -pad)
+ return in_feature
- start_time -= extra_time
- time_length += extra_time * 2
+ def convert(self, start_time: float, time_length: float, extra_time: float):
+ order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
+ sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order)
+ keys = ['f0', 'aperiodicity', 'mfcc', 'voiced']
+ in_feature = self.fetch(
+ start_time=start_time,
+ time_length=time_length,
+ extra_time=extra_time,
+ data_stream=self._in_feature_stream,
+ rate=1000 / self.frame_period,
+ pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
+ pick_function=lambda segment, first, last: segment.feature.pick(first, last),
+ concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
+ )
+ out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
- extra_feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
+ pad = int(extra_time * 1000 / self.frame_period)
+ out_feature = out_feature.pick(pad, -pad)
+ return out_feature
- pad = int(extra_time / (frame_period / 1000))
- feature = AcousticFeature(
- f0=extra_feature.f0[pad:-pad],
- spectrogram=extra_feature.spectrogram[pad:-pad],
- aperiodicity=extra_feature.aperiodicity[pad:-pad],
- mfcc=extra_feature.mfcc[pad:-pad],
- voiced=extra_feature.voiced[pad:-pad],
+ def post_convert(self, start_time: float, time_length: float):
+ order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
+ sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order)
+ keys = ['f0', 'aperiodicity', 'spectrogram', 'voiced']
+ out_feature = self.fetch(
+ start_time=start_time,
+ time_length=time_length,
+ data_stream=self._out_feature_stream,
+ rate=1000 / self.frame_period,
+ pad_function=lambda length: AcousticFeature.silent(length, sizes=sizes, keys=keys),
+ pick_function=lambda segment, first, last: segment.feature.pick(first, last),
+ concat_function=lambda buffers: AcousticFeature.concatenate(buffers, keys=keys),
)
out_wave = self.vocoder.decode(
- acoustic_feature=feature,
+ acoustic_feature=out_feature,
)
return out_wave
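
The extra_time bookkeeping in pre_convert and convert works in feature frames: fetch() pulls extra_time seconds of additional context on both sides, and pick(pad, -pad) trims it back off after encoding or conversion, so the caller only sees the requested time_length. A small worked example with an assumed frame period (the real value comes from the config):

    frame_period = 5.0                            # ms, assumed for illustration
    rate = 1000 / frame_period                    # 200 feature frames per second
    extra_time = 0.1                              # s of context on each side
    pad = int(extra_time * 1000 / frame_period)   # 20 frames trimmed per side

    # fetch() returns (time_length + 2 * extra_time) * rate frames;
    # pick(pad, -pad) leaves exactly time_length * rate frames.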
@@ -163,20 +232,46 @@ class VoiceChangerStreamWrapper(object):
def __init__(
self,
voice_changer_stream: VoiceChangerStream,
- extra_time: float = 0.0
+ extra_time_pre: float = 0.0,
+ extra_time: float = 0.0,
):
self.voice_changer_stream = voice_changer_stream
+ self.extra_time_pre = extra_time_pre
self.extra_time = extra_time
+ self._current_time_pre = 0
self._current_time = 0
+ self._current_time_post = 0
+
+ def pre_convert_next(self, time_length: float):
+ in_feature = self.voice_changer_stream.pre_convert(
+ start_time=self._current_time_pre,
+ time_length=time_length,
+ extra_time=self.extra_time_pre,
+ )
+ self._current_time_pre += time_length
+ return in_feature
def convert_next(self, time_length: float):
- out_wave = self.voice_changer_stream.convert_with_extra_time(
+ out_feature = self.voice_changer_stream.convert(
start_time=self._current_time,
time_length=time_length,
extra_time=self.extra_time,
)
self._current_time += time_length
+ return out_feature
+
+ def post_convert_next(self, time_length: float):
+ out_wave = self.voice_changer_stream.post_convert(
+ start_time=self._current_time_post,
+ time_length=time_length,
+ )
+ self._current_time_post += time_length
return out_wave
- def remove_previous_wave(self):
- self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time)
+ def remove_previous(self):
+ end_time = min(
+ self._current_time_pre - self.extra_time_pre,
+ self._current_time - self.extra_time,
+ self._current_time_post,
+ )
+ self.voice_changer_stream.remove(end_time=end_time)
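
Taken together, VoiceChangerStream and VoiceChangerStreamWrapper now expose a three-stage streaming interface, each stage with its own clock (_current_time_pre, _current_time, _current_time_post). A minimal single-threaded sketch of one step, where stream is wrapper.voice_changer_stream and the chunk wave w, chunk length dt, and per-stage start times are illustrative placeholders set up as in the script and test below:

    # stage 0: raw audio enters the stream
    stream.add_wave(start_time=t_wave, wave=w)

    # stage 1: wave -> input acoustic feature (vocoder.encode), trimmed by extra_time_pre
    f_in = wrapper.pre_convert_next(time_length=dt)
    stream.add_in_feature(start_time=t_in, feature=f_in, frame_period=frame_period)

    # stage 2: input feature -> converted feature (acoustic converter + super resolution)
    f_out = wrapper.convert_next(time_length=dt)
    stream.add_out_feature(start_time=t_out, feature=f_out, frame_period=frame_period)

    # stage 3: converted feature -> output wave (vocoder.decode)
    wave_out = wrapper.post_convert_next(time_length=dt)

    # drop segments that no stage can still need
    wrapper.remove_previous()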
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index a5d1a21..e96ce4e 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -14,92 +14,128 @@ import numpy
import pyaudio
from become_yukarin import AcousticConverter
+from become_yukarin import Vocoder
from become_yukarin import RealtimeVocoder
from become_yukarin import SuperResolution
from become_yukarin import VoiceChanger
+from become_yukarin.config.config import Config
from become_yukarin.config.config import create_from_json as create_config
from become_yukarin.config.sr_config import create_from_json as create_sr_config
from become_yukarin.data_struct import Wave
+from become_yukarin.data_struct import AcousticFeature
from become_yukarin.voice_changer import VoiceChangerStream
from become_yukarin.voice_changer import VoiceChangerStreamWrapper
class AudioConfig(NamedTuple):
rate: int
+ frame_period: float
audio_chunk: int
convert_chunk: int
vocoder_buffer_size: int
out_norm: float
-def convert_worker(
- config,
- acoustic_converter,
- super_resolution,
+def encode_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
audio_config: AudioConfig,
- queue_input_wave,
- queue_output_wave,
+ queue_input: Queue,
+ queue_output: Queue,
):
- vocoder = RealtimeVocoder(
+ wrapper.voice_changer_stream.vocoder = Vocoder(
acoustic_feature_param=config.dataset.param.acoustic_feature_param,
out_sampling_rate=audio_config.rate,
- buffer_size=audio_config.vocoder_buffer_size,
- number_of_pointers=16,
)
- # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
- voice_changer = VoiceChanger(
+ start_time = 0
+ time_length = audio_config.convert_chunk / audio_config.rate
+
+ while True:
+ wave = queue_input.get()
+
+ w = Wave(wave=wave, sampling_rate=audio_config.rate)
+ wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+ start_time += time_length
+
+ feature = wrapper.pre_convert_next(time_length=time_length)
+ queue_output.put(feature)
+
+
+def convert_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
+ acoustic_converter: AcousticConverter,
+ super_resolution: SuperResolution,
+ audio_config: AudioConfig,
+ queue_input: Queue,
+ queue_output: Queue,
+):
+ wrapper.voice_changer_stream.voice_changer = VoiceChanger(
super_resolution=super_resolution,
acoustic_converter=acoustic_converter,
- vocoder=vocoder,
)
- voice_changer_stream = VoiceChangerStream(
- voice_changer=voice_changer,
- sampling_rate=audio_config.rate,
- in_dtype=numpy.float32,
- )
+ start_time = 0
+ time_length = audio_config.convert_chunk / audio_config.rate
+ while True:
+ in_feature: AcousticFeature = queue_input.get()
+ wrapper.voice_changer_stream.add_in_feature(
+ start_time=start_time,
+ feature=in_feature,
+ frame_period=audio_config.frame_period,
+ )
+ start_time += time_length
- wrapper = VoiceChangerStreamWrapper(
- voice_changer_stream=voice_changer_stream,
- extra_time=0.1,
+ out_feature = wrapper.convert_next(time_length=time_length)
+ queue_output.put(out_feature)
+
+
+def decode_worker(
+ config: Config,
+ wrapper: VoiceChangerStreamWrapper,
+ audio_config: AudioConfig,
+ queue_input: Queue,
+ queue_output: Queue,
+):
+ wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
+ acoustic_feature_param=config.dataset.param.acoustic_feature_param,
+ out_sampling_rate=audio_config.rate,
+ buffer_size=audio_config.vocoder_buffer_size,
+ number_of_pointers=16,
)
+ # vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
start_time = 0
- wave = numpy.zeros(audio_config.convert_chunk * 2, dtype=numpy.float32)
- wave = Wave(wave=wave, sampling_rate=audio_config.rate)
- wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave)
- start_time += len(wave.wave) / wave.sampling_rate
- wave = wrapper.convert_next(time_length=1)
-
time_length = audio_config.convert_chunk / audio_config.rate
wave_fragment = numpy.empty(0)
while True:
- wave = queue_input_wave.get()
- w = Wave(wave=wave, sampling_rate=audio_config.rate)
- wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
+ feature: AcousticFeature = queue_input.get()
+ wrapper.voice_changer_stream.add_out_feature(
+ start_time=start_time,
+ feature=feature,
+ frame_period=audio_config.frame_period,
+ )
start_time += time_length
- b = time.time()
- wave = wrapper.convert_next(time_length=time_length).wave
- print('time', time.time()-b, flush=True)
- wrapper.remove_previous_wave()
- print('converted wave', len(wave), flush=True)
+ wave = wrapper.post_convert_next(time_length=time_length).wave
wave_fragment = numpy.concatenate([wave_fragment, wave])
if len(wave_fragment) >= audio_config.audio_chunk:
wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
- queue_output_wave.put(wave)
+ queue_output.put(wave)
def main():
print('model loading...', flush=True)
queue_input_wave = Queue()
+ queue_input_feature = Queue()
+ queue_output_feature = Queue()
queue_output_wave = Queue()
- model_path = Path('./trained/harvest-innoise03/predictor_1390000.npz')
- config_path = Path('./trained/harvest-innoise03/config.json')
+ model_path = Path('./trained/pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+ config_path = Path('./trained/pp-weakD-innoise01-tarnoise001/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, gpu=0)
print('model 1 loaded!', flush=True)
@@ -113,23 +149,53 @@ def main():
audio_instance = pyaudio.PyAudio()
audio_config = AudioConfig(
rate=config.dataset.param.voice_param.sample_rate,
+ frame_period=config.dataset.param.acoustic_feature_param.frame_period,
audio_chunk=config.dataset.param.voice_param.sample_rate,
convert_chunk=config.dataset.param.voice_param.sample_rate,
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
out_norm=2.5,
)
- process_converter = Process(target=convert_worker, kwargs=dict(
+ voice_changer_stream = VoiceChangerStream(
+ sampling_rate=audio_config.rate,
+ frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+ in_dtype=numpy.float32,
+ )
+
+ wrapper = VoiceChangerStreamWrapper(
+ voice_changer_stream=voice_changer_stream,
+ extra_time_pre=0.2,
+ extra_time=0.1,
+ )
+
+ process_encoder = Process(target=encode_worker, kwargs=dict(
config=config,
+ wrapper=wrapper,
audio_config=audio_config,
+ queue_input=queue_input_wave,
+ queue_output=queue_input_feature,
+ ))
+ process_encoder.start()
+
+ process_converter = Process(target=convert_worker, kwargs=dict(
+ config=config,
+ wrapper=wrapper,
acoustic_converter=acoustic_converter,
super_resolution=super_resolution,
- queue_input_wave=queue_input_wave,
- queue_output_wave=queue_output_wave,
+ audio_config=audio_config,
+ queue_input=queue_input_feature,
+ queue_output=queue_output_feature,
))
process_converter.start()
- signal.signal(signal.SIGINT, lambda signum, frame: process_converter.terminate())
+ process_decoder = Process(target=decode_worker, kwargs=dict(
+ config=config,
+ wrapper=wrapper,
+ audio_config=audio_config,
+ queue_input=queue_output_feature,
+ queue_output=queue_output_wave,
+ ))
+ process_decoder.start()
audio_stream = audio_instance.open(
format=pyaudio.paFloat32,
@@ -149,6 +215,11 @@ def main():
print('input', len(wave), flush=True)
queue_input_wave.put(wave)
+ print('queue_input_wave', queue_input_wave.qsize(), flush=True)
+ print('queue_input_feature', queue_input_feature.qsize(), flush=True)
+ print('queue_output_feature', queue_output_feature.qsize(), flush=True)
+ print('queue_output_wave', queue_output_wave.qsize(), flush=True)
+
# output
try:
wave = queue_output_wave.get_nowait()
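
With encode, convert, and decode split into separate processes, the script becomes a queue-connected pipeline: each worker blocks on its upstream queue, processes one chunk, and pushes the result downstream. Assuming the default fork start method on Linux, each process works on its own copy of wrapper inherited at start(), so the vocoder or voice_changer assigned inside a worker is local to that process. Roughly (queue and worker names as in the script, loop shape only a sketch):

    # mic -> queue_input_wave     -> encode_worker  (raw wave chunk -> input AcousticFeature)
    #     -> queue_input_feature  -> convert_worker (input feature -> converted feature)
    #     -> queue_output_feature -> decode_worker  (converted feature -> output wave chunk)
    #     -> queue_output_wave    -> audio output
    from multiprocessing import Queue

    def pipeline_stage(step, queue_input: Queue, queue_output: Queue):
        # generic shape shared by the three workers
        while True:
            queue_output.put(step(queue_input.get()))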
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
index ceddf9c..66ea003 100644
--- a/tests/test_voice_changer.py
+++ b/tests/test_voice_changer.py
@@ -32,16 +32,16 @@ test_output_path = Path('output.wav')
print('model loading...', flush=True)
-model_path = model_base_path / Path('harvest-innoise03/predictor_1390000.npz')
-config_path = model_base_path / Path('harvest-innoise03/config.json')
+model_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/predictor_120000.npz')
+config_path = model_base_path / Path('pp-weakD-innoise01-tarnoise001/config.json')
config = create_config(config_path)
-acoustic_converter = AcousticConverter(config, model_path, gpu=0)
+acoustic_converter = AcousticConverter(config, model_path)
print('model 1 loaded!', flush=True)
model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
-super_resolution = SuperResolution(sr_config, model_path, gpu=0)
+super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)
audio_config = AudioConfig(
@@ -50,6 +50,7 @@ audio_config = AudioConfig(
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
out_norm=4.5,
)
+frame_period = config.dataset.param.acoustic_feature_param.frame_period
vocoder = RealtimeVocoder(
acoustic_feature_param=config.dataset.param.acoustic_feature_param,
@@ -57,22 +58,24 @@ vocoder = RealtimeVocoder(
buffer_size=audio_config.vocoder_buffer_size,
number_of_pointers=16,
)
-# vocoder.warm_up(audio_config.vocoder_buffer_size / config.dataset.param.voice_param.sample_rate)
voice_changer = VoiceChanger(
super_resolution=super_resolution,
acoustic_converter=acoustic_converter,
- vocoder=vocoder,
)
voice_changer_stream = VoiceChangerStream(
- voice_changer=voice_changer,
sampling_rate=audio_config.rate,
+ frame_period=acoustic_converter._param.acoustic_feature_param.frame_period,
in_dtype=numpy.float32,
)
+voice_changer_stream.voice_changer = voice_changer
+voice_changer_stream.vocoder = vocoder
+
wrapper = VoiceChangerStreamWrapper(
voice_changer_stream=voice_changer_stream,
+ extra_time_pre=1,
extra_time=0.2,
)
@@ -85,9 +88,26 @@ for i in range(0, len(raw_wave), audio_config.chunk):
wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
start_time += len(wave_in.wave) / wave_in.sampling_rate
- wave_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
+ wrapper.voice_changer_stream.add_in_feature(start_time=start_time, feature=feature_in, frame_period=frame_period)
+ start_time += audio_config.chunk / audio_config.rate
+ print('pre', i, flush=True)
+
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.rate)
+ wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
+ start_time += audio_config.chunk / audio_config.rate
+ print('cent', i, flush=True)
+
+start_time = 0
+for i in range(len(raw_wave) // audio_config.chunk + 1):
+ wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.rate)
wave_out_list.append(wave_out)
- wrapper.remove_previous_wave()
+ start_time += audio_config.chunk / audio_config.rate
+ print('post', i, flush=True)
out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.rate)