Diffstat (limited to 'become_yukarin')
| -rw-r--r-- | become_yukarin/voice_changer.py | 272 |
1 file changed, 227 insertions, 45 deletions
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index 5e0eac0..3b75fb1 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -15,7 +15,6 @@ class VoiceChanger(object):
             self,
             acoustic_converter: AcousticConverter,
             super_resolution: SuperResolution,
-            vocoder: Vocoder,
             output_sampling_rate: int = None,
     ) -> None:
         if output_sampling_rate is None:
@@ -23,18 +22,17 @@ class VoiceChanger(object):
 
         self.acoustic_converter = acoustic_converter
         self.super_resolution = super_resolution
-        self.vocoder = vocoder
         self.output_sampling_rate = output_sampling_rate
 
-    def convert_from_wave_path(self, wave_path: str):
-        w_in = self.acoustic_converter._wave_process(wave_path)
-        return self.convert_from_wave(w_in)
-
-    def convert_from_wave(self, wave: Wave):
-        f_in = self.acoustic_converter._feature_process(wave)
-        f_high = self.convert_from_acoustic_feature(f_in)
-        wave = self.vocoder.decode(f_high)
-        return wave
+    # def convert_from_wave_path(self, wave_path: str):
+    #     w_in = self.acoustic_converter._wave_process(wave_path)
+    #     return self.convert_from_wave(w_in)
+    #
+    # def convert_from_wave(self, wave: Wave):
+    #     f_in = self.acoustic_converter._feature_process(wave)
+    #     f_high = self.convert_from_acoustic_feature(f_in)
+    #     wave = self.vocoder.decode(f_high)
+    #     return wave
 
     def convert_from_acoustic_feature(self, f_in: AcousticFeature):
         f_low = self.acoustic_converter.convert_to_feature(f_in)
@@ -43,6 +41,20 @@ class VoiceChanger(object):
         return f_high
 
 
+class FeatureSegment(NamedTuple):
+    start_time: float
+    feature: AcousticFeature
+    frame_period: float
+
+    @property
+    def time_length(self):
+        return len(self.feature.f0) * self.frame_period / 1000
+
+    @property
+    def end_time(self):
+        return self.time_length + self.start_time
+
+
 class Segment(NamedTuple):
     start_time: float
     wave: Wave
@@ -59,18 +71,19 @@ class Segment(NamedTuple):
 class VoiceChangerStream(object):
     def __init__(
             self,
-            voice_changer: VoiceChanger,
             sampling_rate: int,
+            frame_period: float,
             in_dtype=numpy.float32,
     ):
-        self.voice_changer = voice_changer
         self.sampling_rate = sampling_rate
+        self.frame_period = frame_period
         self.in_dtype = in_dtype
-        self._data_stream = []  # type: List[Segment]
 
-    @property
-    def vocoder(self):
-        return self.voice_changer.vocoder
+        self.voice_changer: VoiceChanger = None
+        self.vocoder: Vocoder = None
+        self._data_stream = []  # type: List[Segment]
+        self._in_feature_stream = []  # type: List[FeatureSegment]
+        self._out_feature_stream = []  # type: List[FeatureSegment]
 
     def add_wave(self, start_time: float, wave: Wave):
         # validation
@@ -80,10 +93,30 @@
         segment = Segment(start_time=start_time, wave=wave)
         self._data_stream.append(segment)
 
-    def remove_wave(self, end_time: float):
+    def add_in_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+        # validation
+        assert frame_period == self.frame_period
+        assert feature.f0.dtype == self.in_dtype
+
+        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+        self._in_feature_stream.append(segment)
+
+    def add_out_feature(self, start_time: float, feature: AcousticFeature, frame_period: float):
+        # validation
+        assert frame_period == self.frame_period
+
+        segment = FeatureSegment(start_time=start_time, feature=feature, frame_period=self.frame_period)
+        self._out_feature_stream.append(segment)
+
+    def remove(self, end_time: float):
         self._data_stream = list(filter(lambda s: s.end_time > end_time, self._data_stream))
+        self._in_feature_stream = list(filter(lambda s: s.end_time > end_time, self._in_feature_stream))
+        self._out_feature_stream = list(filter(lambda s: s.end_time > end_time, self._out_feature_stream))
+
+    def pre_convert(self, start_time: float, time_length: float, extra_time: float):
+        start_time -= extra_time
+        time_length += extra_time * 2
 
-    def convert_to_feature(self, start_time: float, time_length: float):
         end_time = start_time + time_length
         buffer_list = []
         stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._data_stream)
@@ -123,38 +156,161 @@ class VoiceChangerStream(object):
         buffer = numpy.concatenate(buffer_list)
         in_wave = Wave(wave=buffer, sampling_rate=self.sampling_rate)
         in_feature = self.vocoder.encode(in_wave)
-        out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
-        return out_feature
 
-    def convert(self, start_time: float, time_length: float):
-        feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
-        out_wave = self.vocoder.decode(
-            acoustic_feature=feature,
+        pad = int(extra_time / (self.vocoder.acoustic_feature_param.frame_period / 1000))
+        in_feature = AcousticFeature(
+            f0=in_feature.f0[pad:-pad],
+            spectrogram=in_feature.spectrogram[pad:-pad],
+            aperiodicity=in_feature.aperiodicity[pad:-pad],
+            mfcc=in_feature.mfcc[pad:-pad],
+            voiced=in_feature.voiced[pad:-pad],
         )
-        return out_wave
-
-    def convert_with_extra_time(self, start_time: float, time_length: float, extra_time: float):
-        """
-        :param extra_time: Length of the extra data used during voice conversion. Prevents zero padding.
-        """
-        frame_period = self.vocoder.acoustic_feature_param.frame_period
+        return in_feature
 
+    def convert(self, start_time: float, time_length: float, extra_time: float):
         start_time -= extra_time
         time_length += extra_time * 2
 
-        extra_feature = self.convert_to_feature(start_time=start_time, time_length=time_length)
+        order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
+
+        end_time = start_time + time_length
+        f0_buffer_list = []
+        mfcc_buffer_list = []
+        ap_buffer_list = []
+        voiced_buffer_list = []
+        stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._in_feature_stream)
+
+        start_time_buffer = start_time
+        remaining_time = time_length
+        for segment in stream:
+            # padding
+            if segment.start_time > start_time_buffer:
+                pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period)
+                dims = AcousticFeature.get_sizes(self.sampling_rate, order)
+
+                f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+                mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype))
+                ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+                voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool))
+
+                start_time_buffer = segment.start_time
+            if remaining_time > segment.end_time - start_time_buffer:
+                one_time_length = segment.end_time - start_time_buffer
+            else:
+                one_time_length = remaining_time
+
+            first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period)
+            last_index = int(first_index + one_time_length * 1000 / self.frame_period)
+
+            f0_buffer_list.append(segment.feature.f0[first_index:last_index])
+            mfcc_buffer_list.append(segment.feature.mfcc[first_index:last_index])
+            ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index])
+            voiced_buffer_list.append(segment.feature.voiced[first_index:last_index])
+
+            start_time_buffer += one_time_length
+            remaining_time -= one_time_length
+
+            if start_time_buffer >= end_time:
+                break
+        else:
+            # last padding
+            pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period)
+            dims = AcousticFeature.get_sizes(self.sampling_rate, order)
+
+            f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+            mfcc_buffer_list.append(numpy.zeros(shape=[pad_size, dims['mfcc']], dtype=self.in_dtype))
+            ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+            voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool))
+
+        f0 = numpy.concatenate(f0_buffer_list)
+        mfcc = numpy.concatenate(mfcc_buffer_list)
+        aperiodicity = numpy.concatenate(ap_buffer_list)
+        voiced = numpy.concatenate(voiced_buffer_list)
+        in_feature = AcousticFeature(
+            f0=f0,
+            spectrogram=numpy.nan,
+            aperiodicity=aperiodicity,
+            mfcc=mfcc,
+            voiced=voiced,
+        )
+
+        out_feature = self.voice_changer.convert_from_acoustic_feature(in_feature)
+
+        pad = int(extra_time * 1000 / self.frame_period)
+        out_feature = AcousticFeature(
+            f0=out_feature.f0[pad:-pad],
+            spectrogram=out_feature.spectrogram[pad:-pad],
+            aperiodicity=out_feature.aperiodicity[pad:-pad],
+            mfcc=out_feature.mfcc[pad:-pad],
+            voiced=out_feature.voiced[pad:-pad],
+        )
+        return out_feature
+
+    def post_convert(self, start_time: float, time_length: float):
+        end_time = start_time + time_length
+        f0_buffer_list = []
+        sp_buffer_list = []
+        ap_buffer_list = []
+        voiced_buffer_list = []
+        stream = filter(lambda s: not (end_time < s.start_time or s.end_time < start_time), self._out_feature_stream)
+
+        start_time_buffer = start_time
+        remaining_time = time_length
+        for segment in stream:
+            # padding
+            if segment.start_time > start_time_buffer:
+                pad_size = int((segment.start_time - start_time_buffer) * 1000 / self.frame_period)
+                dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order)
+
+                f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+                sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype))
+                ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+                voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool))
+
+                start_time_buffer = segment.start_time
+
+            if remaining_time > segment.end_time - start_time_buffer:
+                one_time_length = segment.end_time - start_time_buffer
+            else:
+                one_time_length = remaining_time
+
+            first_index = int((start_time_buffer - segment.start_time) * 1000 / self.frame_period)
+            last_index = int(first_index + one_time_length * 1000 / self.frame_period)
+
+            f0_buffer_list.append(segment.feature.f0[first_index:last_index])
+            sp_buffer_list.append(segment.feature.spectrogram[first_index:last_index])
+            ap_buffer_list.append(segment.feature.aperiodicity[first_index:last_index])
+            voiced_buffer_list.append(segment.feature.voiced[first_index:last_index])
+
+            start_time_buffer += one_time_length
+            remaining_time -= one_time_length
+
+            if start_time_buffer >= end_time:
+                break
+        else:
+            # last padding
+            pad_size = int((end_time - start_time_buffer) * 1000 / self.frame_period)
+            dims = AcousticFeature.get_sizes(self.sampling_rate, self.vocoder.acoustic_feature_param.order)
+
+            f0_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=self.in_dtype))
+            sp_buffer_list.append(numpy.zeros(shape=[pad_size, dims['spectrogram']], dtype=self.in_dtype))
+            ap_buffer_list.append(numpy.zeros(shape=[pad_size, dims['aperiodicity']], dtype=self.in_dtype))
+            voiced_buffer_list.append(numpy.zeros(shape=[pad_size, 1], dtype=numpy.bool))
 
-        pad = int(extra_time / (frame_period / 1000))
-        feature = AcousticFeature(
-            f0=extra_feature.f0[pad:-pad],
-            spectrogram=extra_feature.spectrogram[pad:-pad],
-            aperiodicity=extra_feature.aperiodicity[pad:-pad],
-            mfcc=extra_feature.mfcc[pad:-pad],
-            voiced=extra_feature.voiced[pad:-pad],
+        f0 = numpy.concatenate(f0_buffer_list)
+        spectrogram = numpy.concatenate(sp_buffer_list)
+        aperiodicity = numpy.concatenate(ap_buffer_list)
+        voiced = numpy.concatenate(voiced_buffer_list)
+        out_feature = AcousticFeature(
+            f0=f0,
+            spectrogram=spectrogram,
+            aperiodicity=aperiodicity,
+            mfcc=numpy.nan,
+            voiced=voiced,
         )
 
         out_wave = self.vocoder.decode(
-            acoustic_feature=feature,
+            acoustic_feature=out_feature,
         )
         return out_wave
@@ -163,20 +319,46 @@ class VoiceChangerStreamWrapper(object):
     def __init__(
             self,
             voice_changer_stream: VoiceChangerStream,
-            extra_time: float = 0.0
+            extra_time_pre: float = 0.0,
+            extra_time: float = 0.0,
     ):
         self.voice_changer_stream = voice_changer_stream
+        self.extra_time_pre = extra_time_pre
         self.extra_time = extra_time
+        self._current_time_pre = 0
         self._current_time = 0
+        self._current_time_post = 0
+
+    def pre_convert_next(self, time_length: float):
+        in_feature = self.voice_changer_stream.pre_convert(
+            start_time=self._current_time_pre,
+            time_length=time_length,
+            extra_time=self.extra_time_pre,
+        )
+        self._current_time_pre += time_length
+        return in_feature
 
     def convert_next(self, time_length: float):
-        out_wave = self.voice_changer_stream.convert_with_extra_time(
+        out_feature = self.voice_changer_stream.convert(
             start_time=self._current_time,
             time_length=time_length,
             extra_time=self.extra_time,
         )
         self._current_time += time_length
+        return out_feature
+
+    def post_convert_next(self, time_length: float):
+        out_wave = self.voice_changer_stream.post_convert(
+            start_time=self._current_time_post,
+            time_length=time_length,
+        )
+        self._current_time_post += time_length
         return out_wave
 
-    def remove_previous_wave(self):
-        self.voice_changer_stream.remove_wave(end_time=self._current_time - self.extra_time)
+    def remove_previous(self):
+        end_time = min(
+            self._current_time_pre - self.extra_time_pre,
+            self._current_time - self.extra_time,
+            self._current_time_post,
+        )
+        self.voice_changer_stream.remove(end_time=end_time)
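
This commit splits the old one-shot convert_with_extra_time into three independently clocked stages: pre_convert (wave to input feature via vocoder.encode), convert (input feature to converted feature via the neural models), and post_convert (converted feature to output wave via vocoder.decode), each reading from and writing to its own time-indexed segment buffer. The following is a minimal, illustrative driver sketch, not code from this commit: acoustic_converter, super_resolution, vocoder, and the wave_chunks source are assumed to be built elsewhere, and the become_yukarin.data_struct import path is an assumption; only the voice_changer.py names are taken from this diff.

    from become_yukarin.data_struct import Wave  # assumed import path
    from become_yukarin.voice_changer import (
        VoiceChanger,
        VoiceChangerStream,
        VoiceChangerStreamWrapper,
    )

    sampling_rate = 16000
    frame_period = 5.0  # ms; must match the vocoder's frame period

    # acoustic_converter, super_resolution and vocoder are assumed to be
    # constructed elsewhere from trained models.
    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
    )

    stream = VoiceChangerStream(sampling_rate=sampling_rate, frame_period=frame_period)
    stream.voice_changer = voice_changer  # plain attributes after this commit
    stream.vocoder = vocoder

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=stream,
        extra_time_pre=0.2,  # context borrowed around each pre_convert window
        extra_time=0.5,      # context borrowed around each convert window
    )

    time_length = 1.0  # seconds processed per iteration
    wave_time = in_time = out_time = 0.0
    for chunk in wave_chunks:  # assumed iterable of 1 s float32 buffers
        stream.add_wave(start_time=wave_time, wave=Wave(wave=chunk, sampling_rate=sampling_rate))
        wave_time += time_length

        # Stage 1: wave -> input acoustic feature (vocoder.encode).
        in_feature = wrapper.pre_convert_next(time_length=time_length)
        stream.add_in_feature(start_time=in_time, feature=in_feature, frame_period=frame_period)
        in_time += time_length

        # Stage 2: input feature -> converted feature (the neural models).
        out_feature = wrapper.convert_next(time_length=time_length)
        stream.add_out_feature(start_time=out_time, feature=out_feature, frame_period=frame_period)
        out_time += time_length

        # Stage 3: converted feature -> output wave (vocoder.decode).
        out_wave = wrapper.post_convert_next(time_length=time_length)

        # Drop buffered segments that no stage can still reach.
        wrapper.remove_previous()

The stages run in lockstep here for brevity; the per-stage buffers and clocks exist so they can run in separate threads or processes. The extra_time trimming keeps each output aligned to its window: with frame_period = 5 ms and extra_time = 0.5 s, convert widens its query by 0.5 s on each side and then drops pad = int(0.5 * 1000 / 5) = 100 frames from each edge, so the returned feature covers exactly time_length seconds. remove_previous() takes the minimum of the three stage clocks (each minus its extra_time), so a slower stage never loses segments it still needs.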
