summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hiroshiba Kazuyuki <hihokaruta@gmail.com> 2018-03-11 02:37:25 +0900
committer Hiroshiba Kazuyuki <hihokaruta@gmail.com> 2018-03-11 02:37:25 +0900
commit f8823b1913c29ce2710f92d51b74cb84b74323b0 (patch)
tree b6a20f58f30a413369520c6e391729c6a45ba383
parent ef2be3c389412e69d33d7da0066a4831500689d2 (diff)
無音を無視するように変更
-rw-r--r-- become_yukarin/config/config.py 3
-rw-r--r-- become_yukarin/voice_changer.py 9
-rw-r--r-- scripts/realtime_voice_changer.py 12
-rw-r--r-- tests/test_voice_changer.py 1
4 files changed, 17 insertions, 8 deletions
diff --git a/become_yukarin/config/config.py b/become_yukarin/config/config.py
index f1f24cf..ed128c4 100644
--- a/become_yukarin/config/config.py
+++ b/become_yukarin/config/config.py
@@ -145,3 +145,6 @@ def backward_compatible(d: Dict):
d['model']['generator_extensive_layers'] = 8
d['model']['discriminator_base_channels'] = 32
d['model']['discriminator_extensive_layers'] = 5
+
+ if 'weak_discriminator' not in d['model']:
+ d['model']['weak_discriminator'] = False
diff --git a/become_yukarin/voice_changer.py b/become_yukarin/voice_changer.py
index bed155f..7f7bbe4 100644
--- a/become_yukarin/voice_changer.py
+++ b/become_yukarin/voice_changer.py
@@ -4,6 +4,7 @@ from typing import NamedTuple
import numpy
+from become_yukarin.param import Param
from .acoustic_converter import AcousticConverter
from .data_struct import AcousticFeature
from .data_struct import Wave
@@ -78,10 +79,12 @@ class VoiceChangerStream(object):
self,
sampling_rate: int,
frame_period: float,
+ order: int,
in_dtype=numpy.float32,
):
self.sampling_rate = sampling_rate
self.frame_period = frame_period
+ self.order = order
self.in_dtype = in_dtype
self.voice_changer: VoiceChanger = None
@@ -189,8 +192,7 @@ class VoiceChangerStream(object):
return in_feature
def convert(self, start_time: float, time_length: float, extra_time: float):
- order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
- sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order)
+ sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
keys = ['f0', 'aperiodicity', 'mfcc', 'voiced']
in_feature = self.fetch(
start_time=start_time,
@@ -209,8 +211,7 @@ class VoiceChangerStream(object):
return out_feature
def post_convert(self, start_time: float, time_length: float):
- order = self.voice_changer.acoustic_converter.config.dataset.param.acoustic_feature_param.order
- sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=order)
+ sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
keys = ['f0', 'aperiodicity', 'spectrogram', 'voiced']
out_feature = self.fetch(
start_time=start_time,
diff --git a/scripts/realtime_voice_changer.py b/scripts/realtime_voice_changer.py
index e96ce4e..bda64dd 100644
--- a/scripts/realtime_voice_changer.py
+++ b/scripts/realtime_voice_changer.py
@@ -1,11 +1,9 @@
+import librosa
import world4py
world4py._WORLD_LIBRARY_PATH = 'x64_world.dll'
-from functools import partial
from pathlib import Path
-import signal
-import time
from typing import NamedTuple
from multiprocessing import Queue
from multiprocessing import Process
@@ -34,6 +32,7 @@ class AudioConfig(NamedTuple):
convert_chunk: int
vocoder_buffer_size: int
out_norm: float
+ silent_threshold: float
def encode_worker(
@@ -123,7 +122,10 @@ def decode_worker(
wave_fragment = numpy.concatenate([wave_fragment, wave])
if len(wave_fragment) >= audio_config.audio_chunk:
wave, wave_fragment = wave_fragment[:audio_config.audio_chunk], wave_fragment[audio_config.audio_chunk:]
- queue_output.put(wave)
+
+ power = librosa.core.power_to_db(numpy.abs(librosa.stft(wave)) ** 2).mean()
+ if power >= audio_config.silent_threshold:
+ queue_output.put(wave)
def main():
@@ -154,11 +156,13 @@ def main():
convert_chunk=config.dataset.param.voice_param.sample_rate,
vocoder_buffer_size=config.dataset.param.voice_param.sample_rate // 16,
out_norm=2.5,
+ silent_threshold=-99.0,
)
voice_changer_stream = VoiceChangerStream(
sampling_rate=audio_config.rate,
frame_period=config.dataset.param.acoustic_feature_param.frame_period,
+ order=config.dataset.param.acoustic_feature_param.order,
in_dtype=numpy.float32,
)
diff --git a/tests/test_voice_changer.py b/tests/test_voice_changer.py
index 66ea003..9772bbe 100644
--- a/tests/test_voice_changer.py
+++ b/tests/test_voice_changer.py
@@ -67,6 +67,7 @@ voice_changer = VoiceChanger(
voice_changer_stream = VoiceChangerStream(
sampling_rate=audio_config.rate,
frame_period=acoustic_converter._param.acoustic_feature_param.frame_period,
+ order=acoustic_converter._param.acoustic_feature_param.order,
in_dtype=numpy.float32,
)