diff --git a/conda-start.bat b/conda-start.bat
index ecc5e2a..ddfad7b 100644
--- a/conda-start.bat
+++ b/conda-start.bat
@@ -5,7 +5,7 @@
 for /f %%i in ('cd') do set ENV_NAME=%%~nxi
 SET INSTALL_DIR=%userprofile%\Miniconda3
 SET PATH=%INSTALL_DIR%\condabin;%PATH%
-conda info --envs | findstr /i %ENV_NAME%
+CALL conda info --envs | findstr /i %ENV_NAME%
 if %errorlevel% == 0 (
     echo %ENV_NAME% environment is already available
 ) else (
diff --git a/i18n/en_US.json b/i18n/en_US.json
index 4b40ab9..4ec92d6 100644
--- a/i18n/en_US.json
+++ b/i18n/en_US.json
@@ -14,6 +14,7 @@
     "inference.postprocess_model": "Choose one or more postprocessing model to run in sequence (optional)",
     "inference.device": "Device",
     "inference.format": "Audio Format",
+    "inference.channels": "Output Channels",
     "inference.merge_type": "Merge Type",
     "inference.agg": "Aggressiveness in isolating vocals",
     "inference.use_cache": "Reduce future processing time by saving results to disk",
diff --git a/lib/audio.py b/lib/audio.py
index cf1794e..72ef6c9 100644
--- a/lib/audio.py
+++ b/lib/audio.py
@@ -10,6 +10,7 @@
 MAX_INT16 = 32768
 SUPPORTED_AUDIO = ["mp3","flac","wav"] # ogg breaks soundfile
+OUTPUT_CHANNELS = ["mono", "stereo"]
 
 AUTOTUNE_NOTES = np.array([
     65.41, 69.30, 73.42, 77.78, 82.41, 87.31,
     92.50, 98.00, 103.83, 110.00, 116.54, 123.47,
@@ -25,7 +26,7 @@
     2959.96, 3135.96, 3322.44, 3520.00, 3729.31, 3951.07
 ])
 
-def load_audio(file, sr=None):
+def load_audio(file, sr, **kwargs):
     try:
         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
@@ -41,9 +42,9 @@ def load_audio(file, sr=None):
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
 
-    return np.frombuffer(out, np.float32).flatten()
+    return np.frombuffer(out, np.float32).flatten(), sr
 
-def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=False,to_mono=False,axis=0,**kwargs):
+def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=False,axis=0,**kwargs):
     audio = np.array(input_audio[0],dtype="float32")
     if target_sr is None: target_sr=input_audio[1]
 
@@ -51,13 +52,12 @@ def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=Fa
     if resample or input_audio[1]!=target_sr:
         audio = librosa.core.resample(np.array(input_audio[0],dtype="float32"),orig_sr=input_audio[1],target_sr=target_sr,**kwargs)
 
-    if to_mono and audio.ndim>1: audio=np.nanmedian(audio,axis)
+    if audio.ndim>1: audio=np.nanmedian(audio,axis=axis)
+    if norm: audio = librosa.util.normalize(audio,axis=axis)
-    if norm: audio = librosa.util.normalize(audio)
-
-    audio_max = np.abs(audio).max() / 0.95
-    if audio_max > 1: audio /= audio_max
-
+    audio_max = np.abs(audio).max()/.99
+    if audio_max > 1: audio = audio / audio_max
+
     if to_int16:
         audio = np.clip(audio * MAX_INT16, a_min=-MAX_INT16+1, a_max=MAX_INT16-1).astype("int16")
 
     print(f"after remix: shape={audio.shape}, max={audio.max()}, min={audio.min()}, mean={audio.mean()}, sr={target_sr}")
@@ -65,23 +65,26 @@ def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=Fa
 def load_input_audio(fname,sr=None,**kwargs):
     if sr is None: sr=44100
-    audio = load_audio(fname, sr)
+    sound = load_audio(fname, sr, **kwargs)
     # sound = librosa.load(fname,sr=sr,**kwargs)
-    sound = audio, sr
     print(f"loading sound {fname} {sound[0].shape} {sound[1]} {sound[0].dtype}")
     return sound
 
-def save_input_audio(fname,input_audio,sr=None,to_int16=False):
+def save_input_audio(fname,input_audio,sr=None,to_int16=False,to_stereo=False):
     print(f"saving sound to {fname}")
     os.makedirs(os.path.dirname(fname),exist_ok=True)
-    audio=np.array(input_audio[0],dtype="int16" if np.abs(input_audio[0]).max()>100 else "float32")
+    audio=np.array(input_audio[0],dtype="float32")
+
     if to_int16:
-        max_a = np.abs(audio).max() * .99
-        if max_a<1:
-            audio=(audio*max_a*MAX_INT16)
-        audio=audio.astype("int16")
+        audio_max = np.abs(audio).max()/.99
+        if audio_max > 1: audio = audio / audio_max
+        audio = np.clip(audio * MAX_INT16, a_min=-MAX_INT16+1, a_max=MAX_INT16-1)
+
+    if to_stereo and audio.ndim<2: audio=np.stack([audio,audio],axis=-1)
+    print(f"{audio.shape=}")
+
     try:
-        sf.write(fname, audio, sr if sr else input_audio[1])
+        sf.write(fname, audio.astype("int16" if np.abs(audio).max()>1 else "float32"), sr if sr else input_audio[1])
         return f"File saved to ${fname}"
     except Exception as e:
         return f"failed to save audio: {e}"
@@ -156,12 +159,12 @@ def pad_audio(*audios,axis=0):
 
 def merge_audio(audio1,audio2,sr=40000):
     print(f"merging audio audio1={audio1[0].shape,audio1[1]} audio2={audio2[0].shape,audio2[1]} sr={sr}")
-    m1,_=remix_audio(audio1,target_sr=sr)
-    m2,_=remix_audio(audio2,target_sr=sr)
+    m1,_=remix_audio(audio1,target_sr=sr,axis=0)
+    m2,_=remix_audio(audio2,target_sr=sr,axis=0)
 
-    mixed = pad_audio(m1,m2)
+    mixed = pad_audio(m1,m2,axis=0)
 
-    return remix_audio((mixed,sr),to_int16=True,norm=True,to_mono=True,axis=0)
+    return remix_audio((mixed,sr),to_int16=True,axis=0,norm=True)
 
 def autotune_f0(f0, threshold=0.):
     # autotuned_f0 = []
diff --git a/lib/karafan/inference.py b/lib/karafan/inference.py
index fe810ef..c25e589 100644
--- a/lib/karafan/inference.py
+++ b/lib/karafan/inference.py
@@ -493,7 +493,7 @@ def SEPARATE(self, audio_file, BATCH_MODE=False):
 
 			self.Save_Audio(6, vocal_final)
 
-		output_vocals = remix_audio((vocal_final, self.sample_rate),norm=True,to_int16=True,to_mono=True)
+		output_vocals = remix_audio((vocal_final, self.sample_rate),to_int16=True)
 
 		print("► Save Music FINAL !")
 
@@ -505,7 +505,7 @@ def SEPARATE(self, audio_file, BATCH_MODE=False):
 
 			self.Save_Audio(7, music_final)
 
-		output_music = remix_audio((music_final, self.sample_rate),norm=True,to_int16=True,to_mono=True)
+		output_music = remix_audio((music_final, self.sample_rate),to_int16=True)
 
 		print('--> Processing DONE !')
 
diff --git a/lib/separators.py b/lib/separators.py
index a41192a..38574de 100644
--- a/lib/separators.py
+++ b/lib/separators.py
@@ -152,7 +152,7 @@ def process_vocals(self,v_spec_m,input_high_end,input_high_end_h,return_dict={})
         else:
             wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
         print(f"vocals done: {wav_vocals.shape}")
-        return_dict["vocals"] = remix_audio((wav_vocals,return_dict["sr"]),norm=True,to_int16=True,to_mono=True,axis=0)
+        return_dict["vocals"] = remix_audio((wav_vocals,return_dict["sr"]),to_int16=True,axis=0)
         return return_dict["vocals"]
 
     def process_instrumental(self,y_spec_m,input_high_end,input_high_end_h,return_dict={}):
@@ -166,7 +166,7 @@ def process_instrumental(self,y_spec_m,input_high_end,input_high_end_h,return_di
         else:
             wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
         print(f"instruments done: {wav_instrument.shape}")
-        return_dict["instrumentals"] = remix_audio((wav_instrument,return_dict["sr"]),norm=True,to_int16=True,to_mono=True,axis=0)
+        return_dict["instrumentals"] = remix_audio((wav_instrument,return_dict["sr"]),to_int16=True,axis=0)
         return return_dict["instrumentals"]
 
     def process_audio(self,y_spec_m,v_spec_m,input_high_end,input_high_end_h):
diff --git a/pages/1_Inference.py b/pages/1_Inference.py
index 8032981..b5e419e 100644
--- a/pages/1_Inference.py
+++ b/pages/1_Inference.py
@@ -9,7 +9,7 @@
 from webui.components import file_uploader_form, initial_vocal_separation_params, initial_voice_conversion_params, save_vocal_separation_params, save_voice_conversion_params, vocal_separation_form, voice_conversion_form
 from lib.utils import ObjectNamespace
 from webui.contexts import SessionStateContext
-from lib.audio import SUPPORTED_AUDIO, bytes_to_audio, merge_audio, remix_audio, save_input_audio
+from lib.audio import OUTPUT_CHANNELS, SUPPORTED_AUDIO, bytes_to_audio, merge_audio, remix_audio, save_input_audio
 from lib.utils import gc_collect, get_filenames, get_index, get_optimal_torch_device
 
@@ -37,6 +37,7 @@ def init_inference_state():
         rvc_models=None,
         device=get_optimal_torch_device(),
         format="flac",
+        channels="mono",
         models=get_rvc_models(),
         model_name=None,
 
@@ -95,13 +96,13 @@ def one_click_convert(state):
     state.output_audio = mixed_audio
     return state
 
-def download_song(output_audio,output_audio_name,ext="mp3"):
+def download_song(output_audio,output_audio_name,ext="mp3",to_stereo=False):
     audio_path = output_audio_name.split(".")
     output_dir = os.path.join(OUTPUT_DIR,"inference",audio_path[0])
     os.makedirs(output_dir,exist_ok=True)
-    output_file = os.path.join(output_dir,f"{audio_path[1]}.{ext}")
-    if save_input_audio(output_file,output_audio,to_int16=True):
+    output_file = os.path.join(output_dir,".".join([audio_path[1],"stereo" if to_stereo else "mono",ext]))
+    if save_input_audio(output_file,output_audio,to_int16=True,to_stereo=to_stereo):
         return f"successfully saved to {output_file}"
     else: "failed to save"
 
@@ -154,25 +155,17 @@
             index=get_index(state.models,state.model_name),
             format_func=lambda option: os.path.basename(option).split(".")[0]
         )
-        # col1, col2 = right.columns(2)
-        # if col1.button(i18n("inference.load_model.button"),use_container_width=True, type="primary"):
-        #     del state.rvc_models
-        #     state.rvc_models = load_model(state)
-        #     gc_collect()
-        # if col2.button(i18n("inference.clear_data.button"),use_container_width=True):
-        #     state = clear_data(state)
-        #     st.experimental_rerun()
-        col1, col2 = st.columns(2)
-        state.device = col1.radio(
-            i18n("inference.device"),
-            disabled=not config.has_gpu,
-            options=DEVICE_OPTIONS,horizontal=True,
-            index=get_index(DEVICE_OPTIONS,state.device))
-        state.format = col2.radio(
-            i18n("inference.format"),
-            options=SUPPORTED_AUDIO,horizontal=True,
-            index=get_index(SUPPORTED_AUDIO,state.format))
+        col1, col2 = right.columns(2)
+        state.device = col1.radio(
+            i18n("inference.device"),
+            disabled=not config.has_gpu,
+            options=DEVICE_OPTIONS,horizontal=True,
+            index=get_index(DEVICE_OPTIONS,state.device))
+        state.format = col2.radio(
+            i18n("inference.format"),
+            options=SUPPORTED_AUDIO,horizontal=True,
+            index=get_index(SUPPORTED_AUDIO,state.format))
 
         st.subheader(i18n("inference.split_vocals"))
         with st.expander(i18n("inference.split_vocals.expander"),expanded=not (state.input_audio_name and len(state.uvr5_params.uvr_models))):
@@ -210,14 +203,14 @@
                 if uploaded_vocals is not None:
                     input_audio = bytes_to_audio(
                         uploaded_vocals.getvalue())
-                    state.input_vocals = remix_audio(input_audio,norm=True,to_int16=True,to_mono=True)
+                    state.input_vocals = remix_audio(input_audio,norm=True,to_int16=True)
                     state.input_audio_name = uploaded_vocals.name
                     del uploaded_vocals
 
                 uploaded_instrumentals = col2.file_uploader("Upload your own instrumental file (if you didn't use voice extraction)",type=SUPPORTED_AUDIO)
                 if uploaded_instrumentals is not None:
                     input_audio = bytes_to_audio(
                         uploaded_instrumentals.getvalue())
-                    state.input_instrumental = remix_audio(input_audio,norm=True,to_int16=True,to_mono=True)
+                    state.input_instrumental = remix_audio(input_audio,norm=True,to_int16=True)
                     state.input_audio_name = uploaded_instrumentals.name
                     del uploaded_instrumentals
 
@@ -259,5 +252,10 @@
         if state.output_audio is not None:
             col2.write("Converted Song")
             col2.audio(state.output_audio[0],sample_rate=state.output_audio[1])
-            if col2.button(i18n("inference.download.button")):
-                st.toast(download_song(state.output_audio,state.output_audio_name,ext="flac"))
\ No newline at end of file
+            c1, c2 = col2.columns(2)
+            state.channels = c1.radio(
+                i18n("inference.channels"),
+                options=OUTPUT_CHANNELS,horizontal=True,
+                index=get_index(OUTPUT_CHANNELS,state.channels))
+            if c2.button(i18n("inference.download.button")):
+                st.toast(download_song(state.output_audio,state.output_audio_name,ext=state.format,to_stereo=state.channels=="stereo"))
\ No newline at end of file
diff --git a/tts_cli.py b/tts_cli.py
index 1136e47..2b897a9 100644
--- a/tts_cli.py
+++ b/tts_cli.py
@@ -81,7 +81,7 @@ def __tacotron2__(text, device="cpu"):
     speech = (waveforms.cpu().numpy().squeeze() * MAX_INT16).astype(np.int16)
 
     # return as numpy array
-    return remix_audio((speech, 22050),target_sr=16000,to_mono=True,norm=True)
+    return remix_audio((speech, 22050),target_sr=16000)
 
 def __edge__(text, speaker="en-US-JennyNeural"):
     import edge_tts
diff --git a/uvr5_cli.py b/uvr5_cli.py
index 381f618..b0a5341 100644
--- a/uvr5_cli.py
+++ b/uvr5_cli.py
@@ -125,11 +125,12 @@ def split_audio(uvr_models,audio_path,preprocess_models=[],postprocess_models=[]
     for model_path in uvr_models:
         args = (model_path,audio_path,agg,device,use_cache,cache_dir,num_threads,format)
+        print(f"processing... {args=}")
         vocals, instrumental, _ = __run_inference_worker(args)
         wav_vocals.append(vocals[0])
         wav_instrument.append(instrumental[0])
 
-    wav_instrument = merge_func(pad_audio(*wav_instrument),axis=0)
-    wav_vocals = merge_func(pad_audio(*wav_vocals),axis=0)
+    wav_instrument = np.nanmedian(pad_audio(*wav_instrument,axis=0),axis=0)
+    wav_vocals = merge_func(pad_audio(*wav_vocals,axis=0),axis=0)
 
     # postprocess vocals to reduce reverb
     if len(postprocess_models):
@@ -148,8 +149,8 @@ def split_audio(uvr_models,audio_path,preprocess_models=[],postprocess_models=[]
             wav_vocals, _ = processed_audio
             vocals_file = intermediary_file
 
-    instrumental = remix_audio((wav_instrument,instrumental[-1]),norm=True,to_int16=True,to_mono=True)
-    vocals = remix_audio((wav_vocals,vocals[-1]),norm=True,to_int16=True,to_mono=True)
+    instrumental = remix_audio((wav_instrument,instrumental[-1]),to_int16=True)
+    vocals = remix_audio((wav_vocals,vocals[-1]),to_int16=True)
 
     return vocals, instrumental, input_audio
 
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
index 0833f6e..3c823b0 100644
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -10,7 +10,7 @@
 
 from pitch_extraction import FeatureExtractor
 
-from lib.audio import load_input_audio, remix_audio
+from lib.audio import MAX_INT16, load_input_audio, remix_audio
 from lib import config, BASE_MODELS_DIR
 from lib.utils import gc_collect, get_filenames
 
@@ -194,9 +194,8 @@ def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, merge_
         if resample_sr >= 16000 and tgt_sr != resample_sr:
             audio_opt = librosa.resample(audio_opt, orig_sr=tgt_sr, target_sr=resample_sr)
 
-        max_int16 = 32768
-        audio_max = max(np.abs(audio_opt).max() / 0.99, 1)
-        audio_opt = (audio_opt * max_int16 / audio_max).astype(np.int16)
+        audio_max = np.abs(audio_opt).max() / 0.99
+        audio_opt = (audio_opt * MAX_INT16 / audio_max).astype(np.int16)
 
         gc_collect()
 
@@ -301,7 +300,7 @@ def vc_single(
     try:
         audio = input_audio[0] if input_audio is not None else load_input_audio(input_audio_path, 16000)
-        audio,_ = remix_audio((audio,input_audio[1] if input_audio is not None else 16000), target_sr=16000, norm=True, to_mono=True)
+        audio,_ = remix_audio((audio,input_audio[1] if input_audio is not None else 16000), target_sr=16000)
 
         times = [0, 0, 0]
 
         if_f0 = cpt.get("f0", 1)
diff --git a/webui/components.py b/webui/components.py
index cb0ede7..4beb54c 100644
--- a/webui/components.py
+++ b/webui/components.py
@@ -147,8 +147,8 @@ def voice_conversion_form(state, use_hybrid=True):
         col1, col2 = st.columns(2)
         state.merge_type = col1.radio(
             i18n("inference.merge_type"),
-            options=["median","mean"],horizontal=True,
-            index=get_index(["median","mean"],state.merge_type))
+            options=MERGE_OPTIONS,horizontal=True,
+            index=get_index(MERGE_OPTIONS,state.merge_type))
         state.f0_autotune = col2.checkbox(i18n("inference.f0_autotune"),value=state.f0_autotune)
         state.resample_sr = st.select_slider(i18n("inference.resample_sr"),
             options=[0,16000,24000,22050,40000,44100,48000],