Commit 9a954f5

added output channel option

SayanoAI committed Mar 5, 2024
1 parent 4f35cd1 commit 9a954f5
Showing 10 changed files with 67 additions and 65 deletions.
conda-start.bat (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ for /f %%i in ('cd') do set ENV_NAME=%%~nxi
 SET INSTALL_DIR=%userprofile%\Miniconda3
 SET PATH=%INSTALL_DIR%\condabin;%PATH%
 
-conda info --envs | findstr /i %ENV_NAME%
+CALL conda info --envs | findstr /i %ENV_NAME%
 if %errorlevel% == 0 (
 echo %ENV_NAME% environment is already available
 ) else (
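Note: the added CALL matters because conda on Windows resolves to a batch script (condabin\conda.bat), and invoking one batch script from another without CALL transfers control without returning, so the %errorlevel% check below would otherwise never run.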
i18n/en_US.json (1 addition, 0 deletions)

@@ -14,6 +14,7 @@
     "inference.postprocess_model": "Choose one or more postprocessing model to run in sequence (optional)",
     "inference.device": "Device",
     "inference.format": "Audio Format",
+    "inference.channels": "Output Channels",
     "inference.merge_type": "Merge Type",
     "inference.agg": "Aggressiveness in isolating vocals",
     "inference.use_cache": "Reduce future processing time by saving results to disk",
lib/audio.py (25 additions, 22 deletions)

@@ -10,6 +10,7 @@
 
 MAX_INT16 = 32768
 SUPPORTED_AUDIO = ["mp3","flac","wav"] # ogg breaks soundfile
+OUTPUT_CHANNELS = ["mono", "stereo"]
 AUTOTUNE_NOTES = np.array([
     65.41, 69.30, 73.42, 77.78, 82.41, 87.31,
     92.50, 98.00, 103.83, 110.00, 116.54, 123.47,
@@ -25,7 +26,7 @@
     2959.96, 3135.96, 3322.44, 3520.00, 3729.31, 3951.07
 ])
 
-def load_audio(file, sr=None):
+def load_audio(file, sr, **kwargs):
     try:
         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
@@ -41,47 +42,49 @@ def load_audio(file, sr=None):
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
 
-    return np.frombuffer(out, np.float32).flatten()
+    return np.frombuffer(out, np.float32).flatten(), sr
 
-def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=False,to_mono=False,axis=0,**kwargs):
+def remix_audio(input_audio,target_sr=None,norm=False,to_int16=False,resample=False,axis=0,**kwargs):
     audio = np.array(input_audio[0],dtype="float32")
     if target_sr is None: target_sr=input_audio[1]
 
     print(f"before remix: shape={audio.shape}, max={audio.max()}, min={audio.min()}, mean={audio.mean()} sr={input_audio[1]}")
     if resample or input_audio[1]!=target_sr:
         audio = librosa.core.resample(np.array(input_audio[0],dtype="float32"),orig_sr=input_audio[1],target_sr=target_sr,**kwargs)
 
-    if to_mono and audio.ndim>1: audio=np.nanmedian(audio,axis)
+    if audio.ndim>1: audio=np.nanmedian(audio,axis=axis)
+    if norm: audio = librosa.util.normalize(audio,axis=axis)
 
-    if norm: audio = librosa.util.normalize(audio)
-
-    audio_max = np.abs(audio).max() / 0.95
-    if audio_max > 1: audio /= audio_max
+    audio_max = np.abs(audio).max()/.99
+    if audio_max > 1: audio = audio / audio_max
 
     if to_int16: audio = np.clip(audio * MAX_INT16, a_min=-MAX_INT16+1, a_max=MAX_INT16-1).astype("int16")
     print(f"after remix: shape={audio.shape}, max={audio.max()}, min={audio.min()}, mean={audio.mean()}, sr={target_sr}")
 
     return audio, target_sr
 
 def load_input_audio(fname,sr=None,**kwargs):
     if sr is None: sr=44100
-    audio = load_audio(fname, sr)
+    sound = load_audio(fname, sr, **kwargs)
     # sound = librosa.load(fname,sr=sr,**kwargs)
-    sound = audio, sr
     print(f"loading sound {fname} {sound[0].shape} {sound[1]} {sound[0].dtype}")
     return sound
 
-def save_input_audio(fname,input_audio,sr=None,to_int16=False):
+def save_input_audio(fname,input_audio,sr=None,to_int16=False,to_stereo=False):
     print(f"saving sound to {fname}")
     os.makedirs(os.path.dirname(fname),exist_ok=True)
-    audio=np.array(input_audio[0],dtype="int16" if np.abs(input_audio[0]).max()>100 else "float32")
+    audio=np.array(input_audio[0],dtype="float32")
 
     if to_int16:
-        max_a = np.abs(audio).max() * .99
-        if max_a<1:
-            audio=(audio*max_a*MAX_INT16)
-            audio=audio.astype("int16")
+        audio_max = np.abs(audio).max()/.99
+        if audio_max > 1: audio = audio / audio_max
+        audio = np.clip(audio * MAX_INT16, a_min=-MAX_INT16+1, a_max=MAX_INT16-1)
 
+    if to_stereo and audio.ndim<2: audio=np.stack([audio,audio],axis=-1)
+    print(f"{audio.shape=}")
 
     try:
-        sf.write(fname, audio, sr if sr else input_audio[1])
+        sf.write(fname, audio.astype("int16" if np.abs(audio).max()>1 else "float32"), sr if sr else input_audio[1])
         return f"File saved to ${fname}"
     except Exception as e:
         return f"failed to save audio: {e}"
@@ -156,12 +159,12 @@ def pad_audio(*audios,axis=0):
 
 def merge_audio(audio1,audio2,sr=40000):
     print(f"merging audio audio1={audio1[0].shape,audio1[1]} audio2={audio2[0].shape,audio2[1]} sr={sr}")
-    m1,_=remix_audio(audio1,target_sr=sr)
-    m2,_=remix_audio(audio2,target_sr=sr)
+    m1,_=remix_audio(audio1,target_sr=sr,axis=0)
+    m2,_=remix_audio(audio2,target_sr=sr,axis=0)
 
-    mixed = pad_audio(m1,m2)
+    mixed = pad_audio(m1,m2,axis=0)
 
-    return remix_audio((mixed,sr),to_int16=True,norm=True,to_mono=True,axis=0)
+    return remix_audio((mixed,sr),to_int16=True,axis=0,norm=True)
 
 def autotune_f0(f0, threshold=0.):
     # autotuned_f0 = []
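Taken together, these changes fold channel handling into the helpers themselves: remix_audio always collapses multi-channel input with a NaN-aware median, normalization and peak scaling happen along the chosen axis, and save_input_audio can duplicate a mono track into two channels for stereo export. A minimal sketch of that contract (demo_remix and the sample values are illustrative, not part of the repo):

    import numpy as np

    MAX_INT16 = 32768

    def demo_remix(audio):
        # collapse channels with a NaN-safe median, as remix_audio now does
        if audio.ndim > 1:
            audio = np.nanmedian(audio, axis=0)
        # scale peaks down, leaving ~1% headroom (the commit's /.99 factor)
        audio_max = np.abs(audio).max() / 0.99
        if audio_max > 1:
            audio = audio / audio_max
        # clip before casting so extreme samples cannot wrap around
        return np.clip(audio * MAX_INT16, -MAX_INT16 + 1, MAX_INT16 - 1).astype(np.int16)

    mono = demo_remix(np.random.randn(2, 48000).astype(np.float32))
    stereo = np.stack([mono, mono], axis=-1)  # what save_input_audio(..., to_stereo=True) builds
    print(mono.shape, stereo.shape)           # (48000,) (48000, 2)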
lib/karafan/inference.py (2 additions, 2 deletions)

@@ -493,7 +493,7 @@ def SEPARATE(self, audio_file, BATCH_MODE=False):
 
         self.Save_Audio(6, vocal_final)
 
-        output_vocals = remix_audio((vocal_final, self.sample_rate),norm=True,to_int16=True,to_mono=True)
+        output_vocals = remix_audio((vocal_final, self.sample_rate),to_int16=True)
 
         print("► Save Music FINAL !")
 
@@ -505,7 +505,7 @@ def SEPARATE(self, audio_file, BATCH_MODE=False):
 
         self.Save_Audio(7, music_final)
 
-        output_music = remix_audio((music_final, self.sample_rate),norm=True,to_int16=True,to_mono=True)
+        output_music = remix_audio((music_final, self.sample_rate),to_int16=True)
 
         print('<b>--> Processing DONE !</b>')
 
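This call-site simplification repeats below in lib/separators.py, tts_cli.py, uvr5_cli.py, and vc_infer_pipeline.py: remix_audio now down-mixes multi-channel input unconditionally, so the to_mono flag no longer exists, and these callers also stop requesting per-call normalization; the peak scaling inside remix_audio still guards against clipping.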
lib/separators.py (2 additions, 2 deletions)

@@ -152,7 +152,7 @@ def process_vocals(self,v_spec_m,input_high_end,input_high_end_h,return_dict={})
         else:
             wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
         print(f"vocals done: {wav_vocals.shape}")
-        return_dict["vocals"] = remix_audio((wav_vocals,return_dict["sr"]),norm=True,to_int16=True,to_mono=True,axis=0)
+        return_dict["vocals"] = remix_audio((wav_vocals,return_dict["sr"]),to_int16=True,axis=0)
         return return_dict["vocals"]
 
     def process_instrumental(self,y_spec_m,input_high_end,input_high_end_h,return_dict={}):
@@ -166,7 +166,7 @@ def process_instrumental(self,y_spec_m,input_high_end,input_high_end_h,return_di
         else:
             wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
         print(f"instruments done: {wav_instrument.shape}")
-        return_dict["instrumentals"] = remix_audio((wav_instrument,return_dict["sr"]),norm=True,to_int16=True,to_mono=True,axis=0)
+        return_dict["instrumentals"] = remix_audio((wav_instrument,return_dict["sr"]),to_int16=True,axis=0)
         return return_dict["instrumentals"]
 
     def process_audio(self,y_spec_m,v_spec_m,input_high_end,input_high_end_h):
pages/1_Inference.py (24 additions, 26 deletions)

@@ -9,7 +9,7 @@
 from webui.components import file_uploader_form, initial_vocal_separation_params, initial_voice_conversion_params, save_vocal_separation_params, save_voice_conversion_params, vocal_separation_form, voice_conversion_form
 from lib.utils import ObjectNamespace
 from webui.contexts import SessionStateContext
-from lib.audio import SUPPORTED_AUDIO, bytes_to_audio, merge_audio, remix_audio, save_input_audio
+from lib.audio import OUTPUT_CHANNELS, SUPPORTED_AUDIO, bytes_to_audio, merge_audio, remix_audio, save_input_audio
 
 from lib.utils import gc_collect, get_filenames, get_index, get_optimal_torch_device
 
@@ -37,6 +37,7 @@ def init_inference_state():
         rvc_models=None,
         device=get_optimal_torch_device(),
         format="flac",
+        channels="mono",
         models=get_rvc_models(),
         model_name=None,
 
@@ -95,13 +96,13 @@ def one_click_convert(state):
     state.output_audio = mixed_audio
     return state
 
-def download_song(output_audio,output_audio_name,ext="mp3"):
+def download_song(output_audio,output_audio_name,ext="mp3",to_stereo=False):
     audio_path = output_audio_name.split(".")
 
     output_dir = os.path.join(OUTPUT_DIR,"inference",audio_path[0])
     os.makedirs(output_dir,exist_ok=True)
-    output_file = os.path.join(output_dir,f"{audio_path[1]}.{ext}")
-    if save_input_audio(output_file,output_audio,to_int16=True):
+    output_file = os.path.join(output_dir,".".join([audio_path[1],"stereo" if to_stereo else "mono",ext]))
+    if save_input_audio(output_file,output_audio,to_int16=True,to_stereo=to_stereo):
         return f"successfully saved to {output_file}"
     else: "failed to save"
 
@@ -154,25 +155,17 @@ def render_voice_conversion_form(state):
         index=get_index(state.models,state.model_name),
         format_func=lambda option: os.path.basename(option).split(".")[0]
     )
-    # col1, col2 = right.columns(2)
-    # if col1.button(i18n("inference.load_model.button"),use_container_width=True, type="primary"):
-    #     del state.rvc_models
-    #     state.rvc_models = load_model(state)
-    #     gc_collect()
-    # if col2.button(i18n("inference.clear_data.button"),use_container_width=True):
-    #     state = clear_data(state)
-    #     st.experimental_rerun()
 
-    col1, col2 = st.columns(2)
-    state.device = col1.radio(
-        i18n("inference.device"),
-        disabled=not config.has_gpu,
-        options=DEVICE_OPTIONS,horizontal=True,
-        index=get_index(DEVICE_OPTIONS,state.device))
-    state.format = col2.radio(
-        i18n("inference.format"),
-        options=SUPPORTED_AUDIO,horizontal=True,
-        index=get_index(SUPPORTED_AUDIO,state.format))
+    col1, col2 = right.columns(2)
+    state.device = col1.radio(
+        i18n("inference.device"),
+        disabled=not config.has_gpu,
+        options=DEVICE_OPTIONS,horizontal=True,
+        index=get_index(DEVICE_OPTIONS,state.device))
+    state.format = col2.radio(
+        i18n("inference.format"),
+        options=SUPPORTED_AUDIO,horizontal=True,
+        index=get_index(SUPPORTED_AUDIO,state.format))
 
     st.subheader(i18n("inference.split_vocals"))
     with st.expander(i18n("inference.split_vocals.expander"),expanded=not (state.input_audio_name and len(state.uvr5_params.uvr_models))):
@@ -210,14 +203,14 @@ def render_voice_conversion_form(state):
         if uploaded_vocals is not None:
             input_audio = bytes_to_audio(
                 uploaded_vocals.getvalue())
-            state.input_vocals = remix_audio(input_audio,norm=True,to_int16=True,to_mono=True)
+            state.input_vocals = remix_audio(input_audio,norm=True,to_int16=True)
             state.input_audio_name = uploaded_vocals.name
             del uploaded_vocals
         uploaded_instrumentals = col2.file_uploader("Upload your own instrumental file (if you didn't use voice extraction)",type=SUPPORTED_AUDIO)
         if uploaded_instrumentals is not None:
             input_audio = bytes_to_audio(
                 uploaded_instrumentals.getvalue())
-            state.input_instrumental = remix_audio(input_audio,norm=True,to_int16=True,to_mono=True)
+            state.input_instrumental = remix_audio(input_audio,norm=True,to_int16=True)
             state.input_audio_name = uploaded_instrumentals.name
             del uploaded_instrumentals
 
@@ -259,5 +252,10 @@ def render_voice_conversion_form(state):
     if state.output_audio is not None:
         col2.write("Converted Song")
         col2.audio(state.output_audio[0],sample_rate=state.output_audio[1])
-        if col2.button(i18n("inference.download.button")):
-            st.toast(download_song(state.output_audio,state.output_audio_name,ext="flac"))
+        c1, c2 = col2.columns(2)
+        state.channels = c1.radio(
+            i18n("inference.channels"),
+            options=OUTPUT_CHANNELS,horizontal=True,
+            index=get_index(OUTPUT_CHANNELS,state.channels))
+        if c2.button(i18n("inference.download.button")):
+            st.toast(download_song(state.output_audio,state.output_audio_name,ext=state.format,to_stereo=state.channels=="stereo"))
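The download path now reflects both the new channel choice and the UI's format selection (the extension was previously hard-coded to "flac"). With illustrative values, the naming logic in download_song produces:

    # illustrative values only; mirrors the ".".join([...]) call in download_song
    audio_path = "MySong.converted".split(".")   # ["MySong", "converted"]
    for to_stereo, ext in [(False, "flac"), (True, "mp3")]:
        print(".".join([audio_path[1], "stereo" if to_stereo else "mono", ext]))
    # converted.mono.flac
    # converted.stereo.mp3

One caveat visible in the hunk above: download_song's else branch builds the string "failed to save" without returning it, so a failed save still yields None.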
tts_cli.py (1 addition, 1 deletion)

@@ -81,7 +81,7 @@ def __tacotron2__(text, device="cpu"):
     speech = (waveforms.cpu().numpy().squeeze() * MAX_INT16).astype(np.int16)
 
     # return as numpy array
-    return remix_audio((speech, 22050),target_sr=16000,to_mono=True,norm=True)
+    return remix_audio((speech, 22050),target_sr=16000)
 
 def __edge__(text, speaker="en-US-JennyNeural"):
     import edge_tts
uvr5_cli.py (5 additions, 4 deletions)

@@ -125,11 +125,12 @@ def split_audio(uvr_models,audio_path,preprocess_models=[],postprocess_models=[]
 
     for model_path in uvr_models:
         args = (model_path,audio_path,agg,device,use_cache,cache_dir,num_threads,format)
+        print(f"processing... {args=}")
         vocals, instrumental, _ = __run_inference_worker(args)
         wav_vocals.append(vocals[0])
         wav_instrument.append(instrumental[0])
-    wav_instrument = merge_func(pad_audio(*wav_instrument),axis=0)
-    wav_vocals = merge_func(pad_audio(*wav_vocals),axis=0)
+    wav_instrument = np.nanmedian(pad_audio(*wav_instrument,axis=0),axis=0)
+    wav_vocals = merge_func(pad_audio(*wav_vocals,axis=0),axis=0)
 
     # postprocess vocals to reduce reverb
     if len(postprocess_models):
@@ -148,8 +149,8 @@ def split_audio(uvr_models,audio_path,preprocess_models=[],postprocess_models=[]
         wav_vocals, _ = processed_audio
         vocals_file = intermediary_file
 
-    instrumental = remix_audio((wav_instrument,instrumental[-1]),norm=True,to_int16=True,to_mono=True)
-    vocals = remix_audio((wav_vocals,vocals[-1]),norm=True,to_int16=True,to_mono=True)
+    instrumental = remix_audio((wav_instrument,instrumental[-1]),to_int16=True)
+    vocals = remix_audio((wav_vocals,vocals[-1]),to_int16=True)
 
     return vocals, instrumental, input_audio
 
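Tracks from different models are padded to a common length before the sample-wise merge; note that the instrumental branch is now reduced with np.nanmedian directly, while the vocal branch still honors merge_func. The sketch below shows why a NaN-aware reduction pairs naturally with padding (pad_audio_demo is a stand-in, and NaN padding is an assumption consistent with the nanmedian calls):

    import numpy as np

    def pad_audio_demo(*audios, axis=0):
        # stand-in for lib.audio.pad_audio: pad to the longest track with NaN
        # so shorter tracks do not drag the merged result toward zero
        maxlen = max(len(a) for a in audios)
        return np.stack([np.pad(a.astype(np.float32), (0, maxlen - len(a)),
                                constant_values=np.nan) for a in audios], axis=axis)

    stacked = pad_audio_demo(np.ones(4), np.full(6, 3.0))
    print(np.nanmedian(stacked, axis=0))  # [2. 2. 2. 2. 3. 3.]; NaN padding is ignored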
vc_infer_pipeline.py (4 additions, 5 deletions)

@@ -10,7 +10,7 @@
 
 from pitch_extraction import FeatureExtractor
 
-from lib.audio import load_input_audio, remix_audio
+from lib.audio import MAX_INT16, load_input_audio, remix_audio
 from lib import config, BASE_MODELS_DIR
 
 from lib.utils import gc_collect, get_filenames
@@ -194,9 +194,8 @@ def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, merge_
         if resample_sr >= 16000 and tgt_sr != resample_sr:
             audio_opt = librosa.resample(audio_opt, orig_sr=tgt_sr, target_sr=resample_sr)
 
-        max_int16 = 32768
-        audio_max = max(np.abs(audio_opt).max() / 0.99, 1)
-        audio_opt = (audio_opt * max_int16 / audio_max).astype(np.int16)
+        audio_max = np.abs(audio_opt).max() / 0.99
+        audio_opt = (audio_opt * MAX_INT16 / audio_max).astype(np.int16)
 
         gc_collect()
 
@@ -301,7 +300,7 @@ def vc_single(
     try:
         audio = input_audio[0] if input_audio is not None else load_input_audio(input_audio_path, 16000)
 
-        audio,_ = remix_audio((audio,input_audio[1] if input_audio is not None else 16000), target_sr=16000, norm=True, to_mono=True)
+        audio,_ = remix_audio((audio,input_audio[1] if input_audio is not None else 16000), target_sr=16000)
 
         times = [0, 0, 0]
         if_f0 = cpt.get("f0", 1)
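Besides reusing the shared MAX_INT16 constant, the rewrite drops the max(..., 1) clamp: audio quieter than full scale was previously left at its original level, while the new expression scales it up toward full scale. A worked example with illustrative numbers:

    import numpy as np
    MAX_INT16 = 32768

    audio_opt = np.array([0.5, -0.25], dtype=np.float32)  # peak well below 1.0
    audio_max = np.abs(audio_opt).max() / 0.99            # ~0.505; no clamp to 1
    print((audio_opt * MAX_INT16 / audio_max).astype(np.int16))  # ~[ 32440 -16220]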
webui/components.py (2 additions, 2 deletions)

@@ -147,8 +147,8 @@ def voice_conversion_form(state, use_hybrid=True):
     col1, col2 = st.columns(2)
     state.merge_type = col1.radio(
         i18n("inference.merge_type"),
-        options=["median","mean"],horizontal=True,
-        index=get_index(["median","mean"],state.merge_type))
+        options=MERGE_OPTIONS,horizontal=True,
+        index=get_index(MERGE_OPTIONS,state.merge_type))
     state.f0_autotune = col2.checkbox(i18n("inference.f0_autotune"),value=state.f0_autotune)
     state.resample_sr = st.select_slider(i18n("inference.resample_sr"),
         options=[0,16000,24000,22050,40000,44100,48000],
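MERGE_OPTIONS replaces the inline ["median","mean"] literals here; its definition or import is presumably elsewhere in webui/components.py, outside the lines shown in this hunk.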
