{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/mocci24/RVCGI-V2/blob/
main/RVCV2_Tools_Mocci.ipynb\" target=\"_parent\"><img
src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In
Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F7cWqy7KeHyv"
},
"source": [
"RVC TRAINING V2 ALL IN ONE\n",
"\n",
"DON'T RUN ALL THE RUNTIME!!\n",
"\n",
"CHOSE WHAT YOU WANT\n",
"\n",
"TRAIN, SPLITTING, DOWNLOADING AUDIO, SEPARATE VOCAL, OR INFERENCE <br />\
n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FmlBOuMKSvDV"
},
"source": [
"# 1. Preparation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OA3CyHOtzm4d",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Test Runtime\n",
"!nvidia-smi\n",
"!nvcc -V\n",
"!free -h"
]
},
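{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a1TorchCheck"
},
"outputs": [],
"source": [
"#@title (Optional) Check GPU from Python\n",
"#@markdown A minimal sketch complementing the shell checks above. It assumes the\n",
"#@markdown torch package preinstalled on Colab runtimes and simply confirms that a\n",
"#@markdown CUDA device is visible before you start training.\n",
"import torch\n",
"\n",
"if torch.cuda.is_available():\n",
"    print(f\"CUDA device found: {torch.cuda.get_device_name(0)}\")\n",
"else:\n",
"    print(\"No CUDA device found. Switch the runtime type to GPU before training.\")"
]
},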
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jwu07JgqoFON"
},
"outputs": [],
"source": [
"#@title Mount Drive\n",
"\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EGs9neZQeT8F"
},
"source": [
"# 2. SELECT MODE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KTUIsJldFm5m"
},
"outputs": [],
"source": [
"#@title Input Form\n",
"\n",
"Mode = \"training\" #@param
[\"Separate\", \"Splitting\", \"training\", \"inference\"]\n",
"dataset = \"Youtube\" #@param [\"Youtube\", \"Drive\"]\n",
"url = \"\" #@param {type:\"string\"}\n",
"drive_path = \"\" #@param {type:\"string\"}\n",
"AUDIO_NAME = \"\" #@param {type:\"string\"}\n",
"#@markdown fill modelname if training/resume training\n",
"MODELNAME = \"kazuha\" #@param {type:\"string\"}\n"
]
},
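{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a2ValidateForm"
},
"outputs": [],
"source": [
"#@title (Optional) Validate form inputs\n",
"#@markdown A small sanity-check sketch, not part of the original pipeline: it only\n",
"#@markdown re-reads the variables set in the form above and warns about combinations\n",
"#@markdown that would make later cells fail, e.g. dataset=Youtube with an empty url.\n",
"if dataset == \"Youtube\" and not url:\n",
"    print(\"Warning: dataset is Youtube but url is empty.\")\n",
"if dataset == \"Drive\" and not drive_path:\n",
"    print(\"Warning: dataset is Drive but drive_path is empty.\")\n",
"if Mode == \"training\" and not MODELNAME:\n",
"    print(\"Warning: Mode is training but MODELNAME is empty.\")\n",
"if not AUDIO_NAME:\n",
"    print(\"Warning: AUDIO_NAME is empty; the download and splitting cells use it in paths.\")\n",
"print(f\"Mode={Mode}, dataset={dataset}, AUDIO_NAME={AUDIO_NAME!r}, MODELNAME={MODELNAME!r}\")"
]
},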
{
"cell_type": "markdown",
"metadata": {
"id": "E22J0T-Uwt21"
},
"source": [
"# 3. This option for download wav audio from YOUTUBE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yjD5e1N_yZAy",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Install Library for Youtube WAV Download\n",
"if dataset == \"Drive\":\n",
" print(\"Dataset is set to Drive. Skipping this section\")\n",
"elif dataset == \"Youtube\":\n",
" !pip install yt_dlp\n",
" !pip install ffmpeg\n",
" !mkdir youtubeaudio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "I3RqGfboyeY0"
},
"outputs": [],
"source": [
"#@title Download Youtube WAV\n",
"from __future__ import unicode_literals\n",
"\n",
"if dataset == \"Drive\":\n",
" print(\"Dataset is set to Drive. Skipping this section\")\n",
"elif dataset == \"Youtube\":\n",
" import yt_dlp\n",
" import ffmpeg\n",
" import sys\n",
"\n",
"\n",
" ydl_opts = {\n",
" 'format': 'bestaudio/best',\n",
" # 'outtmpl': 'output.%(ext)s',\n",
" 'postprocessors': [{\n",
" 'key': 'FFmpegExtractAudio',\n",
" 'preferredcodec': 'wav',\n",
" }],\n",
" \"outtmpl\": f'youtubeaudio/{AUDIO_NAME}', # this is where you can
edit how you'd like the filenames to be formatted\n",
" }\n",
" def download_from_url(url):\n",
" ydl.download([url])\n",
" # stream = ffmpeg.input('output.m4a')\n",
" # stream = ffmpeg.output(stream, 'output.wav')\n",
"\n",
"\n",
" with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
"\n",
" download_from_url(url)"
]
},
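{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a3CheckDownload"
},
"outputs": [],
"source": [
"#@title (Optional) Check the downloaded WAV\n",
"#@markdown A minimal sketch, assuming the cell above saved the file as\n",
"#@markdown /content/youtubeaudio/{AUDIO_NAME}.wav (yt_dlp appends the .wav extension\n",
"#@markdown after the FFmpegExtractAudio post-processing step).\n",
"import os\n",
"\n",
"wav_path = f\"/content/youtubeaudio/{AUDIO_NAME}.wav\"\n",
"if os.path.exists(wav_path):\n",
"    size_mb = os.path.getsize(wav_path) / (1024 * 1024)\n",
"    print(f\"Found {wav_path} ({size_mb:.1f} MB)\")\n",
"else:\n",
"    print(f\"{wav_path} not found; check the download log above.\")"
]
},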
{
"cell_type": "markdown",
"metadata": {
"id": "Qt3gYUiVV925"
},
"source": [
"# 4. Separating vocal & instrument with demucs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OULwx5u44GbL"
},
"outputs": [],
"source": [
"#@title Install Demucs for Separating Audio\n",
"!python3 -m pip install -U demucs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qirHqUAr4IAO"
},
"outputs": [],
"source": [
"#@title Start Separate Vocal and Instrument using Demucs\n",
"#@markdown This program is standart separating Vocal and Instrument.\n",
"#@markdown Use webUI for more feature.\n",
"import subprocess\n",
"input_file = \"\" #@param {type:\"string\"}\n",
"OUTPUT_DIR = \"/content/separated\"\n",
"\n",
"command = f\"demucs --two-stems=vocals {input_file}\"\n",
"result = subprocess.run(command.split(), stdout=subprocess.PIPE)\n",
"print(result.stdout.decode())\n",
"#input_file = input(\"Masukkan file audio: \")"
]
},
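{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a4CheckDemucs"
},
"outputs": [],
"source": [
"#@title (Optional) List Demucs output\n",
"#@markdown A quick sketch, assuming Demucs' default htdemucs model: with\n",
"#@markdown --two-stems=vocals it writes vocals.wav and no_vocals.wav under\n",
"#@markdown /content/separated/htdemucs/<track name>/, which is the layout the\n",
"#@markdown splitting cell in section 5 expects.\n",
"import os\n",
"\n",
"base = \"/content/separated/htdemucs\"\n",
"if os.path.isdir(base):\n",
"    for track in sorted(os.listdir(base)):\n",
"        for stem in sorted(os.listdir(os.path.join(base, track))):\n",
"            print(os.path.join(base, track, stem))\n",
"else:\n",
"    print(f\"{base} not found; run the Demucs cell first.\")"
]
},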
{
"cell_type": "markdown",
"metadata": {
"id": "2Ij2dJJvSesJ"
},
"source": [
"# 5. Split audio to 10-sec segments for dataset sample before training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4z_C02unP5S2"
},
"outputs": [],
"source": [
"#@title Split The Audio into Smaller Duration Before Training\n",
"if Mode == \"Separate\":\n",
" print(\"Mode is set to Separate. Skipping this section\")\n",
"elif Mode == \"Splitting\":\n",
" !pip install numpy\n",
" !pip install librosa\n",
" !pip install soundfile\n",
" !mkdir -p dataset/{AUDIO_NAME}"
]
},
{
"cell_type": "code",
"source": [
"#@title\n",
"import numpy as np\n",
"import librosa\n",
"import soundfile\n",
"\n",
"\n",
"# This function is obtained from librosa.\n",
"def get_rms(\n",
" y,\n",
" *,\n",
" frame_length=2048,\n",
" hop_length=512,\n",
" pad_mode=\"constant\",\n",
"):\n",
" padding = (int(frame_length // 2), int(frame_length // 2))\n",
" y = np.pad(y, padding, mode=pad_mode)\n",
"\n",
" axis = -1\n",
" # put our new within-frame axis at the end for now\n",
" out_strides = y.strides + tuple([y.strides[axis]])\n",
" # Reduce the shape on the framing axis\n",
" x_shape_trimmed = list(y.shape)\n",
" x_shape_trimmed[axis] -= frame_length - 1\n",
" out_shape = tuple(x_shape_trimmed) + tuple([frame_length])\n",
" xw = np.lib.stride_tricks.as_strided(\n",
" y, shape=out_shape, strides=out_strides\n",
" )\n",
" if axis < 0:\n",
" target_axis = axis - 1\n",
" else:\n",
" target_axis = axis + 1\n",
" xw = np.moveaxis(xw, -1, target_axis)\n",
" # Downsample along the target axis\n",
" slices = [slice(None)] * xw.ndim\n",
" slices[axis] = slice(0, None, hop_length)\n",
" x = xw[tuple(slices)]\n",
"\n",
" # Calculate power\n",
" power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)\n",
"\n",
" return np.sqrt(power)\n",
"\n",
"\n",
"class Slicer:\n",
" def __init__(self,\n",
" sr: int,\n",
" threshold: float = -40.,\n",
" min_length: int = 5000,\n",
" min_interval: int = 300,\n",
" hop_size: int = 20,\n",
" max_sil_kept: int = 5000):\n",
" if not min_length >= min_interval >= hop_size:\n",
" raise ValueError('The following condition must be satisfied:
min_length >= min_interval >= hop_size')\n",
" if not max_sil_kept >= hop_size:\n",
" raise ValueError('The following condition must be satisfied:
max_sil_kept >= hop_size')\n",
" min_interval = sr * min_interval / 1000\n",
" self.threshold = 10 ** (threshold / 20.)\n",
" self.hop_size = round(sr * hop_size / 1000)\n",
" self.win_size = min(round(min_interval), 4 * self.hop_size)\n",
" self.min_length = round(sr * min_length / 1000 / self.hop_size)\
n",
" self.min_interval = round(min_interval / self.hop_size)\n",
" self.max_sil_kept = round(sr * max_sil_kept / 1000 /
self.hop_size)\n",
"\n",
" def _apply_slice(self, waveform, begin, end):\n",
" if len(waveform.shape) > 1:\n",
" return waveform[:, begin * self.hop_size:
min(waveform.shape[1], end * self.hop_size)]\n",
" else:\n",
" return waveform[begin * self.hop_size: min(waveform.shape[0],
end * self.hop_size)]\n",
"\n",
" def slice(self, waveform):\n",
" if len(waveform.shape) > 1:\n",
" samples = waveform.mean(axis=0)\n",
" else:\n",
" samples = waveform\n",
" if samples.shape[0] <= self.min_length:\n",
" return [waveform]\n",
" rms_list = get_rms(y=samples, frame_length=self.win_size,
hop_length=self.hop_size).squeeze(0)\n",
" sil_tags = []\n",
" silence_start = None\n",
" clip_start = 0\n",
" for i, rms in enumerate(rms_list):\n",
" # Keep looping while frame is silent.\n",
" if rms < self.threshold:\n",
" # Record start of silent frames.\n",
" if silence_start is None:\n",
" silence_start = i\n",
" continue\n",
" # Keep looping while frame is not silent and silence start has
not been recorded.\n",
" if silence_start is None:\n",
" continue\n",
" # Clear recorded silence start if interval is not enough or
clip is too short\n",
" is_leading_silence = silence_start == 0 and i >
self.max_sil_kept\n",
" need_slice_middle = i - silence_start >= self.min_interval and
i - clip_start >= self.min_length\n",
" if not is_leading_silence and not need_slice_middle:\n",
" silence_start = None\n",
" continue\n",
" # Need slicing. Record the range of silent frames to be
removed.\n",
" if i - silence_start <= self.max_sil_kept:\n",
" pos = rms_list[silence_start: i + 1].argmin() +
silence_start\n",
" if silence_start == 0:\n",
" sil_tags.append((0, pos))\n",
" else:\n",
" sil_tags.append((pos, pos))\n",
" clip_start = pos\n",
" elif i - silence_start <= self.max_sil_kept * 2:\n",
" pos = rms_list[i - self.max_sil_kept: silence_start +
self.max_sil_kept + 1].argmin()\n",
" pos += i - self.max_sil_kept\n",
" pos_l = rms_list[silence_start: silence_start +
self.max_sil_kept + 1].argmin() + silence_start\n",
" pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() +
i - self.max_sil_kept\n",
" if silence_start == 0:\n",
" sil_tags.append((0, pos_r))\n",
" clip_start = pos_r\n",
" else:\n",
" sil_tags.append((min(pos_l, pos), max(pos_r, pos)))\
n",
" clip_start = max(pos_r, pos)\n",
" else:\n",
" pos_l = rms_list[silence_start: silence_start +
self.max_sil_kept + 1].argmin() + silence_start\n",
" pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() +
i - self.max_sil_kept\n",
" if silence_start == 0:\n",
" sil_tags.append((0, pos_r))\n",
" else:\n",
" sil_tags.append((pos_l, pos_r))\n",
" clip_start = pos_r\n",
" silence_start = None\n",
" # Deal with trailing silence.\n",
" total_frames = rms_list.shape[0]\n",
" if silence_start is not None and total_frames - silence_start >=
self.min_interval:\n",
" silence_end = min(total_frames, silence_start +
self.max_sil_kept)\n",
" pos = rms_list[silence_start: silence_end + 1].argmin() +
silence_start\n",
" sil_tags.append((pos, total_frames + 1))\n",
" # Apply and return slices.\n",
" if len(sil_tags) == 0:\n",
" return [waveform]\n",
" else:\n",
" chunks = []\n",
" if sil_tags[0][0] > 0:\n",
" chunks.append(self._apply_slice(waveform, 0, sil_tags[0]
[0]))\n",
" for i in range(len(sil_tags) - 1):\n",
" chunks.append(self._apply_slice(waveform, sil_tags[i][1],
sil_tags[i + 1][0]))\n",
" if sil_tags[-1][1] < total_frames:\n",
" chunks.append(self._apply_slice(waveform, sil_tags[-1][1],
total_frames))\n",
" return chunks\n",
"\n",
"if Mode == \"Separate\":\n",
" print(\"Mode is set to Separate. Skipping this section\")\n",
"\n",
"elif Mode == \"Splitting\":\n",
" audio, sr =
librosa.load(f'/content/separated/htdemucs/{AUDIO_NAME}/vocals.wav', sr=None,
mono=False) # Load an audio file with librosa.\n",
" slicer = Slicer(\n",
" sr=sr,\n",
" threshold=-40,\n",
" min_length=5000,\n",
" min_interval=200,\n",
" hop_size=10,\n",
" max_sil_kept=500\n",
" )\n",
" chunks = slicer.slice(audio)\n",
" for i, chunk in enumerate(chunks):\n",
" if len(chunk.shape) > 1:\n",
" chunk = chunk.T # Swap axes if the audio is stereo.\n",
" soundfile.write(f'/content/dataset/{AUDIO_NAME}/split_{i}.wav',
chunk, sr) # Save sliced audio files with soundfile."
],
"metadata": {
"id": "zp4gL2L3BfRD"
},
"execution_count": null,
"outputs": []
},
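{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a5CheckChunks"
},
"outputs": [],
"source": [
"#@title (Optional) Inspect the split dataset\n",
"#@markdown A small sketch that reports how many chunks the Slicer produced and their\n",
"#@markdown total duration, so you can confirm the dataset looks reasonable before\n",
"#@markdown copying it to Drive. It assumes the output path used in the cell above.\n",
"import os\n",
"import soundfile\n",
"\n",
"split_dir = f\"/content/dataset/{AUDIO_NAME}\"\n",
"if os.path.isdir(split_dir):\n",
"    files = sorted(f for f in os.listdir(split_dir) if f.endswith(\".wav\"))\n",
"    total = 0.0\n",
"    for f in files:\n",
"        info = soundfile.info(os.path.join(split_dir, f))\n",
"        total += info.duration\n",
"    print(f\"{len(files)} chunks, {total:.1f} s of audio in {split_dir}\")\n",
"else:\n",
"    print(f\"{split_dir} not found; run the splitting cell first.\")"
]
},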
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U5c8uqCrUE3B"
},
"outputs": [],
"source": [
"#@title Save splitting result to drive\n",
"if Mode == \"Separate\":\n",
" print(\"Mode is set to Separate. Skipping this section\")\n",
"elif Mode == \"Splitting\":\n",
" !mkdir -p /content/drive/MyDrive/dataset/{AUDIO_NAME}\n",
" !cp -r /content/dataset/{AUDIO_NAME}/*
/content/drive/MyDrive/dataset/{AUDIO_NAME}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "t1myplxnxHz5"
},
"source": [
"# 6. Load our models characters from Hugingface (optional)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "nHmZGugomyAo"
},
"outputs": [],
"source": [
"#@title Load models from Hugingface\n",
"#@markdown Click [here](https://huggingface.co/mocci24/RVCV2-GI) for more
info\n",
"!apt install git-lfs\n",
"!git lfs install\n",
"!git clone https://huggingface.co/mocci24/RVCV2-GI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "ge_97mfpgqTm"
},
"outputs": [],
"source": [
"#@title Cloning Github | Hugingface\n",
"#@markdown Run this if you want to use our models from HugingFace.\n",
"#@markdown Note: Don't run both \"Clone Github\" !<br />\n",
"#@markdown Our Models info\n",
"#@markdown [Here](https://huggingface.co/mocci24/RVCV2-GI)\n",
"\n",
"\n",
"!git clone https://github.com/RVC-Project/Retrieval-based-Voice-
Conversion-WebUI.git\n",
"!cp -r /content/RVCV2-GI/RVC/weights/* /content/Retrieval-based-Voice-
Conversion-WebUI/weights\n",
"!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
"!cp -r /content/RVCV2-GI/RVC/* /content/Retrieval-based-Voice-Conversion-
WebUI/logs\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
"!mkdir -p pretrained uvr5_weights"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "31Y6S8tUhME3"
},
"source": [
"# 7. Load your models from Drive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LFfY69g0hP-j"
},
"outputs": [],
"source": [
"#@title Cloning Github | Drive\n",
"#@markdown Run this if you want to use models from your Google Drive.\n",
"#@markdown Note: Don't run both \"Clone Github\" !\n",
"\n",
"!git clone https://github.com/RVC-Project/Retrieval-based-Voice-
Conversion-WebUI.git\n",
"!cp -r /content/drive/MyDrive/RVC/weights/* /content/Retrieval-based-
Voice-Conversion-WebUI/weights\n",
"!mkdir -p
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"!cp -r /content/drive/MyDrive/RVC/{MODELNAME}/* /content/Retrieval-based-
Voice-Conversion-WebUI/logs/{MODELNAME}\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
"!mkdir -p pretrained uvr5_weights"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sGuCk9_KUkYI"
},
"source": [
"# 8. Download the necessary requirements for training, inference, and
others."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lxJmBrEF8xGB"
},
"outputs": [],
"source": [
"!pip install gradio==3.34.0\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "pqE0PrnuRqI2"
},
"outputs": [],
"source": [
"#@title Install aria2\n",
"!apt -y install -qq aria2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "UG3XpUwEomUz"
},
"outputs": [],
"source": [
"#@title Download requirements for training\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth
-d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0D32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0D48k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0G32k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/
f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o
f0G48k.pth\n",
"\n",
"#RVC V2\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/
D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o
D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/
G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o
G40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/
f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o
f0D40k.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/
f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o
f0G40k.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "HugjmZqZRuiF"
},
"outputs": [],
"source": [
"#@title Download requirements for Audio Splitting\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人
声 vocals+非人声 instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-
WebUI/uvr5_weights -o HP2-人声 vocals+非人声 instrumentals.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主
旋律人声 vocals+其他 instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-
WebUI/uvr5_weights -o HP5-主旋律人声 vocals+其他 instrumentals.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/5_HP-
Karaoke-UVR.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o
5_HP-Karaoke-UVR.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/
HP2_all_vocals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights
-o HP2_all_vocals.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/
HP3_all_vocals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights
-o HP3_all_vocals.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/
HP5_only_main_vocal.pth -d
/content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o
HP5_only_main_vocal.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-
DeEchoAggressive.pth -d
/content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o VR-
DeEchoAggressive.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-
DeEchoDeReverb.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights
-o VR-DeEchoDeReverb.pth\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-
DeEchoNormal.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o
VR-DeEchoNormal.pth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2RCaT9FTR0ej"
},
"outputs": [],
"source": [
"#@title Download hubert_base\n",
"!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M
https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -
d /content/Retrieval-based-Voice-Conversion-WebUI -o hubert_base.pt"
]
},
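{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a6CheckAssets"
},
"outputs": [],
"source": [
"#@title (Optional) Verify downloaded assets\n",
"#@markdown A minimal sketch that checks a few of the files the webUI needs were\n",
"#@markdown actually downloaded (a flaky connection can leave gaps). The paths simply\n",
"#@markdown mirror the aria2c commands above.\n",
"import os\n",
"\n",
"root = \"/content/Retrieval-based-Voice-Conversion-WebUI\"\n",
"expected = [\n",
"    f\"{root}/hubert_base.pt\",\n",
"    f\"{root}/pretrained/f0G40k.pth\",\n",
"    f\"{root}/pretrained/f0D40k.pth\",\n",
"    f\"{root}/pretrained_v2/f0G40k.pth\",\n",
"    f\"{root}/pretrained_v2/f0D40k.pth\",\n",
"]\n",
"for path in expected:\n",
"    status = \"OK\" if os.path.exists(path) else \"MISSING\"\n",
"    print(f\"{status}: {path}\")"
]
},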
{
"cell_type": "markdown",
"metadata": {
"id": "wpu3ylEQWZ03"
},
"source": [
"# 9. Run this if you want to split vocals into separate one-minute
segments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hS2USIpka445"
},
"outputs": [],
"source": [
"#@title Instal requirements for audio splitter ( 1 minute per split )\n",
"!pip install pydub"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6dCqJdxHann7",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Start Splitter ( 1-minute segments )\n",
"#@markdown Input file path, for example\n",
"#@markdown \"/content/separated/vocal.wav\"\n",
"#@markdown\n",
"#@markdown - What is this for?<br />\n",
"#@markdown To split the vocal duration into one-minute segments.<br />\n",
"#@markdown - Why does it need to be split?<br />\n",
"#@markdown To avoid \"connection timeout\" messages during inference when
using the Havert/Crepe mode, as the maximum duration that can be processed is only
around one minute.<br />\n",
"#@markdown However, if you are using the PM mode, you can skip this
process.\n",
"input_file = \"\" #@param {type:\"string\"}\n",
"from pydub import AudioSegment\n",
"import os\n",
"\n",
"def split_audio_per_minute(input_file):\n",
" output_directory = '/content/Split'\n",
" os.makedirs(output_directory, exist_ok=True)\n",
" audio = AudioSegment.from_file(input_file)\n",
" duration = len(audio)\n",
" total_minutes = int(duration / (60 * 1000))\n",
"\n",
" for minute in range(total_minutes + 1):\n",
" start_time = minute * 60 * 1000\n",
" end_time = (minute + 1) * 60 * 1000\n",
" split_audio = audio[start_time:end_time]\n",
" output_file = os.path.join(output_directory,
f'split_{os.path.splitext(os.path.basename(input_file))[0]}_{minute}.wav')\n",
" split_audio.export(output_file, format='wav')\n",
" print(f\"Berhasil memisahkan audio {os.path.basename(input_file)}
menit ke-{minute + 1}.\")\n",
"\n",
" print(\"Proses pemisahan audio selesai.\")\n",
"\n",
"#input_file = input(\"Masukkan file audio: \")\n",
"split_audio_per_minute(input_file)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aYhsHY6zZIr6"
},
"source": [
"# 10. Final process (Run webUI)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7vh6vphDwO0b"
},
"outputs": [],
"source": [
"#@title Run Web\n",
"%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
"# %load_ext tensorboard\n",
"# %tensorboard --logdir
/content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
"!python3 infer-web.py --colab --pycmd python3"
]
},
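{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a7Tensorboard"
},
"outputs": [],
"source": [
"#@title (Optional) Monitor training with TensorBoard\n",
"#@markdown The Run Web cell above ships these lines commented out; run this cell\n",
"#@markdown separately if you want to watch loss curves while training. It assumes\n",
"#@markdown training logs land under the webUI's logs directory, as in the cells above.\n",
"%load_ext tensorboard\n",
"%tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs"
]
},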
{
"cell_type": "code",
"source": [
"#@title Copy Training Result to Drive (standart)\n",
"if Mode == \"training\":\n",
" !mkdir -p /content/drive/MyDrive/RVC/{MODELNAME}\n",
" !mkdir -p /content/drive/MyDrive/RVC/weights\n",
"\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_*.pth
/content/drive/MyDrive/RVC/{MODELNAME}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_*.pth
/content/drive/MyDrive/RVC/{MODELNAME}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index
/content/drive/MyDrive/RVC/{MODELNAME}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/trained*.index
/content/drive/MyDrive/RVC/{MODELNAME}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy
/content/drive/MyDrive/RVC/{MODELNAME}\n",
"\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth
/content/drive/MyDrive/RVC/weights/{MODELNAME}.pth\n",
"elif Mode == \"inference\":\n",
" print(\"Mode set to inference. Skipping this section\")"
],
"metadata": {
"id": "bFdixVdmbt7G"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 11. copy training result (advanced)"
],
"metadata": {
"id": "yvz2jj1XfhsQ"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FgJuNeAwx5Y_",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title Start copying\n",
"#@markdown Same with standart copy but you can input the name manualy\n",
"import ipywidgets as widgets\n",
"from IPython.display import display\n",
"\n",
"modelname_input = widgets.Text(\n",
" value='',\n",
" placeholder='Masukkan nama model',\n",
" description='Nama Model:',\n",
" disabled=False\n",
")\n",
"\n",
"display(modelname_input)\n",
"\n",
"copy_button = widgets.Button(description=\"Salin Hasil Pelatihan\")\n",
"\n",
"def copy_training_result(button):\n",
" modelname = modelname_input.value.strip()\n",
" if modelname:\n",
" !mkdir -p /content/drive/MyDrive/RVC/{modelname}\n",
" !mkdir -p /content/drive/MyDrive/RVC/weights\n",
"\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{modelname}/G_*.pth
/content/drive/MyDrive/RVC/{modelname}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{modelname}/D_*.pth
/content/drive/MyDrive/RVC/{modelname}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{modelname}/added_*.index
/content/drive/MyDrive/RVC/{modelname}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{modelname}/trained*.index
/content/drive/MyDrive/RVC/{modelname}\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/logs/{modelname}/total_*.npy
/content/drive/MyDrive/RVC/{modelname}\n",
"\n",
" !cp
/content/Retrieval-based-Voice-Conversion-WebUI/weights/{modelname}.pth
/content/drive/MyDrive/RVC/weights/{modelname}.pth\n",
" print(\"Hasil pelatihan telah disalin ke Google Drive.\")\n",
" else:\n",
" print(\"Nama model tidak boleh kosong.\")\n",
"\n",
"copy_button.on_click(copy_training_result)\n",
"display(copy_button)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cQpsrCw80wYz"
},
"source": [
"# <center> Confused? Yes Me Too<br />\n",
"# <center> Some [☕](https://ko-fi.com/mocci24)?"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"FmlBOuMKSvDV",
"EGs9neZQeT8F",
"E22J0T-Uwt21",
"Qt3gYUiVV925",
"2Ij2dJJvSesJ",
"t1myplxnxHz5",
"31Y6S8tUhME3",
"sGuCk9_KUkYI",
"wpu3ylEQWZ03",
"aYhsHY6zZIr6",
"yvz2jj1XfhsQ"
],
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}