diff --git a/notebooks/Getting_started.ipynb b/notebooks/Getting_started.ipynb
index c9a47dd..b25b51f 100644
--- a/notebooks/Getting_started.ipynb
+++ b/notebooks/Getting_started.ipynb
@@ -30,6 +30,17 @@
"! pip install -q mayavoz "
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e3b59ac5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.chdir(\"/Users/shahules/Myprojects/enhancer\")"
+ ]
+ },
{
"cell_type": "markdown",
"id": "87ee497f",
@@ -62,14 +73,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "67698871",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/anaconda3/envs/enhancer/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
"source": [
"\n",
"from mayavoz import Mayamodel\n",
- "model = Mayamodel.from_pretrained(\"mayavoz/waveunet\")\n"
+ "model = Mayamodel.from_pretrained(\"shahules786/mayavoz-dccrn-valentini-28spk\")\n"
]
},
{
@@ -82,13 +102,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "d7996c16",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 1, 36414])"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "file = \"myvoice.wav\"\n",
- "audio = model.enhance(\"myvoice.wav\")\n",
+ "audio = model.enhance(\"my_voice.wav\")\n",
"audio.shape"
]
},
@@ -96,19 +126,84 @@
"cell_type": "markdown",
"id": "8ee20a83",
"metadata": {},
+ "source": [
+ "**Inference using numpy ndarray**\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "e1a1c718",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(36414,)"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from librosa import load\n",
+ "my_voice,sr = load(\"my_voice.wav\",sr=16000)\n",
+ "my_voice.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "56b5c01b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1, 1, 36414)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "audio = model.enhance(my_voice,sampling_rate=sr)\n",
+ "audio.shape"
+ ]
+ },
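+ {
+ "cell_type": "markdown",
+ "id": "9c2d41aa",
+ "metadata": {},
+ "source": [
+ "Note that the output follows the input type: the NumPy input above comes back as a NumPy array of shape `(1, 1, 36414)`, while the torch tensor input below returns a `torch.Tensor`."
+ ]
+ },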
+ {
+ "cell_type": "markdown",
+ "id": "e0ab4d43",
+ "metadata": {},
"source": [
"**Inference using torch tensor**\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "e1a1c718",
+ "execution_count": 22,
+ "id": "fc6192b9",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 1, 36414])"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "audio_tensor = torch.rand(1,1,32000) ## random audio data\n",
- "audio = model.enhance(audio_tensor)\n",
+ "my_voice = torch.from_numpy(my_voice)\n",
+ "audio = model.enhance(my_voice,sampling_rate=sr)\n",
"audio.shape"
]
},
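+ {
+ "cell_type": "markdown",
+ "id": "3f7c90de",
+ "metadata": {},
+ "source": [
+ "If you want to choose the output filename yourself, you can write the enhanced tensor with `soundfile` (a minimal sketch: `soundfile` is a `librosa` dependency, the filename is arbitrary, and we assume the 16 kHz rate used above)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a15d3b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import soundfile as sf\n",
+ "\n",
+ "# enhance() returned a (1, 1, num_samples) tensor; squeeze to 1-D before writing\n",
+ "sf.write(\"enhanced_my_voice.wav\", audio.squeeze().numpy(), samplerate=16000)"
+ ]
+ },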
@@ -122,24 +217,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "9e0313f7",
"metadata": {},
"outputs": [],
"source": [
- "audio = model.enhance(\"myvoice.wav\",save_output=True)"
+ "audio = model.enhance(\"my_voice.wav\",save_output=True)"
]
},
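+ {
+ "cell_type": "markdown",
+ "id": "5d88e1f4",
+ "metadata": {},
+ "source": [
+ "With `save_output=True` the enhanced audio is written to disk: `my_voice.wav` comes back as `cleaned_my_voice.wav`, which we play back below."
+ ]
+ },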
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "25077720",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "from Ipython.audio import Audio\n",
- "\n",
- "Audio(\"myvoice_cleaned.wav\",rate=SAMPLING_RATE)"
+ "from IPython.display import Audio\n",
+ "SAMPLING_RATE = 16000\n",
+ "Audio(\"cleaned_my_voice.wav\",rate=SAMPLING_RATE)"
]
},
{
@@ -183,19 +297,19 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "2c8c2b12",
"metadata": {},
"outputs": [],
"source": [
"from mayavoz.utils import Files\n",
"\n",
- "name = \"dataset_name\"\n",
- "root_dir = \"root_directory_of_your_dataset\"\n",
- "files = Files(train_clean=\"train_cleanfiles_foldername\",\n",
- " train_noisy=\"noisy_train_foldername\",\n",
- " test_clean=\"clean_test_foldername\",\n",
- " test_noisy=\"noisy_test_foldername\")\n",
+ "name = \"valentini\"\n",
+ "root_dir = \"/Users/shahules/Myprojects/enhancer/datasets/vctk\"\n",
+ "files = Files(train_clean=\"clean_testset_wav\",\n",
+ " train_noisy=\"clean_testset_wav\",\n",
+ " test_clean=\"noisy_testset_wav\",\n",
+ " test_noisy=\"noisy_testset_wav\")\n",
"duration = 4.0 \n",
"stride = None\n",
"sampling_rate = 16000"
@@ -207,13 +321,13 @@
"metadata": {},
"source": [
"Now there are two types of `matching_function`\n",
- "- `one_to_one` : In this one clean file will only have one corresponding noisy file. For example VCTK datasets\n",
+ "- `one_to_one` : In this one clean file will only have one corresponding noisy file. For example Valentini datasets\n",
"- `one_to_many` : In this one clean file will only have one corresponding noisy file. For example DNS dataset."
]
},
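+ {
+ "cell_type": "markdown",
+ "id": "b4a6c8d0",
+ "metadata": {},
+ "source": [
+ "In Valentini, the clean and noisy folders contain identically named files (e.g. `p232_001.wav` appears in both), which is what makes a `one_to_one` mapping possible."
+ ]
+ },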
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "4b0fdc62",
"metadata": {},
"outputs": [],
@@ -223,25 +337,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "ff0cfe60",
"metadata": {},
"outputs": [],
"source": [
- "from mayavoz.dataset import MayaDataset\n",
+ "from mayavoz.data import MayaDataset\n",
"dataset = MayaDataset(\n",
" name=name,\n",
" root_dir=root_dir,\n",
" files=files,\n",
" duration=duration,\n",
" stride=stride,\n",
- " sampling_rate=sampling_rate\n",
+ " sampling_rate=sampling_rate,\n",
+ " min_valid_minutes = 5.0,\n",
" )\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "acfdc655",
"metadata": {},
"outputs": [],
@@ -252,7 +367,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 12,
"id": "4fabe46d",
"metadata": {},
"outputs": [],
@@ -262,13 +377,91 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "20d98ed0",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "GPU available: False, used: False\n",
+ "TPU available: False, using: 0 TPU cores\n",
+ "IPU available: False, using: 0 IPUs\n",
+ "HPU available: False, using: 0 HPUs\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Selected fp257 for valid\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " | Name | Type | Params\n",
+ "----------------------------------------\n",
+ "0 | _loss | LossWrapper | 0 \n",
+ "1 | encoder | ModuleList | 4.7 M \n",
+ "2 | decoder | ModuleList | 4.7 M \n",
+ "3 | de_lstm | DemucsLSTM | 24.8 M\n",
+ "----------------------------------------\n",
+ "34.2 M Trainable params\n",
+ "0 Non-trainable params\n",
+ "34.2 M Total params\n",
+ "136.866 Total estimated model params size (MB)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total train duration 27.4 minutes\n",
+ "Total validation duration 29.733333333333334 minutes\n",
+ "Total test duration 57.2 minutes\n",
+ "Epoch 0: 48%|▍| 13/27 [15:18<16:29, 70.66s/it, loss=0.0265, v_num=2, train_loss\n",
+ "Validation: 0it [00:00, ?it/s]\u001b[A\n",
+ "Validation: 0%| | 0/14 [00:00, ?it/s]\u001b[A\n",
+ "Validation DataLoader 0: 0%| | 0/14 [00:00, ?it/s]\u001b[A\n",
+ "Epoch 0: 52%|▌| 14/27 [19:55<18:29, 85.37s/it, loss=0.0265, v_num=2, train_loss\u001b[A\n",
+ "Epoch 0: 56%|▌| 15/27 [42:05<33:40, 168.34s/it, loss=0.0265, v_num=2, train_los\u001b[A\n",
+ "Epoch 0: 59%|▌| 16/27 [1:14:42<51:21, 280.16s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 63%|▋| 17/27 [1:28:59<52:20, 314.08s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 67%|▋| 18/27 [1:29:11<44:35, 297.31s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 70%|▋| 19/27 [1:44:31<44:00, 330.08s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 74%|▋| 20/27 [1:53:03<39:34, 339.17s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 78%|▊| 21/27 [2:11:07<37:27, 374.63s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 81%|▊| 22/27 [2:30:21<34:10, 410.07s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 85%|▊| 23/27 [2:30:33<26:11, 392.78s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 89%|▉| 24/27 [2:30:46<18:50, 376.94s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 93%|▉| 25/27 [2:30:58<12:04, 362.34s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 96%|▉| 26/27 [2:31:10<05:48, 348.85s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 100%|█| 27/27 [2:31:21<00:00, 336.34s/it, loss=0.0265, v_num=2, train_l\u001b[A\n",
+ "Epoch 0: 100%|█| 27/27 [2:31:21<00:00, 336.34s/it, loss=0.0265, v_num=2, train_l\u001b[A"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "`Trainer.fit` stopped: `max_epochs=1` reached.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 0: 100%|█| 27/27 [2:31:21<00:00, 336.37s/it, loss=0.0265, v_num=2, train_l\n"
+ ]
+ }
+ ],
"source": [
- "trainer = pl.Trainer(model)\n",
- "trainer.fit(max_epochs=1)"
+ "trainer = pl.Trainer(max_epochs=1)\n",
+ "trainer.fit(model)"
]
},
{
@@ -312,17 +505,8 @@
"mayavoz-train \\\n",
" model=Demucs \\\n",
" Demucs.sampling_rate=16000 \\\n",
- " dataset=DNS-2020 \\\n",
- " DNS-2020.name = \"dns-2020\" \\\n",
- " DNS-2020.root_dir=\"your_root_dir\" \\\n",
- " DNS-2020.train_clean=\"\" \\\n",
- " DNS-2020.train_noisy=\"\" \\\n",
- " DNS-2020.test_clean=\"\" \\\n",
- " DNS-2020.test_noisy=\"\" \\\n",
- " DNS-2020.sampling_rate=16000 \\\n",
- " DNS-2020.duration=2.0 \\\n",
- " traine=default \\ \n",
- " default.max_epochs=1 \\\n",
+ " dataset=VCTK dataset.root_dir = \"your_root_directory\" \\\n",
+ " trainer=fastrun_dev\n",
"\n",
"```\n",
"\n",
@@ -336,27 +520,34 @@
"metadata": {},
"outputs": [],
"source": [
- "from mayavoz.utils import Files\n",
"from mayavoz.data import MayaDataset\n",
"from mayavoz.models import Demucs\n",
"\n",
- "files = Files(\n",
- " train_clean=\"\",\n",
- " train_noisy=\"\",\n",
- " test_clean=\"\",\n",
- " test_noisy=\"\"\n",
- ")\n",
"dataset = MayaDataset(\n",
- " name='dns-2020'\n",
- " root_dir=\"your_root_dir\",\n",
- " files=files,\n",
- " sampling_rate=16000,\n",
- " duration=2.0)\n",
- "model = Demucs(dataset=dataset,sampling_rate=16000)\n",
- "trainer = Trainer(max_epochs=1)\n",
+ " name='vctk'\n",
+ " root_dir=\"your_root_directory\",\n",
+ " )\n",
+ "model = Demucs(dataset=dataset, sampling_rate=16000)\n",
+ "trainer = Trainer()\n",
"trainer.fit(model)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "6558a563",
+ "metadata": {},
+ "source": [
+ "For example, if you want to add/change `stride` of dataset\n",
+ "\n",
+ "```bash\n",
+ "mayavoz-train \\\n",
+ " model=Demucs \\\n",
+ " Demucs.sampling_rate=16000 \\\n",
+ " dataset=VCTK dataset.root_dir = \"your_root_directory\" dataset.stride=1\\\n",
+ "\n",
+ "```"
+ ]
+ },
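+ {
+ "cell_type": "markdown",
+ "id": "2e9f7a18",
+ "metadata": {},
+ "source": [
+ "The same override in Python is just the `stride` argument of `MayaDataset` (a sketch reusing the `name`, `root_dir`, `files`, `duration`, and `sampling_rate` values defined earlier):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c3b5e92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = MayaDataset(\n",
+ " name=name,\n",
+ " root_dir=root_dir,\n",
+ " files=files,\n",
+ " duration=duration,\n",
+ " stride=1,\n",
+ " sampling_rate=sampling_rate,\n",
+ " )"
+ ]
+ },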
{
"cell_type": "markdown",
"id": "eb26692c",
@@ -405,9 +596,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "mayavoz",
+ "display_name": "enhancer",
"language": "python",
- "name": "mayavoz"
+ "name": "enhancer"
},
"language_info": {
"codemirror_mode": {
@@ -420,6 +611,11 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "aa065deb7c1aa0a1a524e1ebced87b297febfedb61bf47eab2415d34995331a2"
+ }
}
},
"nbformat": 4,