commit f2111321bf

README.md | 10

@@ -30,13 +30,15 @@ model("noisy_audio.wav")

 | Model | Dataset | STOI | PESQ | URL |
 | :---: | :---: | :---: | :---: | :---: |
-| WaveUnet | Vctk-28spk | 0.836 | 2.78 | shahules786/mayavoz-waveunet-valentini-28spk |
-| Demucs | Vctk-28spk | 0.961 | 2.56 | shahules786/mayavoz-demucs-valentini-28spk |
-| DCCRN | Vctk-28spk | 0.724 | 2.55 | shahules786/mayavoz-dccrn-valentini-28spk |
-| Demucs | DNS2020 (20hrs) | 0.56 | 1.26 | shahules786/mayavoz-demucs-dns2020-20hr |
+| WaveUnet | Valentini-28spk | 0.836 | 2.78 | shahules786/mayavoz-waveunet-valentini-28spk |
+| Demucs | Valentini-28spk | 0.961 | 2.56 | shahules786/mayavoz-demucs-valentini-28spk |
+| DCCRN | Valentini-28spk | 0.724 | 2.55 | shahules786/mayavoz-dccrn-valentini-28spk |
+| Demucs | MS-SNSD-20hrs | 0.56 | 1.26 | shahules786/mayavoz-demucs-ms-snsd-20 |

 Test scores are based on the test set associated with each training dataset.

+**See [tutorials](/notebooks/) to train your custom model**
+
 ## Installation
 Only Python 3.8+ is officially supported (though it might work with Python 3.7)

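For orientation, the `URL` column holds Hugging Face-style model identifiers. A minimal loading sketch, assuming a `from_pretrained`-style loader (only the `model("noisy_audio.wav")` call is shown in the hunk above; the class and method names here are assumptions):

```python
# Hypothetical usage; loader class/method names are assumptions.
from mayavoz.models import Mayamodel

model = Mayamodel.from_pretrained("shahules786/mayavoz-demucs-valentini-28spk")
enhanced = model("noisy_audio.wav")  # enhance a noisy recording
```
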
@@ -1,5 +1,5 @@
 _target_: mayavoz.data.dataset.MayaDataset
-name : dns-2020
+name : MS-SNSD
 root_dir : /Users/shahules/Myprojects/MS-SNSD
 duration : 2.0
 sampling_rate: 16000

@@ -1,5 +1,5 @@
 _target_: mayavoz.data.dataset.MayaDataset
-name : vctk
+name : Valentini
 root_dir : /scratch/c.sistc3/DS_10283_2791
 duration : 4.5
 stride : 2

@@ -1,13 +0,0 @@
-_target_: mayavoz.data.dataset.MayaDataset
-name : vctk
-root_dir : /scratch/c.sistc3/DS_10283_2791
-duration : 4.5
-stride : 2
-sampling_rate: 16000
-batch_size: 32
-valid_minutes : 15
-files:
-  train_clean : clean_trainset_28spk_wav
-  test_clean : clean_testset_wav
-  train_noisy : noisy_trainset_28spk_wav
-  test_noisy : noisy_testset_wav

@@ -95,7 +95,7 @@ class Fileprocessor:
         if matching_function is None:
             if name.lower() in ("vctk", "valentini"):
                 return cls(clean_dir, noisy_dir, ProcessorFunctions.one_to_one)
-            elif name.lower() == "dns-2020":
+            elif name.lower() == "ms-snsd":
                 return cls(clean_dir, noisy_dir, ProcessorFunctions.one_to_many)
             else:
                 raise ValueError(

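For context on the two strategies selected above: the Valentini layout keeps identical file names in the clean and noisy directories, while MS-SNSD synthesizes several noisy mixtures per clean utterance. A rough sketch of what the two `ProcessorFunctions` strategies do (an illustrative re-implementation, not the actual mayavoz code):

```python
import os
from glob import glob


def one_to_one(clean_dir: str, noisy_dir: str) -> list:
    # Valentini-style: same basename in both directories -> one pair per file.
    pairs = []
    for clean_path in sorted(glob(os.path.join(clean_dir, "*.wav"))):
        noisy_path = os.path.join(noisy_dir, os.path.basename(clean_path))
        if os.path.exists(noisy_path):
            pairs.append((clean_path, noisy_path))
    return pairs


def one_to_many(clean_dir: str, noisy_dir: str) -> list:
    # MS-SNSD-style: several noisy mixtures share one clean utterance's stem.
    pairs = []
    for clean_path in sorted(glob(os.path.join(clean_dir, "*.wav"))):
        stem = os.path.splitext(os.path.basename(clean_path))[0]
        for noisy_path in sorted(glob(os.path.join(noisy_dir, f"*{stem}*.wav"))):
            pairs.append((clean_path, noisy_path))
    return pairs
```

The test hunk at the bottom of this commit shows the public entry point: `Fileprocessor.from_name("ms-snsd", clean_dir, noisy_dir)` now selects the `one_to_many` strategy.
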
@@ -82,7 +82,7 @@
     "- `min_valid_minutes`: minimum validation data in minutes. Validation is automatically selected from the training set (exclusive users).\n",
     "- `matching_function`: there are two types of mapping functions.\n",
     "    - `one_to_one` : each clean file has exactly one corresponding noisy file, e.g. the Valentini datasets\n",
-    "    - `one_to_many` : In this one clean file will only have one corresponding noisy file. For example DNS dataset.\n"
+    "    - `one_to_many` : one clean file can have many corresponding noisy files, e.g. the MS-SNSD dataset.\n"
    ]
   },
   {

@@ -302,7 +302,7 @@
    "source": [
     "Now there are two types of `matching_function`\n",
     "- `one_to_one` : each clean file has exactly one corresponding noisy file, e.g. the Valentini datasets\n",
-    "- `one_to_many` : In this one clean file will only have one corresponding noisy file. For example DNS dataset."
+    "- `one_to_many` : one clean file can have many corresponding noisy files, e.g. the MS-SNSD dataset."
    ]
   },
   {

@@ -479,7 +479,7 @@
    "metadata": {},
    "source": [
     "### TL;DR\n",
-    "Calling the following command would train mayavoz Demucs model on DNS-2020 dataset.\n",
+    "Calling the following command trains the mayavoz Demucs model on the MS-SNSD dataset.\n",
     "\n",
     "```bash\n",
     "mayavoz-train \\\n",

@@ -540,7 +540,7 @@
     "mayavoz-train --cfg job \\\n",
     "    model=Demucs \\\n",
     "    Demucs.sampling_rate=16000 \\\n",
-    "    dataset=DNS-2020\n",
+    "    dataset=MS-SNSD\n",
     "\n",
     "```\n",
     "\n",

@@ -562,7 +562,7 @@
     "```bash\n",
     "mayavoz-train \\\n",
     "    model=Demucs model.sampling_rate=16000 \\\n",
-    "    dataset=DNS-2020\n",
+    "    dataset=MS-SNSD\n",
     "\n",
     "```"
    ]

@@ -570,9 +570,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "enhancer",
+   "display_name": "Python 3.8.13 ('enhancer')",
    "language": "python",
-   "name": "enhancer"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {

@@ -1,15 +0,0 @@
-### DNS Challenge's dataset
-
-The Deep Noise Suppression (DNS) Challenge is a single-channel speech enhancement
-challenge organized by Microsoft, with a focus on real-time applications.
-More info can be found on the [official page](https://dns-challenge.azurewebsites.net/).
-
-**References**
-The challenge paper, [here](https://arxiv.org/abs/2001.08662).
-```BibTex
-@misc{DNSChallenge2020,
-title={The INTERSPEECH 2020 Deep Noise Suppression Challenge: Datasets, Subjective Speech Quality and Testing Framework},
-author={Chandan K. A. Reddy and Ebrahim Beyrami and Harishchandra Dubey and Vishak Gopal and Roger Cheng and Ross Cutler and Sergiy Matusevych and Robert Aichner and Ashkan Aazami and Sebastian Braun and Puneet Rana and Sriram Srinivasan and Johannes Gehrke}, year={2020},
-doi=https://doi.org/10.48550/arXiv.2001.08662,
-}
-```

@@ -1,13 +0,0 @@
-_target_: mayavoz.data.dataset.MayaDataset
-name : vctk
-root_dir : /scratch/c.sistc3/DS_10283_2791
-duration : 4.5
-stride : 2
-sampling_rate: 16000
-batch_size: 32
-valid_minutes : 15
-files:
-  train_clean : clean_trainset_28spk_wav
-  test_clean : clean_testset_wav
-  train_noisy : noisy_trainset_28spk_wav
-  test_noisy : noisy_testset_wav

@@ -1,7 +0,0 @@
-loss : mae
-metric : [stoi,pesq,si-sdr]
-lr : 0.0003
-ReduceLr_patience : 5
-ReduceLr_factor : 0.2
-min_lr : 0.000001
-EarlyStopping_factor : 10

|
@ -1,5 +0,0 @@
|
||||||
_target_: mayavoz.models.waveunet.WaveUnet
|
|
||||||
num_channels : 1
|
|
||||||
depth : 9
|
|
||||||
initial_output_channels: 24
|
|
||||||
sampling_rate : 16000
|
|
||||||
|
|
@@ -19,9 +19,9 @@ JOB_ID = os.environ.get("SLURM_JOBID", "0")


 @hydra.main(config_path="train_config", config_name="config")
-def main(config: DictConfig):
+def train(config: DictConfig):

-    OmegaConf.save(config, "config_log.yaml")
+    OmegaConf.save(config, "config.yaml")

     callbacks = []
     logger = MLFlowLogger(

@@ -96,7 +96,7 @@ def main(config: DictConfig):
     trainer.test(model)

     logger.experiment.log_artifact(
-        logger.run_id, f"{trainer.default_root_dir}/config_log.yaml"
+        logger.run_id, f"{trainer.default_root_dir}/config.yaml"
     )

     saved_location = os.path.join(

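Since the logged artifact is renamed from `config_log.yaml` to `config.yaml`, anything that later fetches it from MLflow must ask for the new name. A retrieval sketch using the standard MLflow client API (the run id is a placeholder):

```python
from mlflow.tracking import MlflowClient

client = MlflowClient()
run_id = "<your-run-id>"  # placeholder; take it from the MLflow UI or logger.run_id
# Downloads the artifact logged above; note the new file name "config.yaml".
local_path = client.download_artifacts(run_id, "config.yaml")
print(local_path)
```
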
@@ -117,4 +117,4 @@ def main(config: DictConfig):


 if __name__ == "__main__":
-    main()
+    train()

@@ -1,6 +1,6 @@
 defaults:
   - model : Demucs
-  - dataset : Vctk
+  - dataset : MS-SNSD
   - optimizer : Adam
   - hyperparameters : default
   - trainer : default

@@ -1,10 +1,11 @@
 _target_: mayavoz.data.dataset.MayaDataset
+name : MS-SNSD
 root_dir : /Users/shahules/Myprojects/MS-SNSD
-name : dns-2020
-duration : 2.0
+duration : 1.5
+stride : 1
 sampling_rate: 16000
 batch_size: 32
-valid_size: 0.05
+min_valid_minutes: 25
 files:
   train_clean : CleanSpeech_training
   test_clean : CleanSpeech_training

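Because the config carries a `_target_`, it is presumably materialized with Hydra's `instantiate` utility. A minimal sketch using the field values from the hunk above (the exact call site inside mayavoz is assumed, and the `files:` block is omitted for brevity):

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Values copied from the dataset hunk above; MayaDataset is assumed
# to accept these keys as constructor arguments.
cfg = OmegaConf.create(
    {
        "_target_": "mayavoz.data.dataset.MayaDataset",
        "name": "MS-SNSD",
        "root_dir": "/Users/shahules/Myprojects/MS-SNSD",
        "duration": 1.5,
        "stride": 1,
        "sampling_rate": 16000,
        "batch_size": 32,
        "min_valid_minutes": 25,
    }
)
dataset = instantiate(cfg)  # builds the MayaDataset described by the config
```
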
@@ -0,0 +1,7 @@
+loss : si-snr
+metric : [stoi,pesq]
+lr : 0.001
+ReduceLr_patience : 10
+ReduceLr_factor : 0.5
+min_lr : 0.000001
+EarlyStopping_factor : 10

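The `ReduceLr_*` and `min_lr` keys presumably map onto PyTorch's `ReduceLROnPlateau` scheduler (with `EarlyStopping_factor` feeding an early-stopping callback); a sketch of the likely wiring under that assumption:

```python
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(1, 1)  # stand-in for the actual mayavoz model
optimizer = Adam(model.parameters(), lr=0.001)  # lr from the hunk above
scheduler = ReduceLROnPlateau(
    optimizer,
    patience=10,      # ReduceLr_patience
    factor=0.5,       # ReduceLr_factor
    min_lr=0.000001,  # min_lr
)
```
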
@@ -19,9 +19,9 @@ JOB_ID = os.environ.get("SLURM_JOBID", "0")


 @hydra.main(config_path="train_config", config_name="config")
-def main(config: DictConfig):
+def train(config: DictConfig):

-    OmegaConf.save(config, "config_log.yaml")
+    OmegaConf.save(config, "config.yaml")

     callbacks = []
     logger = MLFlowLogger(

@@ -96,7 +96,7 @@ def main(config: DictConfig):
     trainer.test(model)

     logger.experiment.log_artifact(
-        logger.run_id, f"{trainer.default_root_dir}/config_log.yaml"
+        logger.run_id, f"{trainer.default_root_dir}/config.yaml"
     )

     saved_location = os.path.join(

@@ -117,4 +117,4 @@ def main(config: DictConfig):


 if __name__ == "__main__":
-    main()
+    train()

@@ -1,6 +1,6 @@
 defaults:
   - model : Demucs
-  - dataset : Vctk
+  - dataset : MS-SNSD
   - optimizer : Adam
   - hyperparameters : default
   - trainer : default

@@ -1,10 +1,11 @@
 _target_: mayavoz.data.dataset.MayaDataset
+name : MS-SNSD
 root_dir : /Users/shahules/Myprojects/MS-SNSD
-name : dns-2020
-duration : 2.0
+duration : 5
+stride : 1
 sampling_rate: 16000
 batch_size: 32
-valid_size: 0.05
+min_valid_minutes: 25
 files:
   train_clean : CleanSpeech_training
   test_clean : CleanSpeech_training

@@ -0,0 +1,7 @@
+loss : mae
+metric : [stoi,pesq]
+lr : 0.0003
+ReduceLr_patience : 10
+ReduceLr_factor : 0.5
+min_lr : 0.000001
+EarlyStopping_factor : 10

@@ -0,0 +1,2 @@
+experiment_name : shahules/mayavoz
+run_name : demucs-ms-snsd

@@ -0,0 +1,17 @@
+### Microsoft Scalable Noisy Speech Dataset (MS-SNSD)
+
+MS-SNSD is a speech dataset that can scale to arbitrary sizes depending on the number of speakers, noise types, and Speech to Noise Ratio (SNR) levels desired.
+
+### Dataset download & setup
+- Follow the steps in the official repo [here](https://github.com/microsoft/MS-SNSD) to download and set up the dataset.
+
+**References**
+```BibTex
+@article{reddy2019scalable,
+title={A Scalable Noisy Speech Dataset and Online Subjective Test Framework},
+author={Reddy, Chandan KA and Beyrami, Ebrahim and Pool, Jamie and Cutler, Ross and Srinivasan, Sriram and Gehrke, Johannes},
+journal={Proc. Interspeech 2019},
+pages={1816--1820},
+year={2019}
+}
+```

@@ -1,7 +0,0 @@
-loss : mae
-metric : [stoi,pesq,si-sdr]
-lr : 0.0003
-ReduceLr_patience : 5
-ReduceLr_factor : 0.2
-min_lr : 0.000001
-EarlyStopping_factor : 10

@@ -1,2 +0,0 @@
-experiment_name : shahules/mayavoz
-run_name : Demucs + Vtck with stride + augmentations

@@ -1,25 +0,0 @@
-_target_: mayavoz.models.dccrn.DCCRN
-num_channels: 1
-sampling_rate : 16000
-complex_lstm : True
-complex_norm : True
-complex_relu : True
-masking_mode : True
-
-encoder_decoder:
-  initial_output_channels : 32
-  depth : 6
-  kernel_size : 5
-  growth_factor : 2
-  stride : 2
-  padding : 2
-  output_padding : 1
-
-lstm:
-  num_layers : 2
-  hidden_size : 256
-
-stft:
-  window_len : 400
-  hop_size : 100
-  nfft : 512

@@ -1,16 +0,0 @@
-_target_: mayavoz.models.demucs.Demucs
-num_channels: 1
-resample: 4
-sampling_rate : 16000
-
-encoder_decoder:
-  depth: 4
-  initial_output_channels: 64
-  kernel_size: 8
-  stride: 4
-  growth_factor: 2
-  glu: True
-
-lstm:
-  bidirectional: False
-  num_layers: 2

@@ -1,5 +0,0 @@
-_target_: mayavoz.models.waveunet.WaveUnet
-num_channels : 1
-depth : 9
-initial_output_channels: 24
-sampling_rate : 16000

@@ -4,7 +4,7 @@ Clean and noisy parallel speech database. The database was designed to train and

 **References**
 ```BibTex
-@misc{DNSChallenge2020,
+@misc{valentini2017,
 title={Noisy speech database for training speech enhancement algorithms and TTS models},
 author={Valentini-Botinhao, Cassia}, year={2017},
 doi=https://doi.org/10.7488/ds/2117,

@@ -36,7 +36,7 @@ def test_fileprocessor_vctk():
     assert len(matching_dict) == 2


-@pytest.mark.parametrize("dataset_name", ["vctk", "dns-2020"])
+@pytest.mark.parametrize("dataset_name", ["vctk", "MS-SNSD"])
 def test_fileprocessor_names(dataset_name):
     fp = Fileprocessor.from_name(dataset_name, "clean_dir", "noisy_dir")
     assert hasattr(fp.matching_function, "__call__")