changes to prep dns 2020

2022-10-14 15:20:34 +05:30 · 2022-10-14 15:20:34 +05:30 · 8d1c057b86
parent 6e0f69f575
commit 8d1c057b86
7 changed files with 280 additions and 18 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -40,4 +40,5 @@ repos:
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: mixed-line-ending
+        exclude: noisyspeech_synthesizer.cfg
        args: ['--fix=no']
--- a/audiolib.py
+++ b/audiolib.py
@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 26 15:54:05 2019
+
+@author: chkarada
+"""
+import os
+
+import numpy as np
+import soundfile as sf
+
+
+# Function to read audio
+def audioread(path, norm=True, start=0, stop=None):
+    """Read an audio file as a mono signal, optionally RMS-normalized.
+
+    Args:
+        path: Path of the audio file; it is resolved to an absolute path.
+        norm: When true, scale the signal so that its RMS equals
+            ``10 ** (-25 / 20)``. Silent (all-zero) signals are left
+            unscaled instead of being divided by zero.
+        start: Forwarded to ``soundfile.read`` as ``start``.
+        stop: Forwarded to ``soundfile.read`` as ``stop``.
+
+    Returns:
+        Tuple ``(x, sr)``: the samples and the sample rate returned by
+        ``soundfile.read``. Input with more than one dimension is reduced
+        to one by averaging over the second axis.
+
+    Raises:
+        ValueError: If ``path`` does not exist.
+        RuntimeError: If ``soundfile`` cannot read the file.
+    """
+    path = os.path.abspath(path)
+    if not os.path.exists(path):
+        raise ValueError("[{}] does not exist!".format(path))
+    try:
+        x, sr = sf.read(path, start=start, stop=stop)
+    except RuntimeError as err:  # e.g. sph pcm-embedded shortened v2
+        # Printing a warning and carrying on would only fail later on the
+        # unbound ``x``; stop here with a message that names the file.
+        raise RuntimeError(
+            "Audio type not supported: [{}]".format(path)
+        ) from err
+
+    if len(x.shape) > 1:  # multi-channel: average the channels
+        x = x.T
+        x = x.sum(axis=0) / x.shape[0]
+
+    if norm:
+        rms = (x**2).mean() ** 0.5
+        if rms > 0:  # silent audio cannot be scaled to a target level
+            x = x * (10 ** (-25 / 20) / rms)
+    return x, sr
+
+
+# Function to write audio
+def audiowrite(data, fs, destpath, norm=False):
+    """Write ``data`` to ``destpath``, creating its directory if needed.
+
+    Args:
+        data: Array of audio samples.
+        fs: Sample rate, forwarded to ``soundfile.write``.
+        destpath: Output file path; it is resolved to an absolute path.
+        norm: When true, scale ``data`` so that its RMS equals
+            ``10 ** (-25 / 20)`` (the same level ``audioread`` and
+            ``snr_mixer`` use), then divide by the peak if the peak
+            magnitude reaches 1. Silent data is left unscaled.
+
+    Returns:
+        None.
+    """
+    if norm:
+        rms = (data**2).mean() ** 0.5
+        if rms > 0:  # silent audio cannot be scaled to a target level
+            data = data * (10 ** (-25 / 20) / rms)
+        # np.max gives a scalar peak; the builtin max(array, float) would
+        # raise on the ambiguous truth value of an array comparison.
+        peak = np.max(np.abs(data))
+        if peak >= 1:
+            data = data / peak
+
+    destpath = os.path.abspath(destpath)
+    destdir = os.path.dirname(destpath)
+    os.makedirs(destdir, exist_ok=True)
+
+    sf.write(destpath, data, fs)
+    return
+
+
+# Function to mix clean speech and noise at various SNR levels
+def snr_mixer(clean, noise, snr):
+    """Mix ``clean`` and ``noise`` at the requested signal-to-noise ratio.
+
+    Both signals are first scaled to an RMS of ``10 ** (-25 / 20)``. The
+    noise is then rescaled so that ``20 * log10(rms(clean) / rms(noise))``
+    equals ``snr``, and the two are added sample by sample.
+
+    Args:
+        clean: Array of clean speech samples.
+        noise: Array of noise samples; it is added element-wise to
+            ``clean``, so the shapes must be compatible.
+        snr: Target SNR in dB.
+
+    Returns:
+        Tuple ``(clean, noisenewlevel, noisyspeech)``: the normalized clean
+        signal, the rescaled noise, and their sum.
+    """
+    target_rms = 10 ** (-25 / 20)
+
+    # Normalizing to -25 dB FS (silent inputs are left as they are)
+    rmsclean = (clean**2).mean() ** 0.5
+    if rmsclean > 0:
+        clean = clean * (target_rms / rmsclean)
+    rmsclean = (clean**2).mean() ** 0.5
+
+    rmsnoise = (noise**2).mean() ** 0.5
+    if rmsnoise > 0:
+        noise = noise * (target_rms / rmsnoise)
+    rmsnoise = (noise**2).mean() ** 0.5
+
+    # Set the noise level for a given SNR. RMS is an amplitude, so the gain
+    # is the plain ratio; taking its square root would halve the SNR in dB.
+    if rmsnoise > 0:
+        noisescalar = rmsclean / (10 ** (snr / 20)) / rmsnoise
+    else:
+        noisescalar = 0.0
+    noisenewlevel = noise * noisescalar
+    noisyspeech = clean + noisenewlevel
+    return clean, noisenewlevel, noisyspeech
--- a/hpc_entrypoint.sh
+++ b/hpc_entrypoint.sh
@ -34,6 +34,6 @@ pwd

 #python transcriber/tasks/embeddings/timit.py --directory /scratch/$USER/TIMIT/data/lisa/data/timit/raw/TIMIT/TRAIN --output ./data/train
 #python transcriber/tasks/embeddings/timit.py --directory /scratch/$USER/TIMIT/data/lisa/data/timit/raw/TIMIT/TEST --output ./data/test
-
+python noisyspeech_synthesizer.py
 echo "Start Training..."
-python enhancer/cli/train.py
+#python enhancer/cli/train.py
--- a/noisyspeech_synthesizer.cfg
+++ b/noisyspeech_synthesizer.cfg
@ -0,0 +1,29 @@
+# Configuration for generating Noisy Speech Dataset
+
+# - sampling_rate: Specify the sampling rate. Default is 16 kHz
+# - audioformat: glob pattern used to find audio files in speech_dir and noise_dir. Default is *.wav
+# - audio_length: Minimum Length of each audio clip (noisy and clean speech) in seconds that will be generated by augmenting utterances.
+# - silence_length: Duration of silence, in seconds, inserted between concatenated clips (clean speech utterances and noise files alike).
+# - total_hours: Total number of hours of data required. Units are in hours.
+# - snr_lower: Lower bound for SNR required (default: 0 dB)
+# - snr_upper: Upper bound for SNR required (default: 40 dB)
+# - total_snrlevels: Number of SNR levels required (default: 5, which means there are 5 levels between snr_lower and snr_upper)
+# - noise_dir: Default is None. But specify the noise directory path if noise files are not in the source directory
+# - speech_dir: Default is None. But specify the speech directory path if speech files are not in the source directory
+# - noise_types_excluded: Noise files starting with the following tags to be excluded in the noise list. Example: noise_types_excluded: Babble, AirConditioner
+#                         Specify 'None' if no noise files to be excluded.
+
+[noisy_speech]
+
+sampling_rate: 16000
+audioformat: *.wav
+audio_length: 10
+silence_length: 0.2
+total_hours: 20
+snr_lower: 0
+snr_upper: 40
+total_snrlevels: 5
+
+noise_dir: /scratch/c.sistc3/MS-SNSD/noise_train
+speech_dir: /scratch/c.sistc3/MS-SNSD/clean_train
+noise_types_excluded: None
--- a/noisyspeech_synthesizer.py
+++ b/noisyspeech_synthesizer.py
@ -0,0 +1,153 @@
+"""
+@author: chkarada
+"""
+import argparse
+import configparser as CP
+import glob
+import os
+
+import numpy as np
+
+from audiolib import audioread, audiowrite, snr_mixer
+
+
+def main(cfg):
+    """Synthesize a noisy-speech dataset as described by ``cfg``.
+
+    Clean utterances are drawn at random and concatenated, with
+    ``silence_length`` seconds of zeros between pieces, until the clip is
+    longer than ``audio_length`` seconds. Noise files are concatenated the
+    same way and trimmed to the clip length. Each clip is mixed at
+    ``total_snrlevels`` SNR values evenly spaced from ``snr_lower`` to
+    ``snr_upper``. The noisy, clean and noise signals are written to
+    ``NoisySpeech_training``, ``CleanSpeech_training`` and
+    ``Noise_training`` next to this script, until the noisy output adds up
+    to ``total_hours`` of samples.
+
+    Args:
+        cfg: Mapping of option name to string value with the keys
+            ``snr_lower``, ``snr_upper``, ``total_snrlevels``,
+            ``speech_dir``, ``noise_dir``, ``sampling_rate``,
+            ``audioformat``, ``total_hours``, ``audio_length``,
+            ``silence_length`` and ``noise_types_excluded``.
+
+    Raises:
+        FileNotFoundError: If the speech or noise directory does not exist.
+        ValueError: If no speech or noise files match ``audioformat``.
+    """
+    snr_lower = float(cfg["snr_lower"])
+    snr_upper = float(cfg["snr_upper"])
+    total_snrlevels = int(cfg["total_snrlevels"])
+
+    base_dir = os.path.dirname(__file__)
+
+    # Input directories default to folders next to this script unless the
+    # configuration names an explicit path. Raised explicitly because an
+    # ``assert`` would vanish under ``python -O``.
+    clean_dir = os.path.join(base_dir, "clean_train")
+    if cfg["speech_dir"] != "None":
+        clean_dir = cfg["speech_dir"]
+    if not os.path.exists(clean_dir):
+        raise FileNotFoundError(
+            f"Clean speech data is required: [{clean_dir}] does not exist"
+        )
+
+    noise_dir = os.path.join(base_dir, "noise_train")
+    if cfg["noise_dir"] != "None":
+        noise_dir = cfg["noise_dir"]
+    if not os.path.exists(noise_dir):
+        raise FileNotFoundError(
+            f"Noise data is required: [{noise_dir}] does not exist"
+        )
+
+    fs = float(cfg["sampling_rate"])
+    audioformat = cfg["audioformat"]
+    total_hours = float(cfg["total_hours"])
+    audio_length = float(cfg["audio_length"])
+    silence_length = float(cfg["silence_length"])
+
+    noisyspeech_dir = os.path.join(base_dir, "NoisySpeech_training")
+    clean_proc_dir = os.path.join(base_dir, "CleanSpeech_training")
+    noise_proc_dir = os.path.join(base_dir, "Noise_training")
+    for out_dir in (noisyspeech_dir, clean_proc_dir, noise_proc_dir):
+        os.makedirs(out_dir, exist_ok=True)
+
+    total_secs = total_hours * 60 * 60
+    total_samples = int(total_secs * fs)
+    audio_length = int(audio_length * fs)
+    SNR = np.linspace(snr_lower, snr_upper, total_snrlevels)
+
+    cleanfilenames = glob.glob(os.path.join(clean_dir, audioformat))
+    noisefilenames = glob.glob(os.path.join(noise_dir, audioformat))
+    if cfg["noise_types_excluded"] != "None":
+        # Strip whitespace so "Babble, AirConditioner" excludes both tags,
+        # and drop empty tags, which would otherwise match every file.
+        excluded = tuple(
+            tag.strip()
+            for tag in cfg["noise_types_excluded"].split(",")
+            if tag.strip()
+        )
+        noisefilenames = [
+            fn
+            for fn in noisefilenames
+            if not os.path.basename(fn).startswith(excluded)
+        ]
+
+    # Report empty inputs clearly instead of letting the random index draw
+    # below fail with an opaque "low >= high" error.
+    if not cleanfilenames:
+        raise ValueError(
+            f"No clean speech files matching [{audioformat}] in [{clean_dir}]"
+        )
+    if not noisefilenames:
+        raise ValueError(
+            f"No noise files matching [{audioformat}] in [{noise_dir}]"
+        )
+
+    filecounter = 0
+    num_samples = 0
+
+    while num_samples < total_samples:
+        # From here on ``fs`` is whatever audioread reports for the file
+        # read last. NOTE(review): files are concatenated without
+        # resampling, so this presumably assumes all inputs share one
+        # sample rate - confirm.
+        idx_s = np.random.randint(0, np.size(cleanfilenames))
+        clean, fs = audioread(cleanfilenames[idx_s])
+
+        # Append further utterances, separated by silence, until the clip
+        # is longer than the requested length.
+        while len(clean) <= audio_length:
+            idx_s = idx_s + 1
+            if idx_s >= np.size(cleanfilenames) - 1:
+                idx_s = np.random.randint(0, np.size(cleanfilenames))
+            newclean, fs = audioread(cleanfilenames[idx_s])
+            cleanconcat = np.append(clean, np.zeros(int(fs * silence_length)))
+            clean = np.append(cleanconcat, newclean)
+
+        idx_n = np.random.randint(0, np.size(noisefilenames))
+        noise, fs = audioread(noisefilenames[idx_n])
+
+        # Extend the noise the same way when it is shorter than the clip,
+        # then trim it to exactly the clip length.
+        if len(noise) < len(clean):
+            while len(noise) <= len(clean):
+                idx_n = idx_n + 1
+                if idx_n >= np.size(noisefilenames) - 1:
+                    idx_n = np.random.randint(0, np.size(noisefilenames))
+                newnoise, fs = audioread(noisefilenames[idx_n])
+                noiseconcat = np.append(
+                    noise, np.zeros(int(fs * silence_length))
+                )
+                noise = np.append(noiseconcat, newnoise)
+        noise = noise[0 : len(clean)]
+        filecounter = filecounter + 1
+
+        for snr in SNR:
+            clean_snr, noise_snr, noisy_snr = snr_mixer(
+                clean=clean, noise=noise, snr=snr
+            )
+            snr_tag = str(snr)
+            clip_tag = str(filecounter)
+            noisyfilename = (
+                "noisy" + clip_tag + "_SNRdb_" + snr_tag
+                + "_clnsp" + clip_tag + ".wav"
+            )
+            cleanfilename = "clnsp" + clip_tag + ".wav"
+            # The noise-only file also uses the "noisy" prefix; the name is
+            # kept as is so existing consumers of the output still find it.
+            noisefilename = "noisy" + clip_tag + "_SNRdb_" + snr_tag + ".wav"
+            noisypath = os.path.join(noisyspeech_dir, noisyfilename)
+            cleanpath = os.path.join(clean_proc_dir, cleanfilename)
+            noisepath = os.path.join(noise_proc_dir, noisefilename)
+            audiowrite(noisy_snr, fs, noisypath, norm=False)
+            audiowrite(clean_snr, fs, cleanpath, norm=False)
+            audiowrite(noise_snr, fs, noisepath, norm=False)
+            num_samples = num_samples + len(noisy_snr)
+
+
+if __name__ == "__main__":
+    # Command line: which configuration file to load and which of its
+    # sections to hand to main().
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--cfg",
+        help="Read noisyspeech_synthesizer.cfg for all the details",
+        default="noisyspeech_synthesizer.cfg",
+    )
+    cli.add_argument("--cfg_str", default="noisy_speech", type=str)
+    options = cli.parse_args()
+
+    # The configuration path is taken relative to this script's directory.
+    script_dir = os.path.dirname(__file__)
+    config_path = os.path.join(script_dir, options.cfg)
+    assert os.path.exists(
+        config_path
+    ), f"No configuration file as [{config_path}]"
+
+    config = CP.ConfigParser()
+    config._interpolation = CP.ExtendedInterpolation()
+    config.read(config_path)
+
+    # Pass the parser's stored mapping for the selected section to main().
+    selected_section = config._sections[options.cfg_str]
+    main(selected_section)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -2,7 +2,6 @@
 line-length = 80
 target-version = ['py38']
 exclude = '''
-
 (
  /(
      \.eggs         # exclude a few common directories in the
@ -10,6 +9,9 @@ exclude = '''
    | \.mypy_cache
    | \.tox
    | \.venv
+    | noisyspeech_synthesizer.py
+    | noisyspeech_synthesizer.cfg
+
  )/
 )
 '''
--- a/requirements.txt
+++ b/requirements.txt
@ -1,18 +1,19 @@
-boto3>=1.24.86
-huggingface-hub>=0.10.0
-hydra-core>=1.2.0
-joblib>=1.2.0
-librosa>=0.9.2
-mlflow>=1.29.0
+# torch>=1.12.1
+# torchaudio>=0.12.1
+# tqdm>=4.64.1
+configparser
+# boto3>=1.24.86
+# huggingface-hub>=0.10.0
+# hydra-core>=1.2.0
+# joblib>=1.2.0
+# librosa>=0.9.2
+# mlflow>=1.29.0
 numpy>=1.23.3
-pesq==0.0.4
-protobuf>=3.19.6
-pystoi==0.3.3
-pytest-lazy-fixture>=0.6.3
-pytorch-lightning>=1.7.7
-scikit-learn>=1.1.2
+# pesq==0.0.4
+# protobuf>=3.19.6
+# pystoi==0.3.3
+# pytest-lazy-fixture>=0.6.3
+# pytorch-lightning>=1.7.7
+# scikit-learn>=1.1.2
 scipy>=1.9.1
 soundfile>=0.11.0
-torch>=1.12.1
-torchaudio>=0.12.1
-tqdm>=4.64.1