# -*- coding: utf-8 -*-
"""
Audio helpers: read, write and mix clean speech with noise at a target SNR.

Created on Wed Jun 26 15:54:05 2019

@author: chkarada
"""
import os

import numpy as np

# Level, in dB relative to full scale, that signals are RMS-normalized to.
_TARGET_LEVEL_DB = -25


def _rms(x):
    """Return the root-mean-square amplitude of ``x`` (0.0 if empty)."""
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(x**2)))


def _normalize(x):
    """Scale ``x`` so that its RMS sits at ``_TARGET_LEVEL_DB`` dBFS.

    A silent (all-zero) or empty signal has no level to scale, so it is
    returned unchanged instead of being turned into NaN/inf by a division
    by zero.
    """
    rms = _rms(x)
    if rms == 0:
        return x
    # Amplitude ratios convert from dB with a divisor of 20 (not 10).
    return x * (10 ** (_TARGET_LEVEL_DB / 20) / rms)


# Function to read audio
def audioread(path, norm=True, start=0, stop=None):
    """Read an audio file and return it as a mono signal.

    Args:
        path: Path of the audio file to read.
        norm: When true, scale the signal to an RMS level of -25 dBFS.
        start: Passed to ``soundfile.read`` as ``start``.
        stop: Passed to ``soundfile.read`` as ``stop``.

    Returns:
        Tuple ``(x, sr)``: the mono samples and the sample rate reported
        by ``soundfile.read``. Multi-channel input is averaged over its
        channels.

    Raises:
        ValueError: If ``path`` does not exist.
        RuntimeError: If ``soundfile`` cannot decode the file.
    """
    path = os.path.abspath(path)
    if not os.path.exists(path):
        raise ValueError("[{}] does not exist!".format(path))

    # Imported lazily so the numpy-only helpers of this module (snr_mixer)
    # remain importable on machines without libsndfile.
    import soundfile as sf

    try:
        x, sr = sf.read(path, start=start, stop=stop)
    except RuntimeError as err:  # e.g. sph pcm-embedded shortened v2
        # The original only printed a warning and then failed on the
        # unbound ``x``; fail here with a message that names the file.
        raise RuntimeError(
            "Audio type not supported: [{}]".format(path)
        ) from err

    if x.ndim > 1:  # multi-channel: (frames, channels) -> mono
        x = x.mean(axis=1)
    if norm:
        x = _normalize(x)
    return x, sr


# Function to write audio
def audiowrite(data, fs, destpath, norm=False):
    """Write ``data`` to ``destpath``, creating the directory if needed.

    Args:
        data: Samples to write (array-like).
        fs: Sample rate passed to ``soundfile.write``.
        destpath: Destination file path.
        norm: When true, scale the signal to an RMS level of -25 dBFS
            before writing.

    If the absolute peak of the (optionally normalized) signal is >= 1,
    the signal is divided by that peak before it is written.
    """
    import soundfile as sf

    data = np.asarray(data)
    if norm:
        data = _normalize(data)

    # np.max/np.abs instead of the builtins: ``max(abs(data), eps)``
    # compared an array with a float and raised ValueError.
    peak = np.max(np.abs(data)) if data.size else 0.0
    if peak >= 1:
        data = data / peak

    destpath = os.path.abspath(destpath)
    os.makedirs(os.path.dirname(destpath), exist_ok=True)

    sf.write(destpath, data, fs)
    return


# Function to mix clean speech and noise at various SNR levels
def snr_mixer(clean, noise, snr):
    """Mix ``clean`` and ``noise`` so that the mixture has the given SNR.

    Both inputs are first RMS-normalized to -25 dBFS; the noise is then
    scaled so that ``20 * log10(rms(clean) / rms(noise)) == snr``.

    Args:
        clean: Clean speech samples.
        noise: Noise samples, same length as ``clean``.
        snr: Target signal-to-noise ratio in dB.

    Returns:
        Tuple ``(clean, noisenewlevel, noisyspeech)``: the normalized
        clean signal, the rescaled noise, and their sum.
    """
    # Normalizing to -25 dB FS
    clean = _normalize(np.asarray(clean))
    noise = _normalize(np.asarray(noise))
    rmsclean = _rms(clean)
    rmsnoise = _rms(noise)

    # Set the noise level for a given SNR. SNR in dB is an amplitude
    # ratio on a 20*log10 scale, so the scalar is used directly; taking
    # its square root (as before) yields only half the requested SNR.
    if rmsnoise > 0:
        noisescalar = rmsclean / (10 ** (snr / 20)) / rmsnoise
    else:
        noisescalar = 0.0  # silent noise: nothing to scale
    noisenewlevel = noise * noisescalar
    noisyspeech = clean + noisenewlevel
    return clean, noisenewlevel, noisyspeech
"""
Generate a noisy-speech training set by mixing clean speech with noise.

@author: chkarada
"""
import argparse
import configparser as CP
import glob
import os

import numpy as np

from audiolib import audioread, audiowrite, snr_mixer


def _input_dir(configured, default_name, error_message):
    """Return the configured input directory, or the default beside this file.

    The literal string "None" in the config selects ``default_name`` in the
    directory of this script.
    """
    if configured != "None":
        path = configured
    else:
        path = os.path.join(os.path.dirname(__file__), default_name)
    if not os.path.exists(path):
        # Raised explicitly: an ``assert`` would vanish under ``python -O``.
        raise FileNotFoundError("{} (looked in [{}])".format(error_message, path))
    return path


def _output_dir(name):
    """Create (if needed) and return the output directory beside this file."""
    path = os.path.join(os.path.dirname(__file__), name)
    os.makedirs(path, exist_ok=True)
    return path


def _list_noise_files(noise_dir, audioformat, excluded):
    """Return the noise files, minus those whose basename starts with a tag.

    ``excluded`` is the raw config value: "None" or a comma-separated list
    of tags. Tags are stripped so "Babble, AirConditioner" works as written.
    """
    filenames = glob.glob(os.path.join(noise_dir, audioformat))
    if excluded == "None":
        return filenames
    tags = tuple(tag.strip() for tag in excluded.split(",") if tag.strip())
    return [
        fn for fn in filenames if not os.path.basename(fn).startswith(tags)
    ]


def _extend_to_length(signal, fs, filenames, idx, min_length, silence_length):
    """Append further files to ``signal`` until it has ``min_length`` samples.

    Files are taken in list order starting after ``idx`` (restarting at a
    random index once the end of the list is passed), each preceded by
    ``silence_length`` seconds of zeros. Returns the extended signal and
    the sample rate of the last file read.
    """
    while len(signal) < min_length:
        idx += 1
        if idx >= len(filenames):
            idx = np.random.randint(0, len(filenames))
        extra, fs = audioread(filenames[idx])
        silence = np.zeros(int(fs * silence_length))
        signal = np.concatenate((signal, silence, extra))
    return signal, fs


def main(cfg):
    """Synthesize noisy speech clips as described by ``cfg``.

    ``cfg`` is a mapping of string values with these keys (descriptions
    taken from noisyspeech_synthesizer.cfg):

    - sampling_rate: sampling rate in Hz.
    - audioformat: glob pattern of the input files, e.g. ``*.wav``.
    - audio_length: minimum length in seconds of each generated clip.
    - silence_length: seconds of silence inserted between utterances.
    - total_hours: total number of hours of data to generate.
    - snr_lower / snr_upper: bounds of the SNR range in dB.
    - total_snrlevels: number of SNR levels between the two bounds.
    - noise_dir / speech_dir: input directories, or "None" to use
      ``noise_train`` / ``clean_train`` beside this script.
    - noise_types_excluded: comma-separated filename prefixes of noise
      files to skip, or "None".

    For every generated clip and every SNR level, the noisy mix, the clean
    signal and the scaled noise are written to ``NoisySpeech_training``,
    ``CleanSpeech_training`` and ``Noise_training`` beside this script.

    Raises:
        FileNotFoundError: If an input directory is missing or contains no
            file matching ``audioformat``.
        ValueError: If ``total_snrlevels`` is smaller than 1.
    """
    snr_lower = float(cfg["snr_lower"])
    snr_upper = float(cfg["snr_upper"])
    total_snrlevels = int(cfg["total_snrlevels"])
    if total_snrlevels < 1:
        # With no SNR level nothing is ever written and the generation
        # loop below would never terminate.
        raise ValueError("total_snrlevels must be at least 1")

    clean_dir = _input_dir(
        cfg["speech_dir"], "clean_train", "Clean speech data is required"
    )
    noise_dir = _input_dir(
        cfg["noise_dir"], "noise_train", "Noise data is required"
    )

    sampling_rate = float(cfg["sampling_rate"])
    audioformat = cfg["audioformat"]
    total_hours = float(cfg["total_hours"])
    silence_length = float(cfg["silence_length"])
    # Clip length and overall budget, both in samples.
    audio_length = int(float(cfg["audio_length"]) * sampling_rate)
    total_samples = int(total_hours * 60 * 60 * sampling_rate)

    noisyspeech_dir = _output_dir("NoisySpeech_training")
    clean_proc_dir = _output_dir("CleanSpeech_training")
    noise_proc_dir = _output_dir("Noise_training")

    snr_levels = np.linspace(snr_lower, snr_upper, total_snrlevels)

    cleanfilenames = glob.glob(os.path.join(clean_dir, audioformat))
    if not cleanfilenames:
        raise FileNotFoundError(
            "No clean speech files matching [{}] in [{}]".format(
                audioformat, clean_dir
            )
        )
    noisefilenames = _list_noise_files(
        noise_dir, audioformat, cfg["noise_types_excluded"]
    )
    if not noisefilenames:
        raise FileNotFoundError(
            "No noise files matching [{}] in [{}]".format(
                audioformat, noise_dir
            )
        )

    filecounter = 0
    num_samples = 0

    while num_samples < total_samples:
        # Clean clip: strictly longer than audio_length samples.
        idx_s = np.random.randint(0, len(cleanfilenames))
        clean, fs = audioread(cleanfilenames[idx_s])
        clean, fs = _extend_to_length(
            clean, fs, cleanfilenames, idx_s, audio_length + 1, silence_length
        )

        # Noise clip: at least as long as the clean clip, then cut to size.
        idx_n = np.random.randint(0, len(noisefilenames))
        noise, fs = audioread(noisefilenames[idx_n])
        noise, fs = _extend_to_length(
            noise, fs, noisefilenames, idx_n, len(clean), silence_length
        )
        noise = noise[: len(clean)]
        filecounter += 1

        for snr in snr_levels:
            clean_snr, noise_snr, noisy_snr = snr_mixer(
                clean=clean, noise=noise, snr=snr
            )
            clip_id = str(filecounter)
            noisyfilename = (
                "noisy" + clip_id + "_SNRdb_" + str(snr)
                + "_clnsp" + clip_id + ".wav"
            )
            cleanfilename = "clnsp" + clip_id + ".wav"
            noisefilename = "noisy" + clip_id + "_SNRdb_" + str(snr) + ".wav"
            noisypath = os.path.join(noisyspeech_dir, noisyfilename)
            cleanpath = os.path.join(clean_proc_dir, cleanfilename)
            noisepath = os.path.join(noise_proc_dir, noisefilename)
            audiowrite(noisy_snr, fs, noisypath, norm=False)
            audiowrite(clean_snr, fs, cleanpath, norm=False)
            audiowrite(noise_snr, fs, noisepath, norm=False)
            # The budget counts every SNR variant of a clip.
            num_samples += len(noisy_snr)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Configurations: read noisyspeech_synthesizer.cfg
    parser.add_argument(
        "--cfg",
        default="noisyspeech_synthesizer.cfg",
        help="Read noisyspeech_synthesizer.cfg for all the details",
    )
    parser.add_argument("--cfg_str", type=str, default="noisy_speech")
    args = parser.parse_args()

    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    if not os.path.exists(cfgpath):
        parser.error(f"No configuration file as [{cfgpath}]")
    cfg = CP.ConfigParser(interpolation=CP.ExtendedInterpolation())
    cfg.read(cfgpath)
    if not cfg.has_section(args.cfg_str):
        parser.error(f"No section [{args.cfg_str}] in [{cfgpath}]")

    # raw=True hands main() the values exactly as written in the file,
    # through the public API instead of the private ``_sections`` dict.
    main(dict(cfg.items(args.cfg_str, raw=True)))
-librosa>=0.9.2 -mlflow>=1.29.0 +# torch>=1.12.1 +# torchaudio>=0.12.1 +# tqdm>=4.64.1 +configparser +# boto3>=1.24.86 +# huggingface-hub>=0.10.0 +# hydra-core>=1.2.0 +# joblib>=1.2.0 +# librosa>=0.9.2 +# mlflow>=1.29.0 numpy>=1.23.3 -pesq==0.0.4 -protobuf>=3.19.6 -pystoi==0.3.3 -pytest-lazy-fixture>=0.6.3 -pytorch-lightning>=1.7.7 -scikit-learn>=1.1.2 +# pesq==0.0.4 +# protobuf>=3.19.6 +# pystoi==0.3.3 +# pytest-lazy-fixture>=0.6.3 +# pytorch-lightning>=1.7.7 +# scikit-learn>=1.1.2 scipy>=1.9.1 soundfile>=0.11.0 -torch>=1.12.1 -torchaudio>=0.12.1 -tqdm>=4.64.1