From 22b017daeab361ced6b5044baa10f2f19abdb8c9 Mon Sep 17 00:00:00 2001 From: shahules786 Date: Fri, 26 Aug 2022 16:56:11 +0530 Subject: [PATCH] process data files --- enhancer/utils/fileprocessor.py | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 enhancer/utils/fileprocessor.py diff --git a/enhancer/utils/fileprocessor.py b/enhancer/utils/fileprocessor.py new file mode 100644 index 0000000..e7cd033 --- /dev/null +++ b/enhancer/utils/fileprocessor.py @@ -0,0 +1,96 @@ +import glob +import os +import numpy as np +from scipy.io import wavfile + +class Fileprocessor: + + def __init__( + self, + clean_dir, + noisy_dir, + sr = 16000, + matching_function = None + ): + self.clean_dir = clean_dir + self.noisy_dir = noisy_dir + self.sr = sr + self.matching_function = matching_function + + @classmethod + def from_name(cls, + name:str, + clean_dir, + noisy_dir, + sr, + matching_function=None + ): + + if name.lower() == "vctk": + return cls(clean_dir,noisy_dir,sr, Fileprocessor.match_vtck) + elif name.lower() == "dns-2020": + return cls(clean_dir,noisy_dir,sr, Fileprocessor.match_dns2020) + else: + return cls(clean_dir,noisy_dir,sr, matching_function) + + def prepare_matching_dict(self): + + if self.matching_function is None: + raise ValueError("Not a valid matching function") + + return self.matching_function(self.clean_dir,self.noisy_dir,self.sr) + + @staticmethod + def match_vtck(clean_path,noisy_path,sr): + + matching_wavfiles = dict() + clean_filenames = [file.split('/')[-1] for file in glob.glob(os.path.join(clean_path,"*.wav"))] + noisy_filenames = [file.split('/')[-1] for file in glob.glob(os.path.join(noisy_path,"*.wav"))] + common_filenames = np.intersect1d(noisy_filenames,clean_filenames) + + for file_name in common_filenames: + + sr_clean, clean_file = wavfile.read(os.path.join(clean_path,file_name)) + sr_noisy, noisy_file = wavfile.read(os.path.join(noisy_path,file_name)) + if ((clean_file.shape[-1]==noisy_file.shape[-1]) and + (sr_clean==sr) and + (sr_noisy==sr)): + matching_wavfiles.update( + {os.path.join(clean_path,file_name):{"noisy":os.path.join(noisy_path,file_name), + "duration":clean_file.shape[-1]/sr} + } + ) + return matching_wavfiles + + @staticmethod + def match_dns2020(clean_path,noisy_path,sr): + + matching_wavfiles = dict() + clean_filenames = [file.split('/')[-1] for file in glob.glob(os.path.join(clean_path,"*.wav"))] + for clean_file in clean_filenames: + noisy_filenames = glob.glob(os.path.join(noisy_path,f"*_{clean_file}.wav")) + for noisy_file in noisy_filenames: + + sr_clean, clean_file = wavfile.read(os.path.join(clean_path,clean_file)) + sr_noisy, noisy_file = wavfile.read(noisy_file) + if ((clean_file.shape[-1]==noisy_file.shape[-1]) and + (sr_clean==sr) and + (sr_noisy==sr)): + matching_wavfiles.update( + {os.path.join(clean_path,clean_file):{"noisy":noisy_file, + "duration":clean_file.shape[-1]/sr} + } + ) + + return matching_wavfiles + + + + + + + + + + +