import os
from pathlib import Path
from typing import Optional, Union

import librosa
import numpy as np
import torch
import torchaudio


class Audio:
    """
    Audio utils

    parameters:
        sampling_rate : int, defaults to 16000 (16 kHz)
            target audio sampling rate
        mono: bool, defaults to True
            downmixes multi-channel audio to a single channel if set to True
        return_tensor: bool, defaults to True
            returns a torch tensor if set to True, else a numpy ndarray
    """

    def __init__(
        self, sampling_rate: int = 16000, mono: bool = True, return_tensor: bool = True
    ) -> None:
        self.sampling_rate = sampling_rate
        self.mono = mono
        self.return_tensor = return_tensor

    def __call__(
        self,
        audio: Union[str, Path, np.ndarray, torch.Tensor],
        sampling_rate: Optional[int] = None,
        offset: Optional[float] = None,
        duration: Optional[float] = None,
    ) -> Union[np.ndarray, torch.Tensor]:
        """
        read and process input audio

        parameters:
            audio: path to an audio file, numpy array, or torch tensor
                single input audio
            sampling_rate : int, optional
                sampling rate of the audio input; inferred from the file
                when a path is given
            offset: float (seconds), optional
                offset from which the audio must be read; reads from the
                beginning if unused
            duration: float (seconds), optional
                read duration; reads the full audio starting from offset
                if unused
        """
        if isinstance(audio, (str, Path)):
            if os.path.exists(audio):
                # librosa loads at the file's native rate when sr is None
                # and returns the effective sampling rate with the waveform
                audio, sampling_rate = librosa.load(
                    audio,
                    sr=sampling_rate,
                    mono=False,
                    offset=offset or 0.0,  # librosa expects a numeric offset
                    duration=duration,
                )
                if len(audio.shape) == 1:
                    audio = audio.reshape(1, -1)
            else:
                raise FileNotFoundError(f"File {audio} does not exist")
        elif isinstance(audio, (np.ndarray, torch.Tensor)):
            if len(audio.shape) == 1:
                audio = audio.reshape(1, -1)
        else:
            raise ValueError(
                "audio should be a filepath, numpy ndarray, or torch tensor"
            )

        if self.mono:
            audio = self.convert_mono(audio)

        if sampling_rate:
            # resample from the input's rate to the configured target rate
            audio = self.__class__.resample_audio(
                audio, sampling_rate, self.sampling_rate
            )

        if self.return_tensor:
            return torch.as_tensor(audio)
        return audio

    @staticmethod
    def convert_mono(audio: Union[np.ndarray, torch.Tensor]):
        """
        convert input audio to mono (a single channel)

        parameters:
            audio: np.ndarray or torch.Tensor
        """
        if len(audio.shape) > 2:
            assert (
                audio.shape[0] == 1
            ), "convert_mono only accepts a single waveform"
            audio = audio.reshape(audio.shape[1], audio.shape[2])

        # in (num_channels, num_samples) layout the sample dimension
        # should be the larger one
        assert (
            audio.shape[1] > audio.shape[0]
        ), f"expected input format (num_channels, num_samples) got {audio.shape}"
        num_channels, num_samples = audio.shape
        if num_channels > 1:
            # average all channels into one mono channel
            return audio.mean(axis=0).reshape(1, num_samples)
        return audio

    @staticmethod
    def resample_audio(
        audio: Union[np.ndarray, torch.Tensor], sr: int, target_sr: int
    ):
        """
        resample audio to the desired sampling rate

        parameters:
            audio : np.ndarray or torch.Tensor
                audio waveform
            sr : int
                current sampling rate
            target_sr : int
                target sampling rate
        """
        if sr != target_sr:
            if isinstance(audio, np.ndarray):
                audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            elif isinstance(audio, torch.Tensor):
                audio = torchaudio.functional.resample(
                    audio, orig_freq=sr, new_freq=target_sr
                )
            else:
                raise ValueError(
                    "Input should be either a numpy array or a torch tensor"
                )

        return audio
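
# Usage sketch: a minimal demo of the Audio helper above. "sample.wav" is a
# hypothetical path, and the rates below are illustrative assumptions rather
# than values required by the class.
if __name__ == "__main__":
    audio_utils = Audio(sampling_rate=16000, mono=True, return_tensor=True)

    # load from disk: librosa reports the file's native rate, after which
    # the waveform is downmixed to mono and resampled to 16 kHz
    waveform = audio_utils("sample.wav")
    print(waveform.shape)  # expected layout: (1, num_samples)

    # feed an in-memory array: the caller supplies the current rate and
    # the helper resamples to the configured 16 kHz target
    stereo = np.random.randn(2, 44100).astype(np.float32)  # 1 s of stereo noise
    waveform = audio_utils(stereo, sampling_rate=44100)
    print(waveform.shape)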