import os
from pathlib import Path
from typing import Optional, Union

import librosa
import numpy as np
import torch
import torchaudio


class Audio:
    """
    Audio utilities: read a waveform, downmix to mono, and resample.

    Parameters:
        sampling_rate : int, defaults to 16000 (16 kHz)
            target sampling rate of the returned audio
        mono : bool, defaults to True
            downmix multi-channel audio to a single channel if set to True
        return_tensor : bool, defaults to True
            return a torch tensor if set to True, else a numpy ndarray
    """

    def __init__(
        self, sampling_rate: int = 16000, mono: bool = True, return_tensor: bool = True
    ) -> None:
        self.sampling_rate = sampling_rate
        self.mono = mono
        self.return_tensor = return_tensor
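
    # Illustrative (not in the original): Audio(sampling_rate=22050, mono=False,
    # return_tensor=False) would yield numpy output at 22.05 kHz with the
    # channel layout preserved.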

    def __call__(
        self,
        audio: Union[str, Path, np.ndarray, torch.Tensor],
        sampling_rate: Optional[int] = None,
        offset: Optional[float] = None,
        duration: Optional[float] = None,
    ):
        """
        Read and process a single input audio.

        Parameters:
            audio : str, Path, np.ndarray, or torch.Tensor
                path to an audio file, or an in-memory waveform
            sampling_rate : int, optional
                sampling rate of the input audio; in-memory waveforms are
                resampled from this rate to self.sampling_rate
            offset : float (seconds), optional
                position from which the audio is read; reads from the
                beginning if not given
            duration : float (seconds), optional
                amount of audio to read; reads the full audio starting
                from offset if not given
        """
        if isinstance(audio, (str, Path)):
            if os.path.exists(audio):
                # librosa loads at the requested rate, or the file's native
                # rate when sr is None; it reports the rate actually used
                audio, sampling_rate = librosa.load(
                    audio,
                    sr=sampling_rate,
                    mono=False,
                    offset=offset or 0.0,
                    duration=duration,
                )
                if len(audio.shape) == 1:
                    audio = audio.reshape(1, -1)
            else:
                raise FileNotFoundError(f"File {audio} does not exist")
        elif isinstance(audio, (np.ndarray, torch.Tensor)):
            if len(audio.shape) == 1:
                audio = audio.reshape(1, -1)
        else:
            raise ValueError(
                "audio should be a filepath, numpy ndarray, or torch tensor"
            )

        if self.mono:
            audio = self.convert_mono(audio)

        # without a known input rate, in-memory audio is returned unresampled
        if sampling_rate:
            audio = self.__class__.resample_audio(
                audio, sampling_rate, self.sampling_rate
            )
        if self.return_tensor:
            return torch.as_tensor(audio)
        return audio
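
    # Illustrative usage (the path is hypothetical):
    #
    #   audio = Audio()("clip.wav", offset=1.0, duration=2.5)
    #
    # reads 2.5 seconds starting 1.0 s into the file, downmixes to mono,
    # resamples to 16 kHz, and returns a (1, num_samples) torch tensor.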

    @staticmethod
    def convert_mono(audio: Union[np.ndarray, torch.Tensor]):
        """
        Downmix the input audio to mono by averaging across channels.

        Parameters:
            audio : np.ndarray or torch.Tensor
                waveform of shape (num_channels, num_samples)
        """
        if len(audio.shape) > 2:
            # allow a leading batch dimension of size 1, e.g. (1, C, N)
            assert (
                audio.shape[0] == 1
            ), "convert_mono only accepts a single waveform"
            audio = audio.reshape(audio.shape[1], audio.shape[2])

        assert (
            audio.shape[1] > audio.shape[0]
        ), f"expected input format (num_channels, num_samples), got {audio.shape}"
        num_channels, num_samples = audio.shape
        if num_channels > 1:
            # average across channels, keeping the (1, num_samples) layout
            return audio.mean(axis=0).reshape(1, num_samples)
        return audio
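
    # For instance (illustrative): a stereo array of shape (2, 16000) becomes
    # a (1, 16000) array holding the per-sample mean of the two channels.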

    @staticmethod
    def resample_audio(
        audio: Union[np.ndarray, torch.Tensor], sr: int, target_sr: int
    ):
        """
        Resample audio to the desired sampling rate.

        Parameters:
            audio : np.ndarray or torch.Tensor
                audio waveform
            sr : int
                current sampling rate
            target_sr : int
                target sampling rate
        """
        if sr != target_sr:
            if isinstance(audio, np.ndarray):
                audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            elif isinstance(audio, torch.Tensor):
                audio = torchaudio.functional.resample(
                    audio, orig_freq=sr, new_freq=target_sr
                )
            else:
                raise ValueError(
                    "Input should be either a numpy array or a torch tensor"
                )

        return audio
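

# Minimal usage sketch (not part of the original module): exercises the class
# on a synthetic waveform so it runs without an audio file on disk.
if __name__ == "__main__":
    # one second of 2-channel noise at 8 kHz
    stereo = np.random.randn(2, 8000).astype(np.float32)

    processor = Audio(sampling_rate=16000, mono=True, return_tensor=True)
    waveform = processor(stereo, sampling_rate=8000)

    # downmixed to one channel and resampled 8 kHz -> 16 kHz
    print(waveform.shape)  # torch.Size([1, 16000])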