Source code for emonet.modules

"""
Various NN modules.
"""
import logging
from typing import Callable, Optional, Tuple

import torch
from speechbrain.lobes.augment import TimeDomainSpecAugment
from speechbrain.processing.speech_augmentation import (
    AddNoise,
    DropChunk,
    DropFreq,
    SpeedPerturb,
)
from torch import nn as nn

from emonet import SAMPLE_RATE
from emonet.utils import get_random_segment


[docs]def get_vad(ret="model"):
    """Get pretrained Silero VAD model and/or utilities."""
    mod, utils = torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        force_reload=False,
        onnx=False,
    )
    obj = {"model": mod, "utils": utils, "both": (mod, utils)}
    try:
        return obj[ret]
    except KeyError:
        raise ValueError("Parameter `ret` must be one of {`model`, `utils`, `both`}")


[docs]class VadChunk(nn.Module):
    """Concatenate VAD chunks from signal."""

[docs]    def __init__(
        self, model: nn.Module, utils: Tuple[Callable], sample_rate: int = SAMPLE_RATE
    ):
        """
        Constructor method.

        Parameters
        ----------
        model: nn.Module
            Pre-trained VAD model.
        utils: Tuple[Callable]
            Tuple of utility functions.
        sample_rate: int
            Audio sample rate; default 16,000Hz
        """
        super().__init__()
        self.model = model
        self.get_speech_timestamps = utils[0]
        self.collect_chunks = utils[-1]
        self.sample_rate = sample_rate

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through VAD model layer."""
        speech = self.get_speech_timestamps(
            x, self.model, sampling_rate=self.sample_rate
        )
        try:
            return self.collect_chunks(speech, x)
        except NotImplementedError:
            print("No voice activity detected.")
            return torch.zeros(1)


[docs]class RandomSegment(nn.Module):
    """Get a random segment from sample."""

[docs]    def __init__(self, seconds: int, sample_rate: int = SAMPLE_RATE):
        """
        Constructor method.

        Parameters
        ----------
        seconds: int
            Duration(seconds) of random segment to retrieve.
        sample_rate: int
            Audio sample rate; default 16,000Hz
        """
        self.seconds = seconds
        self.sample_rate = sample_rate
        super().__init__()

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through random segment model layer."""
        if x.shape[-1] <= self.seconds * self.sample_rate:
            return x  # can't sample if it's not big enough
        return get_random_segment(x, seconds=self.seconds, sample_rate=self.sample_rate)


[docs]class SBAugment(TimeDomainSpecAugment):
    """Do multiple Speech Brain augmentations."""

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through random segment model layer."""
        x = x[
            :, :, None
        ]  # speechbrain expects tensor shape (batch, timesteps, channels)
        lengths = torch.ones(x.shape[0])
        x = self.speed_perturb(x)
        x = self.drop_freq(x)
        x = self.drop_chunk(x, lengths)
        return x.squeeze(-1)  # drop last dim


[docs]class TimeDistributed(nn.Module):
    """
    Apply a nn.Module across a time dimension.

    Expects a batch first tensor with shape (batch, time, channel, height, width), where time is the
    chunked spectrogram across time steps.

    The Pytorch-Forecasting version (https://pytorch-forecasting.readthedocs.io/en/stable/_modules/pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.html)
    does not properly apply to images with shape (batch, channel, height, width) or (channel, height, width)/

    Source: https://github.com/Data-Science-kosta/Speech-Emotion-Classification-with-PyTorch/blob/master/notebooks/stacked_cnn_lstm.ipynb
    """

[docs]    def __init__(self, module: nn.Module):
        """
        Constructor method.

        Parameters
        ----------
        module: nn.Module
            Module to create time-distributed module/block from.
        """
        super().__init__()
        self.module = module

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through time-distributed module layer."""
        if len(x.size()) <= 2:
            return self.module(x)
        # squash samples and timesteps into a single axis
        elif len(x.size()) == 3:  # (samples, timesteps, inp1)
            x_reshape = x.contiguous().view(
                -1, x.size(2)
            )  # (samples * timesteps, inp1)
        elif len(x.size()) == 4:  # (samples,timesteps,inp1,inp2)
            x_reshape = x.contiguous().view(
                -1, x.size(2), x.size(3)
            )  # (samples*timesteps,inp1,inp2)
        else:  # (samples,timesteps,inp1,inp2,inp3)
            x_reshape = x.contiguous().view(
                -1, x.size(2), x.size(3), x.size(4)
            )  # (samples*timesteps,inp1,inp2,inp3)

        y = self.module(x_reshape)

        # we have to reshape Y
        if len(x.size()) == 3:
            y = y.contiguous().view(
                x.size(0), -1, y.size(1)
            )  # (samples, timesteps, out1)
        elif len(x.size()) == 4:
            y = y.contiguous().view(
                x.size(0), -1, y.size(1), y.size(2)
            )  # (samples, timesteps, out1,out2)
        else:
            y = y.contiguous().view(
                x.size(0), -1, y.size(1), y.size(2), y.size(3)
            )  # (samples, timesteps, out1,out2, out3)
        return y


[docs]class ChunkDropper(nn.Module):
    """Drop time chunks from signal."""

[docs]    def __init__(self, bs: int, p: float = 1.0):
        """
        Constructor method.

        Parameters
        ----------
        bs: int
            Batch size used.
        p: float
            Probability of dropping chunk of signal.
        """
        super().__init__()
        self.bs = bs
        self.drop = DropChunk(drop_prob=p)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through chunk-dropping augmentation layer."""
        x = x.cuda()
        return self.drop(x, torch.ones(self.bs).cuda())


[docs]class FreqDropper(nn.Module):
    """Drop frequencies from signal."""

[docs]    def __init__(self, p: float = 1.0):
        """
        Constructor method.

        Parameters
        ----------
        p: float
            Probability of dropping chunk of signal.
        """
        super().__init__()
        self.drop = DropFreq(drop_prob=p)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through frequency-dropping augmentation layer."""
        x = x.cuda()
        return self.drop(x)


[docs]class Perturber(nn.Module):
    """Perturb signal speed."""

[docs]    def __init__(self, p: float = 1.0):
        """
        Constructor method.

        Parameters
        ----------
        p: float
            Probability of dropping frequency from signal.
        """
        super().__init__()
        self.perturb = SpeedPerturb(SAMPLE_RATE, perturb_prob=p)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through speed perturbation augmentation layer."""
        x = x.cuda()
        return self.perturb(x)


[docs]class WhiteNoise(nn.Module):
    """Add white noise to signal."""

[docs]    def __init__(self, bs: int, snr_low: int = 15, snr_high: int = 30):
        """
        Constructor method.

        Parameters
        ----------
        bs: int
            Batch size.
        snr_low: int
            Signal-to-noise ratio floor.
        snr_high: int
            Signal-to-noise ratio ceiling.
        """
        super().__init__()
        self.bs = bs
        self.noise = AddNoise(snr_low=snr_low, snr_high=snr_high)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through white noise augmentation layer."""
        return self.noise(x, torch.ones(self.bs))


[docs]class LSTMOutput(nn.Module):
    """Grabs only output of LSTM module; discards hidden state stuff."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.lstm = nn.LSTM(*args, **kwargs)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through LSTM layer."""
        output, _ = self.lstm(x)
        return output


[docs]class PrintShape(nn.Module):
    """Print layer shape for debugging."""

    def __init__(self):
        super().__init__()

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Get layer shape."""
        print(x.shape)
        return x


[docs]class LogMessage(nn.Module):
    """Log for debugging."""

[docs]    def __init__(self, logger: logging.Logger, msg: Optional[str] = None):
        """
        Constructor method.

        Parameters
        ----------
        logger: logging.Logger
            Program logger.
        msg: Optional[str]
            Optional message to send.
        """
        super().__init__()
        self.logger = logger
        self.msg = msg

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Log message."""
        self.logger.info(f"{self.msg} - Shape: {x.shape}")
        return x


[docs]class ChunkSpectrogram(nn.Module):
    """Separate a spectrogram into n chunks."""

[docs]    def __init__(self, n_chunks: int = 6):
        """
        Constructor method.

        Parameters
        ----------
        n_chunks: int
            Number of chunks to split spectrogram into.
        """
        super().__init__()
        self.n_chunks = n_chunks

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split spectrogram `x` ubti separate chunks,"""
        chunks = torch.chunk(
            x, self.n_chunks, -1
        )  # in dims (batch, channel, height, width)
        return torch.stack(chunks, 1)  # out dims (batch, time, channel, height, width)


[docs]class GRUOutput(nn.Module):
    """Grab only output of GRU module; discard hidden state stuff."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.gru = nn.GRU(*args, **kwargs)

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through GRU layer."""
        output, _ = self.gru(x)
        return output


[docs]class ReshapeOutput(nn.Module):
    """Reshape LSTM output to remove timestep dimension."""

    def __init__(self, method: str = "mean"):
        super().__init__()
        self.method = method

[docs]    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through reshaping layer."""
        if self.method == "last":
            return x[:, -1, :]
        if self.method == "mean":
            return x.mean(1)
        raise ValueError("Method must be one of {`mean`, `last`}")