Commit dcf061aa authored by Quentin Aristote's avatar Quentin Aristote

changed dim_embedding

parent f79e4ead
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "_MmWFnLDCZKx",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Environment"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"autoscroll": false,
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
"name": "ae.ipynb",
"provenance": [],
"collapsed_sections": [
"_MmWFnLDCZKx",
"i06F5roda4Y-",
"7Tlg9Jpynpta"
],
"toc_visible": true,
"machine_shape": "hm"
},
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"executionInfo": {
"elapsed": 7124,
"status": "ok",
"timestamp": 1578468998874,
"user": {
"displayName": "Quentin Aristote",
"photoUrl": "",
"userId": "04831542028559105983"
},
"user_tz": -60
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"id": "SOo6RfcuEkra",
"outputId": "b03aab3d-9ddd-47eb-aa47-7e7257f38a7e",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: torchaudio in /usr/local/lib/python3.6/dist-packages (0.3.1)\n",
"Requirement already satisfied: torch==1.3.0 in /usr/local/lib/python3.6/dist-packages (from torchaudio) (1.3.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==1.3.0->torchaudio) (1.17.4)\n"
]
}
],
"source": [
"# Import libraries\n",
"\n",
"import numpy as np\n",
"import os.path\n",
"import torch\n",
"import torch.nn as nn\n",
"import torchaudio"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "J_ifRZzbU2MD",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"from importlib.machinery import SourceFileLoader \n",
"get_sounds = SourceFileLoader('get_sounds', 'data/get_sounds.py').load_module()\n",
"get_spectrograms = SourceFileLoader('get_spectrograms', 'data/get_spectrograms.py').load_module()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "f8CaPTljfgIK",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Configure the device\n",
"\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "i06F5roda4Y-",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "0PGoB130B7tg",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"## Preparing the dataset\n",
"\n",
"The source code for `get_sounds.py` and `get_spectrograms.py` can be found [here](https://git.eleves.ens.fr/qaristote/soundscapes/)."
]
},
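{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough, hedged sketch (the actual parameters live in `get_spectrograms.py`), the Mel-spectrograms are presumably computed with `torchaudio` along these lines :"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch of the Mel-spectrogram computation ; the real code lives in\n",
"# data/get_spectrograms.py. 'example.wav' is a hypothetical placeholder and\n",
"# the values 16000 / 64 mirror the DatasetConfig defined below.\n",
"\n",
"if False :\n",
"    waveform, sample_rate = torchaudio.load('data/sounds/example.wav')\n",
"    to_mel = torchaudio.transforms.MelSpectrogram(sample_rate = 16000,\n",
"                                                  n_mels = 64)\n",
"    spectrogram = to_mel(waveform)\n",
"    torch.save(spectrogram, os.path.join('data/spectrograms_5s', 'example.pt'))"
]
},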
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "VPrviVLiOmlF",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"class DatasetConfig :\n",
" duration = 5 # seconds\n",
" sample_rate = 16000 # Hz\n",
" nb_mels = 64\n",
" path_sounds = 'data/sounds'\n",
" path_spectrograms = 'data/spectrograms_5s'\n",
" \n",
"data_opt = DatasetConfig()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "bDiE9HCmmhpy",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Download the sounds\n",
"# You only need to do this once\n",
"\n",
"if False :\n",
" try :\n",
" os.mkdir(data_opt.path_sounds)\n",
" except :\n",
" pass\n",
" get_sounds.getSounds(directory = data_opt.path_sounds, \n",
" soundscape = True, validated = True,\n",
" to_ignore = {'2019-419'})"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "XNw92C88-Tj7",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Compute the Mel-spectrograms\n",
"# You only need to do this once\n",
"\n",
"if False :\n",
" try :\n",
" os.mkdir(data_opt.path_spectrograms)\n",
" except : \n",
" pass\n",
" get_spectrograms.getSpectrograms(data_opt.path_sounds, data_opt.path_spectrograms, \n",
" duration = data_opt.duration, \n",
" sample_rate = data_opt.sample_rate,\n",
" nb_mels = data_opt.nb_mels)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "I4x8wEI8CSOI",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"## Getting the dataset\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "fRk5jQphLOX6",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Create a Dataset class for spectrograms\n",
"class SpectrogramsDataset(torch.utils.data.Dataset) :\n",
" def __init__(self, path_spectrograms, sample_rate, duration, \n",
" nb_mels = data_opt.nb_mels,\n",
" download = False,\n",
" compute = False,\n",
" to_ignore = [],\n",
" path_sounds = None) :\n",
" if download :\n",
" try :\n",
" os.mkdir(path_sounds)\n",
" except :\n",
" pass\n",
" get_sounds.getSounds(directory = path_sounds, \n",
" soundscape = True, \n",
" validated = True, \n",
" to_ignore = to_ignore,\n",
" overwrite = True)\n",
" if compute :\n",
" try :\n",
" os.mkdir(path_spectrograms)\n",
" except : \n",
" pass\n",
" get_spectrograms.getSpectrograms(path_sounds, path_spectrograms, \n",
" duration = duration, \n",
" sample_rate = sample_rate,\n",
" nb_mels = nb_mels,\n",
" to_ignore = to_ignore,\n",
" overwrite = True)\n",
" \n",
" self.time = duration * sample_rate\n",
" self.spectrogram_paths = []\n",
"\n",
" for i, filename in enumerate(os.listdir(path_spectrograms)) :\n",
" title, _ = os.path.splitext(filename)\n",
" filename = title + '.pt'\n",
" path = os.path.join(path_spectrograms, filename)\n",
"\n",
" self.spectrogram_paths.append(path)\n",
" \n",
" \n",
" def __len__(self) :\n",
" return len(self.spectrogram_paths)\n",
"\n",
"\n",
" def __getitem__(self, indices) :\n",
" try :\n",
" items = torch.zeros(len(indices) + [1, 128, self.time])\n",
" for index in indices :\n",
" path_target = self.spectrogram_paths[index]\n",
" spectrogram = torch.load(path_target)\n",
" items[index] = spectrogram\n",
" except TypeError :\n",
" path_target = self.spectrogram_paths[indices]\n",
" spectrogram = torch.load(path_target)\n",
" items = spectrogram\n",
"\n",
" return items"
]
},
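{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check of the two indexing modes of `__getitem__` (single index vs. list of indices) ; this is a hedged usage sketch, only meaningful once the spectrograms exist on disk :"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch : exercise both branches of SpectrogramsDataset.__getitem__.\n",
"\n",
"if False :\n",
"    dataset = SpectrogramsDataset(data_opt.path_spectrograms,\n",
"                                  data_opt.sample_rate,\n",
"                                  data_opt.duration)\n",
"    single = dataset[0]      # one spectrogram tensor\n",
"    batch = dataset[[0, 1]]  # stacked tensor of shape (2, 1, 128, dataset.time)\n",
"    print(len(dataset), single.shape, batch.shape)"
]
},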
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "eBC3cyQ1aMUi",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Models\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "f2f8DTXMZjCw",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"class NetworkConfig :\n",
" nb_mels = data_opt.nb_mels\n",
" nb_frames = 401\n",
" nb_channels = 1 # 1 for mono, 2 for stereo\n",
" dim_embedding = 1024\n",
" batch_size = 32\n",
" \n",
"net_opt = NetworkConfig()"
]
"accelerator": "GPU"
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "EJYcKPXSFgZD",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Load the data\n",
"\n",
"dataset = SpectrogramsDataset(data_opt.path_spectrograms,\n",
" data_opt.sample_rate,\n",
" data_opt.duration)\n",
"dataloader = torch.utils.data.DataLoader(dataset = dataset,\n",
" batch_size = net_opt.batch_size,\n",
" shuffle = True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "3gh6LXUJbX9X",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"## Convolutional Variational Auto Encoder"
]
},
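{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick recap (standard VAE theory, not specific to this notebook) : the encoder outputs a mean $\\mu$ and a log-variance $\\log \\sigma^2$, a latent code is drawn with the reparameterization trick\n",
"\n",
"$$z = \\mu + \\sigma \\odot \\epsilon, \\qquad \\epsilon \\sim \\mathcal{N}(0, I),$$\n",
"\n",
"and training minimizes a reconstruction term plus the KL divergence to the standard normal prior,\n",
"\n",
"$$D_{\\mathrm{KL}} = -\\frac{1}{2} \\sum_i \\left(1 + \\log \\sigma_i^2 - \\mu_i^2 - \\sigma_i^2\\right).$$"
]
},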
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "6Uy5ccG0nmK7",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### 1D"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "eAX7XmN3aq_O",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"class Convolutional1D_VAE(nn.Module) :\n",
" def __init__(self, nb_mels) :\n",
" super(Convolutional1D_VAE, self).__init__()\n",
"\n",
" self.nb_channels = nb_mels\n",
" self.dim_embedding = self.nb_channels * 16\n",
"\n",
" self.encoder_conv = nn.Sequential(\n",
" nn.Sigmoid(),\n",
"\n",
" # 64 x 401 -> 128 x 99\n",
" nn.Conv1d(self.nb_channels, self.nb_channels * 2,\n",
" 9, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 2),\n",
" nn.LeakyReLU(),\n",
"\n",
" # 128 x 99 -> 256 x 24\n",
" nn.Conv1d(self.nb_channels * 2, self.nb_channels * 4,\n",
" 9, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 4),\n",
" nn.LeakyReLU(),\n",
"\n",
" # 256 x 24 -> 512 x 5\n",
" nn.Conv1d(self.nb_channels * 4, self.nb_channels * 8,\n",
" 9, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 8),\n",
" nn.LeakyReLU(),\n",
"\n",
" # 512 x 5 -> 1024 x 1\n",
" nn.Conv1d(self.nb_channels * 8, self.dim_embedding,\n",
" 7, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.dim_embedding),\n",
" nn.LeakyReLU()\n",
" )\n",
"\n",
" self.encoder_linear = nn.Sequential(\n",
" nn.Linear(self.dim_embedding,\n",
" self.dim_embedding * 2)\n",
" )\n",
"\n",
" self.decoder_linear = nn.Sequential(\n",
" nn.Linear(self.dim_embedding,\n",
" self.dim_embedding)\n",
" )\n",
" \n",
" self.decoder_conv = nn.Sequential(\n",
" # nn.BatchNorm1d(self.dim_embedding),\n",
" nn.LeakyReLU(),\n",
"\n",
" nn.ConvTranspose1d(self.dim_embedding, self.nb_channels * 8,\n",
" 7, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 8), \n",
" nn.LeakyReLU(),\n",
" \n",
" nn.ConvTranspose1d(self.nb_channels * 8, self.nb_channels * 4,\n",
" 9, stride = 4, padding = 1, output_padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 4),\n",
" nn.LeakyReLU(),\n",
"\n",
" nn.ConvTranspose1d(self.nb_channels * 4, self.nb_channels * 2,\n",
" 9, stride = 4, padding = 1,\n",
" bias = True),\n",
" # nn.BatchNorm1d(self.nb_channels * 2),\n",
" nn.LeakyReLU(),\n",
"\n",
" nn.ConvTranspose1d(self.nb_channels * 2, self.nb_channels,\n",
" 9, stride = 4, padding = 1, output_padding = 2,\n",
" bias = True)\n",
" )\n",
" \n",
" def encode(self, x) :\n",
" x = self.encoder_conv(x)\n",
" x = x.squeeze(-1)\n",
" x = self.encoder_linear(x)\n",
" mu = x[:, :self.dim_embedding]\n",
" log_var = x[:, self.dim_embedding:]\n",
" return mu, log_var\n",
"\n",
" def reparameterize(self, mu, log_var) :\n",
" std = torch.exp(log_var / 2)\n",
" epsilon = torch.randn_like(std)\n",
" return mu + std * epsilon\n",
"\n",
" def decode(self, x) :\n",
" x = self.decoder_linear(x)\n",
" x = x.unsqueeze(-1)\n",
" x = self.decoder_conv(x)\n",
" return x\n",
"\n",
" def forward(self, x) :\n",
" mu, log_var = self.encode(x)\n",
" embedding = self.reparameterize(mu, log_var)\n",
" reconstruction = self.decode(embedding)\n",
" return reconstruction, mu, log_var\n",
"\n",
" def sample(self, x) : \n",
" embedding = torch.randn(1, 32, 37)"
]
},
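{
"cell_type": "markdown",
"metadata": {},
"source": [
"The training loss is not defined in this section ; below is a minimal sketch matching the `forward` signature above. `vae_loss` is a hypothetical helper, and the MSE reconstruction term is an assumption."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch : standard VAE loss for the (reconstruction, mu, log_var)\n",
"# triple returned by Convolutional1D_VAE.forward. MSE is an assumption ;\n",
"# BCE on normalized spectrograms would work as well.\n",
"\n",
"def vae_loss(reconstruction, x, mu, log_var) :\n",
"    reconstruction_loss = nn.functional.mse_loss(reconstruction, x,\n",
"                                                 reduction = 'sum')\n",
"    # KL(N(mu, sigma**2) || N(0, 1)), summed over batch and dimensions\n",
"    kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())\n",
"    return reconstruction_loss + kl_divergence"
]
},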
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"ein.tags": "worksheet-0",
"id": "7Tlg9Jpynpta",
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### 2D"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": false,
"colab": null,
"colab_type": "code",
"collapsed": false,
"ein.hycell": false,
"ein.tags": "worksheet-0",
"id": "xPNjQFP3bMNx",
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"class Convolutional2D_VAE(nn.Module) :\n",
" def __init__(self, nb_channels, dim_embedding) :\n",
" super(Convolutional2D_VAE, self).__init__()\n",
"\n",
" self.nb_channels = nb_channels\n",
" self.dim_embedding = dim_embedding\n",
"\n",
" self.encoder_conv = nn.Sequential(\n",
" # 128 x 2401 -> 63 x 1200\n",
" nn.Conv2d(self.nb_channels, self.nb_channels * 2, \n",
" 5, stride = 2, padding = 1, \n",
" bias = False), \n",
" nn.LeakyReLU(0.2, inplace = True), \n",
" \n",
" # 63 x 1200 -> 31 x 599\n",
" nn.Conv2d(self.nb_channels * 2, self.nb_channels * 4,\n",
" 5, stride = 2, padding = 1, \n",
" bias = False),\n",
" nn.BatchNorm2d(self.nb_channels * 4),\n",
" nn.LeakyReLU(0.2, inplace = True),\n",
" \n",
" # 31 x 599 -> 15 x 299\n",
" nn.Conv2d(self.nb_channels * 4, self.nb_channels * 8,\n",
" 5, stride = 2, padding = 1, \n",
" bias = False))\n",
" \n",
" \"\"\" \n",
" nn.BatchNorm2d(self.nb_channels * 8),\n",
" nn.LeakyReLU(0.2, inplace = True),\n",
" \n",
" # 15 x 299 -> 8 x 150\n",
" nn.Conv2d(self.nb_channels * 8, self.nb_channels * 16,\n",
" 3, stride = 2, padding = 1, \n",
" bias = False), \n",
" nn.BatchNorm2d(self.nb_channels * 16),\n",
" nn.LeakyReLU(0.2, inplace = True),\n",
"\n",
" # 8 x 150 -> 4 x 75\n",
" nn.Conv2d(self.nb_channels * 16, self.nb_channels * 32,\n",
" 3, stride = 2, padding = 1, \n",
" bias = False), \n",
" nn.BatchNorm2d(self.nb_channels * 32),\n",
" nn.LeakyReLU(0.2, inplace = True),\n",
"\n",
" # 4 x 75 -> 1 x 37\n",
" # we need both a mean and a standard deviation as output\n",
" nn.Conv2d(self.nb_channels * 32, self.nb_channels * 64,\n",
" 3, stride = 2, padding = 0, \n",
" bias = False)\n",
" )\n",
" \"\"\"\n",
"\n",
" self.encoder_linear = nn.Sequential(\n",
" nn.Linear(self.nb_channels * 8 * 15 * 299,\n",
" self.dim_embedding * 2)\n",
" )\n",
"\n",
" self.decoder_linear = nn.Sequential(\n",
" nn.Linear(self.dim_embedding,\n",
" self.nb_channels * 8 * 15 * 299)\n",
" )\n",
" \n",
" \"\"\"\n",
" nn.ConvTranspose2d(self.nb_channels * 64, self.nb_channels * 32, \n",
" 3, stride = 2, padding = 0, \n",
" output_padding = (1,0),\n",
" bias = False),\n",
" nn.BatchNorm2d(self.nb_channels * 32),\n",
" nn.ReLU(True),\n",
"\n",
" nn.ConvTranspose2d(self.nb_channels * 32, self.nb_channels * 16, \n",
" 3, stride = 2, padding = 1, \n",
" output_padding = 1,\n",
" bias = False),\n",
" nn.BatchNorm2d(self.nb_channels * 16),\n",
" nn.ReLU(True),\n",
"\n",
" nn.ConvTranspose2d(self.nb_channels * 16, self.nb_channels * 8, \n",
" 3, stride = 2, padding = 1, \n",
" output_padding = 0,\n",
" bias = False),\n",
" nn.BatchNorm2d(self.nb_channels * 8),\n",
" nn.ReLU(True),\n",
" \"\"\"\n",
"\n",
" self.decoder_conv = nn.Sequential(\n",
" nn.ConvTranspose2d(self.nb_channels * 8, self.nb_channels * 4, \n",