GANで大きな顔を描く

import os
import glob
import cv2
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.spectral_norm import spectral_norm
from torch.utils import tensorboard
from torch.utils.data import Dataset, DataLoader
import torchvision
from tqdm import tqdm


class CelebDataset(Dataset):
    """CelebA face images resized to (img_width, img_width).

    Items are float32 CHW RGB arrays scaled to [-1, 1], matching the tanh
    output range of the Generator below (training compares real and fake
    images directly, so both must live in the same value range, and the
    sample saver de-normalizes with `* 0.5 + 0.5`).
    """

    def __init__(self, data_dir, img_width):
        self.img_width = img_width
        self.filelist = glob.glob(os.path.join(data_dir, '*'))

    def __len__(self):
        return len(self.filelist)

    def __getitem__(self, idx):
        img = cv2.imread(self.filelist[idx])
        img = cv2.resize(img, dsize=(self.img_width, self.img_width))
        # BGR -> RGB via cvtColor, which returns a contiguous array.
        # The original `img[:, :, ::-1]` produced a negative-stride view
        # that torch's default collate function cannot turn into a tensor.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Scale [0, 255] -> [-1, 1] to match the generator's tanh output.
        img = img.astype('float32') / 127.5 - 1.0
        img = img.transpose(2, 0, 1)
        return img


class ResBlock(nn.Module):
    """Residual block with spectral normalization.

    Main path: two spectrally-normalized 3x3 convolutions (the first may
    stride).  Shortcut: a spectrally-normalized 1x1 convolution matching
    the channel count and stride, added to the main path's output.
    """

    def __init__(self, in_ch, out_ch, stride):
        super().__init__()
        # Construction / init order is kept identical to preserve the
        # sequence of RNG draws under a fixed seed.
        c1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False)
        c2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False)
        nn.init.xavier_uniform_(c1.weight.data, 1.)
        nn.init.xavier_uniform_(c2.weight.data, 1.)
        self.conv1 = spectral_norm(c1)
        self.conv2 = spectral_norm(c2)

        shortcut = nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, padding=0, bias=False)
        nn.init.xavier_uniform_(shortcut.weight.data, 1.)
        self.downsample = spectral_norm(shortcut)

    def forward(self, x):
        shortcut = self.downsample(x)
        h = F.leaky_relu(self.conv1(x), negative_slope=0.2)
        h = self.conv2(h)
        return h + shortcut


class Discriminator(nn.Module):
    """SNGAN-style discriminator for 128x128 RGB images.

    Five stride-2 residual blocks shrink 128x128 down to 4x4, then a
    final 4x4 convolution maps the features to one real/fake logit.
    """

    def __init__(self):
        super().__init__()
        self.last_ch = 512
        widths = [self.last_ch // d for d in (16, 8, 4, 2, 1)]
        self.block1 = ResBlock(3, widths[0], 2)
        self.block2 = ResBlock(widths[0], widths[1], 2)
        self.block3 = ResBlock(widths[1], widths[2], 2)
        self.block4 = ResBlock(widths[2], widths[3], 2)
        self.block5 = ResBlock(widths[3], widths[4], 2)
        self.last_conv = nn.Conv2d(self.last_ch, 1, 4, 1, 0, bias=False)

    def forward(self, x):
        h = x
        for block in (self.block1, self.block2, self.block3,
                      self.block4, self.block5):
            h = F.leaky_relu(block(h), negative_slope=0.2)
        return self.last_conv(h).view(-1, 1)


class Generator(nn.Module):
    """DCGAN-style generator: latent vector -> 128x128 RGB image in [-1, 1]."""

    def __init__(self, latent_size):
        super().__init__()
        self.first_ch = 512
        ch = self.first_ch
        # 1x1 -> 4x4, then five stride-2 upsamplings: 4 -> 8 -> 16 -> 32 -> 64 -> 128.
        self.deconv0 = nn.ConvTranspose2d(latent_size, ch, 4, 1, 0, bias=False)
        self.bn0 = nn.BatchNorm2d(ch)
        self.deconv1 = nn.ConvTranspose2d(ch, ch // 2, 4, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(ch // 2)
        self.deconv2 = nn.ConvTranspose2d(ch // 2, ch // 4, 4, 2, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(ch // 4)
        self.deconv3 = nn.ConvTranspose2d(ch // 4, ch // 8, 4, 2, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(ch // 8)
        self.deconv4 = nn.ConvTranspose2d(ch // 8, ch // 16, 4, 2, 1, bias=False)
        self.bn4 = nn.BatchNorm2d(ch // 16)
        self.deconv5 = nn.ConvTranspose2d(ch // 16, 3, 4, 2, 1, bias=False)

        # Xavier-init all transposed convolutions, in declaration order
        # (matches the original sequence of RNG draws).
        for deconv in (self.deconv0, self.deconv1, self.deconv2,
                       self.deconv3, self.deconv4, self.deconv5):
            nn.init.xavier_uniform_(deconv.weight.data, 1.)

    def forward(self, z):
        h = z
        stages = ((self.deconv0, self.bn0), (self.deconv1, self.bn1),
                  (self.deconv2, self.bn2), (self.deconv3, self.bn3),
                  (self.deconv4, self.bn4))
        for deconv, bn in stages:
            h = F.leaky_relu(bn(deconv(h)), negative_slope=0.2)
        return torch.tanh(self.deconv5(h))


def train_epoch(data_loader, discriminator, generator, d_optimizer, g_optimizer,
                criterion, latent_size, writer, epoch):
    """Run one epoch of hinge-loss GAN training.

    Logs epoch-mean discriminator/generator losses to TensorBoard and
    saves a grid of the last batch's samples under img/.

    `criterion` is kept in the signature for interface compatibility but
    is unused: the losses below are the SNGAN hinge losses, not BCE.
    """
    discriminator.train()
    generator.train()

    d_loss_sum = 0.0
    g_loss_sum = 0.0
    cnt = 0
    fake_imgs = None

    for real_images in tqdm(data_loader):
        real_images = real_images.cuda()

        # Hinge loss for the discriminator on real images: mean(max(0, 1 - D(x))).
        d_real_output = discriminator(real_images)
        d_real_loss = F.relu(1.0 - d_real_output).mean()

        # Hinge loss on generated images: mean(max(0, 1 + D(G(z)))).
        z_for_discriminator = torch.randn(real_images.shape[0], latent_size, 1, 1).cuda()
        d_fake_output = discriminator(generator(z_for_discriminator))
        d_fake_loss = F.relu(1.0 + d_fake_output).mean()

        # Optimize the discriminator.
        d_loss = d_real_loss + d_fake_loss
        d_optimizer.zero_grad()
        d_loss.backward()
        d_optimizer.step()

        # Generator objective for hinge GAN: maximize D(G(z)).
        z_for_generator = torch.randn(real_images.shape[0], latent_size, 1, 1).cuda()
        fake_imgs = generator(z_for_generator)
        g_fake_output = discriminator(fake_imgs)
        g_loss = -g_fake_output.mean()

        # Optimize the generator.
        g_optimizer.zero_grad()
        g_loss.backward()
        g_optimizer.step()

        # Accumulate Python floats via .item(); summing the loss tensors
        # themselves would keep every batch's autograd graph alive and
        # steadily leak GPU memory over the epoch.
        d_loss_sum += d_loss.item()
        g_loss_sum += g_loss.item()
        cnt += 1

    if cnt > 0:
        writer.add_scalar("discriminator loss", d_loss_sum / cnt, epoch)
        writer.add_scalar("generator loss", g_loss_sum / cnt, epoch)
        # Map the tanh output range [-1, 1] back to [0, 1] for saving.
        torchvision.utils.save_image(fake_imgs[:64] * 0.5 + 0.5,
                                     "img/epoch_{0:03d}.png".format(epoch), nrow=8)


def main():
    """Train the 128x128 face GAN: data, models, optimizers, training loop."""
    torch.manual_seed(0)

    img_width = 128
    latent_size = 128
    batch_size = 512

    # The dataset handles its own scaling, so no torchvision transform is
    # needed here (the original built an unused Compose/Normalize).
    dataset = CelebDataset('img_align_celeba', img_width=img_width)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8)

    discriminator = Discriminator().cuda()
    generator = Generator(latent_size).cuda()

    discriminator = nn.DataParallel(discriminator)
    generator = nn.DataParallel(generator)

    # criterion is unused by the hinge-loss training but kept to match
    # train_epoch's signature.
    criterion = nn.BCEWithLogitsLoss()
    d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    g_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    writer = tensorboard.SummaryWriter(log_dir="logs")
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original exists()/mkdir pair.
    os.makedirs('img', exist_ok=True)
    os.makedirs('checkpoints', exist_ok=True)

    for epoch in range(300):
        print(epoch)
        train_epoch(data_loader, discriminator, generator, d_optimizer, g_optimizer,
                    criterion, latent_size, writer, epoch)

        if epoch % 10 == 0:
            torch.save(discriminator.state_dict(), 'checkpoints/discriminator{0:03d}.pth'.format(epoch))
            torch.save(generator.state_dict(), 'checkpoints/generator{0:03d}.pth'.format(epoch))


# Run training when executed as a script.
if __name__ == '__main__':
    main()

f:id:LeMU_Research:20201118225211p:plain

f:id:LeMU_Research:20201118225518p:plainf:id:LeMU_Research:20201118225529p:plain

 

GANで顔を描く

import os
import glob
import cv2
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils import tensorboard
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


class CelebDataset(Dataset):
    """CelebA images as float32 CHW arrays scaled to [0, 1].

    NOTE(review): cv2.imread returns BGR channel order and no conversion
    is done here; the sample images are written back with cv2.imwrite
    (also BGR), so ordering stays consistent end to end.
    """

    def __init__(self, data_dir, img_width):
        self.img_width = img_width
        self.filelist = glob.glob(os.path.join(data_dir, '*'))

    def __len__(self):
        return len(self.filelist)

    def __getitem__(self, idx):
        raw = cv2.imread(self.filelist[idx])
        resized = cv2.resize(raw, dsize=(self.img_width, self.img_width))
        scaled = resized.astype('float32') / 255.0
        return scaled.transpose(2, 0, 1)


class Discriminator(nn.Module):
    """DCGAN discriminator: 64x64 RGB image -> one real/fake logit.

    Four stride-2 conv+BN stages shrink 64x64 to 4x4; a final 4x4
    convolution collapses the features to a single logit.
    """

    def __init__(self):
        super().__init__()
        self.last_ch = 128
        base = self.last_ch
        self.conv1 = nn.Conv2d(3, base // 8, 4, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(base // 8)
        self.conv2 = nn.Conv2d(base // 8, base // 4, 4, 2, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(base // 4)
        self.conv3 = nn.Conv2d(base // 4, base // 2, 4, 2, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(base // 2)
        self.conv4 = nn.Conv2d(base // 2, base, 4, 2, 1, bias=False)
        self.bn4 = nn.BatchNorm2d(base)
        self.conv5 = nn.Conv2d(base, 1, 4, 1, 0, bias=False)

    def forward(self, x):
        h = x
        # Default leaky_relu slope (0.01), as in the original.
        for conv, bn in ((self.conv1, self.bn1), (self.conv2, self.bn2),
                         (self.conv3, self.bn3), (self.conv4, self.bn4)):
            h = F.leaky_relu(bn(conv(h)))
        return self.conv5(h).view(-1, 1)


class Generator(nn.Module):
    """DCGAN generator: latent vector -> 64x64 RGB image in [0, 1] (sigmoid)."""

    def __init__(self, latent_size):
        super().__init__()
        self.first_ch = 512
        ch = self.first_ch
        # 1x1 -> 4x4, then four stride-2 upsamplings: 4 -> 8 -> 16 -> 32 -> 64.
        self.deconv0 = nn.ConvTranspose2d(latent_size, ch, 4, 1, 0, bias=False)
        self.bn0 = nn.BatchNorm2d(ch)
        self.deconv1 = nn.ConvTranspose2d(ch, ch // 2, 4, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(ch // 2)
        self.deconv2 = nn.ConvTranspose2d(ch // 2, ch // 4, 4, 2, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(ch // 4)
        self.deconv3 = nn.ConvTranspose2d(ch // 4, ch // 8, 4, 2, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(ch // 8)
        self.deconv4 = nn.ConvTranspose2d(ch // 8, 3, 4, 2, 1, bias=False)

    def forward(self, z):
        h = z
        # Default leaky_relu slope (0.01), as in the original.
        for deconv, bn in ((self.deconv0, self.bn0), (self.deconv1, self.bn1),
                           (self.deconv2, self.bn2), (self.deconv3, self.bn3)):
            h = F.leaky_relu(bn(deconv(h)))
        return torch.sigmoid(self.deconv4(h))


def train_epoch(data_loader, discriminator, generator, d_optimizer, g_optimizer,
                criterion, latent_size, writer, epoch):
    """Run one epoch of standard (BCE) GAN training and log mean losses."""
    discriminator.train()
    generator.train()

    d_loss_sum = 0.0
    g_loss_sum = 0.0
    cnt = 0

    for real_images in tqdm(data_loader):
        real_images = real_images.cuda()

        real_labels = torch.ones(real_images.shape[0], 1).cuda()
        fake_labels = torch.zeros(real_images.shape[0], 1).cuda()

        # Discriminator loss on real data.
        d_real_output = discriminator(real_images)
        d_real_loss = criterion(d_real_output, real_labels)

        # Discriminator loss on generated data.
        z_for_discriminator = torch.randn(real_images.shape[0], latent_size, 1, 1).cuda()
        d_fake_output = discriminator(generator(z_for_discriminator))
        d_fake_loss = criterion(d_fake_output, fake_labels)

        # Optimize the discriminator.
        d_loss = d_real_loss + d_fake_loss
        d_optimizer.zero_grad()
        d_loss.backward()
        d_optimizer.step()

        # Generator loss: make the discriminator label fakes as real.
        z_for_generator = torch.randn(real_images.shape[0], latent_size, 1, 1).cuda()
        g_fake_output = discriminator(generator(z_for_generator))
        g_loss = criterion(g_fake_output, real_labels)

        # Optimize the generator.
        g_optimizer.zero_grad()
        g_loss.backward()
        g_optimizer.step()

        # Accumulate Python floats via .item(); summing the loss tensors
        # would retain every batch's autograd graph and leak GPU memory.
        d_loss_sum += d_loss.item()
        g_loss_sum += g_loss.item()
        cnt += 1

    writer.add_scalar("discriminator loss", d_loss_sum / cnt, epoch)
    writer.add_scalar("generator loss", g_loss_sum / cnt, epoch)


def main():
    """Train the 64x64 face DCGAN and dump one sample image per epoch."""
    torch.manual_seed(0)

    img_width = 64
    latent_size = 100
    batch_size = 512

    dataset = CelebDataset('img_align_celeba', img_width=img_width)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8)

    discriminator = Discriminator().cuda()
    generator = Generator(latent_size).cuda()

    discriminator = nn.DataParallel(discriminator)
    generator = nn.DataParallel(generator)

    criterion = nn.BCEWithLogitsLoss()
    d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    g_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    writer = tensorboard.SummaryWriter(log_dir="logs")
    # These output directories were written to but never created:
    # cv2.imwrite fails silently and torch.save raises on a missing dir.
    os.makedirs('img', exist_ok=True)
    os.makedirs('checkpoints', exist_ok=True)

    for epoch in range(300):
        print(epoch)
        train_epoch(data_loader, discriminator, generator, d_optimizer, g_optimizer,
                    criterion, latent_size, writer, epoch)

        # Save one generated sample for visual inspection.
        generator.eval()
        z = torch.randn(1, latent_size, 1, 1).cuda()
        fake_image = generator(z)
        img = (fake_image[0].cpu().detach().numpy().transpose(1, 2, 0) * 255).astype('uint8')
        cv2.imwrite('img/{0:04d}.png'.format(epoch), img)

        if epoch % 10 == 0:
            torch.save(discriminator.state_dict(), 'checkpoints/discriminator{0:03d}.pth'.format(epoch))
            torch.save(generator.state_dict(), 'checkpoints/generator{0:03d}.pth'.format(epoch))


# Run training when executed as a script.
if __name__ == '__main__':
    main()

f:id:LeMU_Research:20201115024535p:plain f:id:LeMU_Research:20201115024545p:plain

f:id:LeMU_Research:20201115024715p:plain

 

GANで0を描く

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils import tensorboard
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import cv2


class Generator(nn.Module):
    """MLP generator: latent vector -> flattened image in [0, 1] (sigmoid)."""

    def __init__(self, latent_size, image_size):
        super().__init__()
        hidden_size = 256
        self.fc1 = nn.Linear(latent_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, image_size)

    def forward(self, x):
        # Pass training=self.training: the functional F.dropout defaults to
        # training=True, so the original kept dropping activations even
        # after model.eval().
        x = F.dropout(F.leaky_relu(self.fc1(x)), training=self.training)
        x = F.dropout(F.leaky_relu(self.fc2(x)), training=self.training)
        return torch.sigmoid(self.fc3(x))


class Discriminator(nn.Module):
    """MLP discriminator: flattened image -> one real/fake logit."""

    def __init__(self, image_size):
        super().__init__()
        hidden_size = 48
        self.fc1 = nn.Linear(image_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Pass training=self.training: bare F.dropout always applies
        # dropout, even after model.eval().
        x = F.dropout(F.leaky_relu(self.fc1(x)), training=self.training)
        x = F.dropout(F.leaky_relu(self.fc2(x)), training=self.training)
        return self.fc3(x)


def train_epoch(data_loader, discriminator, generator, d_optimizer,
                g_optimizer, criterion, latent_size, writer, epoch):
    """One epoch of BCE GAN training on MNIST digit-0 images only."""
    discriminator.train()
    generator.train()

    d_loss_sum = 0.0
    g_loss_sum = 0.0
    cnt = 0

    for data, target in data_loader:

        # Keep only the real images of the digit "0".
        real_images = data[target == 0]
        batch_size = real_images.shape[0]
        if batch_size == 0:
            # A batch with no zeros would make the mean BCE loss NaN.
            continue
        real_images = real_images.reshape(batch_size, -1).cuda()

        # Labels: 1 = real, 0 = fake.
        real_labels = torch.ones(batch_size, 1).cuda()
        fake_labels = torch.zeros(batch_size, 1).cuda()

        # Discriminator loss on real data.
        d_real_output = discriminator(real_images)
        d_real_loss = criterion(d_real_output, real_labels)

        # Discriminator loss on generated data.
        z_for_discriminator = torch.randn(batch_size, latent_size).cuda()
        d_fake_output = discriminator(generator(z_for_discriminator))
        d_fake_loss = criterion(d_fake_output, fake_labels)

        # Optimize the discriminator.
        d_loss = d_real_loss + d_fake_loss
        d_optimizer.zero_grad()
        d_loss.backward()
        d_optimizer.step()

        # Generator loss: make the discriminator label fakes as real.
        z_for_generator = torch.randn(batch_size, latent_size).cuda()
        g_fake_output = discriminator(generator(z_for_generator))
        g_loss = criterion(g_fake_output, real_labels)

        # Optimize the generator.
        g_optimizer.zero_grad()
        g_loss.backward()
        g_optimizer.step()

        # Accumulate Python floats via .item(); summing loss tensors would
        # retain every batch's autograd graph.
        d_loss_sum += d_loss.item()
        g_loss_sum += g_loss.item()
        cnt += 1

    writer.add_scalar("discriminator loss", d_loss_sum / cnt, epoch)
    writer.add_scalar("generator loss", g_loss_sum / cnt, epoch)


def main():
    """Train an MLP GAN to draw MNIST zeros; save a sample every 5 epochs."""
    import os  # local import: this script does not import os at the top

    torch.manual_seed(0)

    width = 28
    height = 28
    channel_num = 1
    image_size = width * height * channel_num
    latent_size = 64

    transform = transforms.Compose([transforms.ToTensor()])
    mnist = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
    data_loader = DataLoader(dataset=mnist, batch_size=1000, shuffle=True)

    discriminator = Discriminator(image_size).cuda()
    generator = Generator(latent_size, image_size).cuda()

    criterion = nn.BCEWithLogitsLoss()
    d_optimizer = torch.optim.Adam(discriminator.parameters())
    g_optimizer = torch.optim.Adam(generator.parameters())

    writer = tensorboard.SummaryWriter(log_dir="logs")
    # 'img/' was written to but never created; cv2.imwrite fails silently
    # when the target directory is missing.
    os.makedirs('img', exist_ok=True)

    for epoch in range(100):
        print(epoch)
        train_epoch(data_loader, discriminator, generator, d_optimizer,
                    g_optimizer, criterion, latent_size, writer, epoch)
        if epoch % 5 == 0:
            generator.eval()
            z = torch.randn(1, latent_size).cuda()
            fake_image = generator(z)
            img = fake_image[0].reshape(height, width, channel_num)
            img = (img.cpu().detach().numpy() * 255).astype('uint8')
            cv2.imwrite('img/{0:04d}.png'.format(epoch), img)


# Run training when executed as a script.
if __name__ == '__main__':
    main()

f:id:LeMU_Research:20201114021805p:plain      f:id:LeMU_Research:20201114021839p:plain

f:id:LeMU_Research:20201114021956p:plain

 

scipy.signal.firwinのsimple example

import numpy as np
from scipy.io import wavfile
from scipy.signal import lfilter, firwin
import matplotlib.pyplot as plt

# Low-pass filter a wav file with an FIR filter designed by firwin,
# then plot the amplitude spectrum of the result.
sample_rate, x = wavfile.read('combined_sine.wav')
sample_interval = 1.0 / sample_rate
N = x.shape[0]

nyquist = sample_rate / 2
cutFreq = 1500 / nyquist  # cutoff normalized by the Nyquist frequency
b = firwin(100, cutFreq)

filtered = lfilter(b, 1, x)
F = np.fft.fft(filtered)
Amp = np.abs(F)

# endpoint=False gives the correct FFT bin spacing of sample_rate / N;
# the original endpoint-inclusive linspace spaced bins sample_rate / (N - 1)
# apart, mislabeling every frequency.
freq = np.linspace(0, sample_rate, N, endpoint=False)
plt.plot(freq[:int(N / 2)], Amp[:int(N / 2)])
plt.show()

scipy.signal.lfilterのsimple example

import numpy as np
from scipy.io import wavfile
from scipy.signal import lfilter
import matplotlib.pyplot as plt

# Apply a 2-tap moving-average filter to a wav file and plot the
# amplitude spectrum of the result.
sample_rate, x = wavfile.read('combined_sine.wav')
sample_interval = 1.0 / sample_rate
N = x.shape[0]

filtered = lfilter([0.5, 0.5], 1, x)
F = np.fft.fft(filtered)
Amp = np.abs(F)

# endpoint=False: FFT bins are spaced sample_rate / N apart, so the
# endpoint-inclusive linspace mislabeled every frequency bin.
freq = np.linspace(0, sample_rate, N, endpoint=False)
plt.plot(freq[:int(N / 2)], Amp[:int(N / 2)])
plt.show()

wave, scipy.io.wavfile, librosaでwaveファイル入力

waveはpylabを介すので分かりにくい。

librosaはサンプリングレートを与えないといけない。

scipy.io.wavfileが一番使い勝手が良さそう。

import numpy as np
import matplotlib.pyplot as plt

# Three ways to read a wav file; the two commented-out alternatives
# (wave + pylab, scipy.io.wavfile) are kept for comparison.

# import wave
# from pylab import frombuffer
# wf = wave.open('combined_sine.wav', 'r')
# sample_rate = wf.getframerate()  # sampling rate [Hz]
# sample_interval = 1.0 / sample_rate  # sampling interval [s]
# N = wf.getnframes()  # number of frames
# x = frombuffer(wf.readframes(N), dtype='int16') / 32768.0

# from scipy.io import wavfile
# from copy import deepcopy
# sample_rate, x = wavfile.read('combined_sine.wav')
# sample_interval = 1.0 / sample_rate
# N = x.shape[0]
# x = deepcopy(x)
# x = x / 32768

import librosa
sample_rate = 8000  # librosa must be given the sampling rate up front
sample_interval = 1.0 / sample_rate
x = librosa.load('combined_sine.wav', sr=sample_rate)[0]
N = x.shape[0]

F = np.fft.fft(x)
Amp = np.abs(F)

t = np.arange(0, sample_interval * N, sample_interval)
freq = np.linspace(0, sample_rate, N)

# Plot the amplitude spectrum (uncomment the line below for the waveform).
#plt.plot(t, x)
plt.plot(freq[:int(N/2)], Amp[:int(N/2)])
plt.show()

メルスペクトラムとデータ水増しでESC50の精度を上げる

以下の記事の改良。

test accuracyが8%から30%くらいまで上がりました。

ESC50音声分類をシンプルなCNNでやってみた - LeMU_Researchの日記

 

import os
import pandas as pd
import numpy as np
import random
import librosa
import torch
from torch import optim, nn
from torch.nn.functional import avg_pool2d
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.utils.tensorboard as tensorboard


def add_white_noise(x, rate=0.002):
    """Return *x* with zero-mean Gaussian noise of amplitude *rate* added."""
    noise = np.random.randn(len(x))
    return x + noise * rate


def shift_sound(x, rate=2):
    """Circularly shift *x* forward by len(x) // rate samples."""
    offset = int(len(x) // rate)
    return np.roll(x, offset)


def stretch_sound(x, rate=1.1):
    """Time-stretch *x* by *rate*, then pad/crop back to the original length.

    rate > 1 speeds the sound up (shorter), rate < 1 slows it down.
    """
    input_length = len(x)
    # Pass rate as a keyword: librosa >= 0.10 made time_stretch's
    # rate parameter keyword-only, so the positional call raises there.
    x = librosa.effects.time_stretch(x, rate=rate)
    if len(x) > input_length:
        return x[:input_length]
    return np.pad(x, (0, max(0, input_length - len(x))), "constant")


def make_cba(in_channels, out_channels, kernel_size, stride):
    """Build a Conv2d -> BatchNorm2d -> ReLU stack."""
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size, stride),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    return nn.Sequential(*layers)


class EscDataset(Dataset):
    """ESC-50 clips as log-mel spectrograms; fold 5 is held out for validation.

    Training samples are randomly augmented with white noise, circular
    shifting and time stretching before the spectrogram is computed.
    """

    def __init__(self, data_dir, train_flag):
        self.data_dir = data_dir
        self.train_flag = train_flag

        meta = pd.read_csv(os.path.join(self.data_dir, 'meta/esc50.csv'))
        if train_flag:
            meta = meta[meta['fold'] != 5]
        else:
            meta = meta[meta['fold'] == 5]
        self.df = meta.reset_index()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Load the wav clip at 44.1 kHz.
        fname = self.df['filename'][idx]
        data, sr = librosa.load(os.path.join(self.data_dir, 'audio/' + fname), sr=44100)

        # Data augmentation (training only): each transform fires with
        # probability 0.5 and a randomized strength.
        if self.train_flag:
            rand1 = random.randint(0, 49)
            rand2 = random.randint(0, 49)
            rand3 = random.randint(0, 49)
            if rand1 < 25:
                data = add_white_noise(data, 0.0002 * rand1)  # noise rate in [0, 0.005)
            if rand2 < 25:
                data = shift_sound(data, rand2 // 5 + 2)  # shift divisor in [2, 6]
            if rand3 < 25:
                data = stretch_sound(data, 1.0 + (rand3 - 12.5) / 72.5)  # stretch in [0.8, 1.2)

        # Short-time Fourier transform (power spectrogram).
        # n_fft: window size, hop_length: hop between adjacent windows.
        stft = np.abs(librosa.stft(data, n_fft=1024, hop_length=128)) ** 2

        # Log-scaled mel spectrogram with n_mels frequency bins,
        # plus a leading channel axis for the CNN.
        log_stft = librosa.power_to_db(stft)
        melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)[np.newaxis, ...]

        return melsp.astype('float32'), self.df['target'][idx]


class EscNet(nn.Module):
    """CNN classifier for ESC-50 mel spectrograms.

    Three parallel branches use separable-style kernels of different
    lengths (8, 16, 32); each branch is global-average-pooled to a
    64-dim vector, the three vectors are concatenated, and a linear
    layer maps them to 50 class logits.
    """

    def __init__(self):
        super().__init__()
        self.cba1_1 = make_cba(1, 32, (1, 8), (1, 2))
        self.cba1_2 = make_cba(32, 32, (8, 1), (2, 1))
        self.cba1_3 = make_cba(32, 64, (1, 8), (1, 2))
        self.cba1_4 = make_cba(64, 64, (8, 1), (2, 1))
        self.cba2_1 = make_cba(1, 32, (1, 16), (1, 2))
        self.cba2_2 = make_cba(32, 32, (16, 1), (2, 1))
        self.cba2_3 = make_cba(32, 64, (1, 16), (1, 2))
        self.cba2_4 = make_cba(64, 64, (16, 1), (2, 1))
        self.cba3_1 = make_cba(1, 32, (1, 32), (1, 2))
        self.cba3_2 = make_cba(32, 32, (32, 1), (2, 1))
        self.cba3_3 = make_cba(32, 64, (1, 32), (1, 2))
        self.cba3_4 = make_cba(64, 64, (32, 1), (2, 1))
        self.fc = nn.Linear(64 * 3, 50)

    def _branch(self, x, layers):
        # Run one branch, then collapse it to 1x1 by global average pooling.
        for layer in layers:
            x = layer(x)
        return avg_pool2d(x, kernel_size=x.size()[2:])

    def forward(self, x):
        b1 = self._branch(x, (self.cba1_1, self.cba1_2, self.cba1_3, self.cba1_4))
        b2 = self._branch(x, (self.cba2_1, self.cba2_2, self.cba2_3, self.cba2_4))
        b3 = self._branch(x, (self.cba3_1, self.cba3_2, self.cba3_3, self.cba3_4))
        merged = torch.cat([b1, b2, b3], dim=3)
        return self.fc(merged.view(merged.shape[0], -1))


def train_epoch(data_loader, model, criterion, optimizer, epoch, writer):
    """Train *model* for one epoch and log the mean loss to TensorBoard."""
    model.train()
    running_loss = 0.0
    for data, target in tqdm(data_loader):
        data, target = data.cuda(), target.cuda()

        loss = criterion(model(data), target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    loss_epoch = running_loss / len(data_loader)
    print('train_loss = ', loss_epoch)
    writer.add_scalar("train loss", loss_epoch, epoch)


def val_epoch(data_loader, model, criterion, epoch, writer):
    """Evaluate *model*, logging mean loss and accuracy to TensorBoard."""
    model.eval()
    loss_sum = 0
    correct = 0
    data_num = 0
    # no_grad: validation needs no autograd graph; without it every forward
    # pass builds and retains gradients, wasting GPU memory and time.
    with torch.no_grad():
        for data, target in tqdm(data_loader):
            data = data.cuda()
            target = target.cuda()

            output = model(data)
            loss = criterion(output, target)
            loss_sum += loss.item()

            # Predicted class = argmax over the 50 logits.
            _, preds = torch.max(output, dim=1)
            correct += (preds == target).sum().item()
            data_num += target.size(0)

    loss_epoch = loss_sum / len(data_loader)
    print('val_loss = ', loss_epoch)

    accuracy = float(correct) / data_num
    print('accuracy = ', accuracy)

    writer.add_scalar("val loss", loss_epoch, epoch)
    writer.add_scalar("val accuracy", accuracy, epoch)


def main():
    """Train and validate EscNet on ESC-50, checkpointing every epoch."""
    data_dir = 'data'
    train_dataset = EscDataset(data_dir, train_flag=True)
    val_dataset = EscDataset(data_dir, train_flag=False)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=8)

    model = EscNet().cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss()

    writer = tensorboard.SummaryWriter(log_dir="logs")
    # 'checkpoints/' was written to but never created, so torch.save
    # raised on the first epoch in a fresh working directory.
    os.makedirs('checkpoints', exist_ok=True)

    for epoch in range(100):
        train_epoch(train_loader, model, criterion, optimizer, epoch, writer)
        val_epoch(val_loader, model, criterion, epoch, writer)

        state = {'state_dict': model.state_dict()}
        filename = 'checkpoints/{0:04d}.pth.tar'.format(epoch)
        torch.save(state, filename)


# Run training when executed as a script.
if __name__ == '__main__':
    main()

 

・参考

ディープラーニングで音声分類 - Qiita