# Tutorial 4

In this tutorial, we'll look at gradient descent, its variants and optimization algorithms.

In [1]:
import numpy as np
import math
from matplotlib import pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms

# Show which PyTorch version this notebook is running on.
print("Torch version:", torch.__version__)

Torch version: 2.8.0+cu128


Let's first load the MNIST dataset and split it into a train and test set:

In [2]:
# Number of samples per mini-batch; passed to both the train and the test DataLoader.
batch_size = 64

In [3]:
# Both splits use the same preprocessing, so build the pipeline once.
# Normalize computes (x - mean) / std; with mean 0 and std 1 it leaves the
# ToTensor output unchanged.
mnist_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0, 1)])

# MNIST ships with a predefined train/test split, selected via `train`.
data_train = datasets.MNIST(
    root = "data",
    train = True,
    download = True,
    transform = mnist_transform
)
data_test = datasets.MNIST(
    root = "data",
    train = False,
    download = True,
    transform = mnist_transform
)

# Only the training data is shuffled. The test loader keeps a fixed order:
# calculate_loss_accuracy averages per-batch losses, so with a partial last
# batch a reshuffled test set would give a slightly different test loss every
# time the same model is evaluated.
dataloader_train = DataLoader(data_train, batch_size = batch_size, shuffle = True)
dataloader_test = DataLoader(data_test, batch_size = batch_size, shuffle = False)

Let's define helper functions to train the model:

In [4]:
def calculate_accuracy(model, dataloader):
    """Return the fraction of samples that `model` classifies correctly.

    Args:
        model: Callable mapping a batch of inputs to per-class scores; the
            predicted class of each sample is the argmax along dimension 1.
        dataloader: Iterable of `(inputs, labels)` batches that also exposes
            the underlying `dataset`, whose length is the denominator.

    Returns:
        Number of correct predictions divided by `len(dataloader.dataset)`.
    """
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        hits = sum(
            (model(inputs).argmax(1) == labels).type(torch.float).sum().item()
            for inputs, labels in dataloader
        )

    return hits / len(dataloader.dataset)

In [5]:
def calculate_loss_accuracy(model, dataloader, loss_fn):
    """Evaluate `model` on `dataloader` and return `(loss, accuracy)`.

    Args:
        model: Callable mapping a batch of inputs to per-class scores.
        dataloader: Iterable of `(inputs, labels)` batches with `len()` support
            and a `dataset` attribute.
        loss_fn: Callable `loss_fn(pred, labels)` returning a scalar tensor.

    Returns:
        A tuple `(loss, accuracy)`: the loss is the sum of the per-batch loss
        values divided by the number of batches, the accuracy is the number of
        correct argmax predictions divided by `len(dataloader.dataset)`.
    """
    batch_losses = []
    hits = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, labels in dataloader:
            scores = model(inputs)
            batch_losses.append(loss_fn(scores, labels).item())
            hits += (scores.argmax(1) == labels).type(torch.float).sum().item()

    mean_loss = sum(batch_losses) / len(dataloader)
    return mean_loss, hits / len(dataloader.dataset)

In [6]:
def train_model(model, loss_fn, optimizer, epochs, dataloader_train, dataloader_test, early_stopper = None, log_period = 10000):
    """Train `model` and periodically print train/test metrics.

    Args:
        model: The `nn.Module` to train (updated in place).
        loss_fn: Callable `loss_fn(pred, y)` returning a scalar loss tensor.
        optimizer: Optimizer over the model's parameters; must provide
            `step()` and `zero_grad()`.
        epochs: Number of passes over `dataloader_train`.
        dataloader_train: DataLoader yielding `(X, y)` training batches.
        dataloader_test: DataLoader used for the test loss/accuracy in the log.
        early_stopper: Accepted for interface compatibility; this
            implementation does not use it.
        log_period: Approximate number of training samples between log lines.
            The counter is reset at the start of every epoch.

    Note:
        Every log line evaluates the model on the full training set and the
        full test set, so a small `log_period` makes training much slower.
    """
    # Take the dataset size from the loader that was actually passed in, not
    # from a notebook-level global, so the sample counter is correct for any
    # training set.
    num_train_samples = len(dataloader_train.dataset)

    for epoch in range(epochs):
        processed_since_log = 0
        for batch, (X, y) in enumerate(dataloader_train):
            # Logging below switches to eval mode, so re-enable train mode
            # before every update.
            model.train()
            pred = model(X)
            loss = loss_fn(pred, y)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            processed_since_log += dataloader_train.batch_size

            if processed_since_log >= log_period:
                # The last batch of an epoch may be partial, hence the clamp.
                current = min((batch + 1) * dataloader_train.batch_size, num_train_samples)
                train_loss = loss.item()  # loss of the most recent training batch only
                model.eval()
                train_acc = calculate_accuracy(model, dataloader_train)
                test_loss, test_acc = calculate_loss_accuracy(model, dataloader_test, loss_fn)
                print(f"train loss: {train_loss:>7f}  test loss: {test_loss:>7f}  train accuracy: {train_acc:>3f}  test accuracy: {test_acc:>3f}  [sample {current:>5d}/{num_train_samples:>5d}] [epoch {epoch+1:>2d}/{epochs:>2d}]")
                processed_since_log -= log_period

Since we will be creating multiple identical models, let's also create a function for that:

In [7]:
def create_model(input_size = None, hidden_size = 100, num_classes = 10):
    """Build a freshly initialised one-hidden-layer classifier.

    The network is Flatten -> Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features per sample after flattening. When
            omitted, it is taken from the notebook-level `data_train` as
            `data_train.data.shape[1] * data_train.data.shape[2]`
            (presumably image height * width).
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores per sample.

    Returns:
        A new `nn.Sequential` with newly initialised weights.
    """
    if input_size is None:
        input_size = data_train.data.shape[1] * data_train.data.shape[2]

    return nn.Sequential(
        nn.Flatten(),
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, num_classes),
    )

## Gradient descent

In [8]:
class Simple_SGD(torch.optim.Optimizer):
    """Plain gradient descent: `p <- p - lr * grad`.

    Args:
        params: Iterable of parameters (or parameter-group dicts) to optimise.
        lr: Learning rate.
    """

    def __init__(self, params, lr = 1e-3):
        defaults = dict(lr = lr)
        super().__init__(params, defaults)

    # The parenthesised decorator form works on every PyTorch version; the
    # bare `@torch.no_grad` form is only accepted by newer releases.
    @torch.no_grad()
    def step(self, closure = None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The value returned by `closure`, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                # Parameters that took no part in the backward pass (e.g.
                # frozen layers) have no gradient and are left untouched.
                if p.grad is None:
                    continue
                p.add_(p.grad, alpha = -group["lr"])

        return loss

In [None]:
# Train a newly created model with Simple_SGD (default learning rate) for 10 epochs.
model = create_model()
optimizer = Simple_SGD(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

train loss: 2.263719  test loss: 2.276265  train accuracy: 0.158967  test accuracy: 0.166300  [sample 10048/60000] [epoch  1/10]
train loss: 2.236584  test loss: 2.245551  train accuracy: 0.233133  test accuracy: 0.240700  [sample 20032/60000] [epoch  1/10]


## Momentum

In [None]:
class Simple_Momentum(torch.optim.Optimizer):
    """Gradient descent with classical momentum.

    Per parameter, a velocity `v` is kept in the optimizer state:

        v <- momentum_gamma * v + grad   (v is initialised to the first gradient)
        p <- p - lr * v

    Args:
        params: Iterable of parameters (or parameter-group dicts) to optimise.
        lr: Learning rate.
        momentum_gamma: Decay factor applied to the previous velocity.
    """

    def __init__(self, params, lr = 1e-3, momentum_gamma = 0.9):
        defaults = dict(lr = lr, momentum_gamma = momentum_gamma)
        super().__init__(params, defaults)

    # The parenthesised decorator form works on every PyTorch version; the
    # bare `@torch.no_grad` form is only accepted by newer releases.
    @torch.no_grad()
    def step(self, closure = None):
        """Apply one momentum update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The value returned by `closure`, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            gamma = group["momentum_gamma"]
            for p in group["params"]:
                # Parameters without a gradient (e.g. frozen layers) are skipped.
                if p.grad is None:
                    continue

                state = self.state[p]
                momentum_v = state.get("momentum_v")
                if momentum_v is None:
                    # Clone so that later in-place velocity updates never
                    # modify the parameter's gradient tensor.
                    momentum_v = torch.clone(p.grad).detach()
                    state["momentum_v"] = momentum_v
                else:
                    momentum_v.mul_(gamma).add_(p.grad)

                p.add_(momentum_v, alpha = -lr)

        return loss

In [None]:
# Train a newly created model with Simple_Momentum (default settings) for 10 epochs.
model = create_model()
optimizer = Simple_Momentum(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

## Task 1

Create an optimizer for the Nesterov accelerated gradient (NAG) method.

In [None]:
class Simple_Nesterov(torch.optim.Optimizer):
    """Gradient descent with Nesterov accelerated gradient (NAG).

    This uses the common reformulation of NAG in which the stored parameters
    play the role of the look-ahead point, so the gradient produced by the
    usual backward pass can be used directly. Per parameter:

        v <- momentum_gamma * v + grad   (v is initialised to the first gradient)
        p <- p - lr * (grad + momentum_gamma * v)

    This is the same update as
    `torch.optim.SGD(lr=lr, momentum=momentum_gamma, nesterov=True)`.

    Args:
        params: Iterable of parameters (or parameter-group dicts) to optimise.
        lr: Learning rate.
        momentum_gamma: Decay factor applied to the previous velocity.
    """

    def __init__(self, params, lr = 1e-3, momentum_gamma = 0.9):
        defaults = dict(lr = lr, momentum_gamma = momentum_gamma)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one NAG update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The value returned by `closure`, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            gamma = group["momentum_gamma"]
            for p in group["params"]:
                # Parameters without a gradient (e.g. frozen layers) are skipped.
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]
                momentum_v = state.get("momentum_v")
                if momentum_v is None:
                    # Clone so that in-place velocity updates never modify
                    # the parameter's gradient tensor.
                    momentum_v = torch.clone(grad).detach()
                    state["momentum_v"] = momentum_v
                else:
                    momentum_v.mul_(gamma).add_(grad)

                # Look-ahead direction: the current gradient plus the decayed,
                # already updated velocity (out-of-place, grad stays intact).
                lookahead = grad.add(momentum_v, alpha = gamma)
                p.add_(lookahead, alpha = -lr)

        return loss

In [None]:
# Train a newly created model with Simple_Nesterov (default settings) for 10 epochs.
model = create_model()
optimizer = Simple_Nesterov(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

## Task 2

Create an optimizer for the Adam (adaptive moment estimation) method.

In [None]:
class Simple_ADAM(torch.optim.Optimizer):
    """Adam optimizer (adaptive moment estimation) with bias correction.

    Per parameter, with step counter `t` starting at 1:

        m <- beta_1 * m + (1 - beta_1) * grad
        v <- beta_2 * v + (1 - beta_2) * grad^2
        m_hat = m / (1 - beta_1^t)
        v_hat = v / (1 - beta_2^t)
        p <- p - lr * m_hat / (sqrt(v_hat) + epsilon)

    Args:
        params: Iterable of parameters (or parameter-group dicts) to optimise.
        lr: Learning rate.
        beta_1: Decay rate of the first-moment (mean) estimate.
        beta_2: Decay rate of the second-moment (uncentred variance) estimate.
        epsilon: Small constant added to the denominator for numerical stability.
    """

    def __init__(self, params, lr = 1e-3, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-8):
        defaults = dict(lr = lr, beta_1 = beta_1, beta_2 = beta_2, epsilon = epsilon)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one Adam update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The value returned by `closure`, or None if no closure was given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            beta_1 = group["beta_1"]
            beta_2 = group["beta_2"]
            epsilon = group["epsilon"]

            for p in group["params"]:
                # Parameters without a gradient (e.g. frozen layers) are skipped.
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]
                if len(state) == 0:
                    # Lazily create the per-parameter state on first use.
                    state["step"] = 0
                    state["first_moment"] = torch.zeros_like(p)
                    state["second_moment"] = torch.zeros_like(p)

                state["step"] += 1
                t = state["step"]
                first_moment = state["first_moment"]
                second_moment = state["second_moment"]

                # Exponential moving averages of the gradient and its square.
                first_moment.mul_(beta_1).add_(grad, alpha = 1 - beta_1)
                second_moment.mul_(beta_2).addcmul_(grad, grad, value = 1 - beta_2)

                # Both averages start at zero and are therefore biased towards
                # zero during the first steps; dividing by (1 - beta^t) undoes that.
                bias_correction_1 = 1 - beta_1 ** t
                bias_correction_2 = 1 - beta_2 ** t

                denominator = (second_moment / bias_correction_2).sqrt_().add_(epsilon)
                p.addcdiv_(first_moment, denominator, value = -lr / bias_correction_1)

        return loss

In [None]:
# Train a newly created model with Simple_ADAM (default settings) for 10 epochs.
model = create_model()
optimizer = Simple_ADAM(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)