Introduction to PyTorch

After reading Chapter 1 of Natural Language Processing with PyTorch, I decided that I wanted to learn more about the library before continuing reading the rest of the book. I read through the Introduction to PyTorch and Learning PyTorch tutorials on the PyTorch website, and here are the notes.

DOWNLOAD NOTEBOOK

2 488

Pytorch Documentation

PyTorch is an optimzed tensor library for deep learning using GPUs and CPUs. It is an opern source machine learning framework that accelerates the path from research prototyping to production deployment.

Introduction to PyTorch

The default datatypes for elements in PyTorch tensors is 32-bit floating point numbers. PyTorch tesors perform arithmetic operations intuitively. Tensors of similar shapes may be added, multiplied, etc. Operations with scalars are distributed over the tensor.

import torch # import pytorch

z = torch.zeros(5,3)
print(z)
print(z.dtype)

out[2]

tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
torch.float32

i = torch.ones((5,3),dtype=torch.int16) # Overide default dtype
print(i.dtype)

out[3]

torch.int16

torch.manual_seed(42) # Seed PRNG
rn = torch.rand(2,2)
print("A random tensor:",rn)

ones = torch.ones((2,3))
print(ones)
twos = torch.ones((2,3),dtype=torch.int16)*2
print(twos)
print(ones+twos)

out[4]

A random tensor: tensor([[0.8823, 0.9150],
[0.3829, 0.9593]])
tensor([[1., 1., 1.],
[1., 1., 1.]])
tensor([[2, 2, 2],
[2, 2, 2]], dtype=torch.int16)
tensor([[3., 3., 3.],
[3., 3., 3.]])

r = (torch.rand((2,2)) - 0.5) * 2 # Values between -1 and 1
print("A random matrix:")
print(r)

# Common Mathematical operations are supported
print("\nAbosulte value of r:")
print(torch.abs(r))

# As are trigonometric functions:
print("\nInverse sine of r:")
print(torch.asin(r))

# and linear algrebra operations like determinant and singular value decomposition
print("\nDeterminant of r:")
print(torch.det(r))
print("\nSingular Value Decomposition of r:")
print(torch.svd(r))

# and statsistical and aggegrate operations
print("\nAverage and Standard Deviation of r:")
print(torch.std_mean(r))
print("\nMax value of r:")
print(torch.max(r))

out[5]

A random matrix:
tensor([[ 0.8815, -0.7336],
[ 0.8692, 0.1872]])

Abosulte value of r:
tensor([[0.8815, 0.7336],
[0.8692, 0.1872]])

Inverse sine of r:
tensor([[ 1.0791, -0.8236],
[ 1.0536, 0.1883]])

Determinant of r:
tensor(0.8027)

Singular Value Decomposition of r:
torch.return_types.svd(
U=tensor([[-0.8322, -0.5545],
[-0.5545, 0.8322]]),
S=tensor([1.3170, 0.6095]),
V=tensor([[-0.9230, 0.3848],
[ 0.3848, 0.9230]]))

Average and Standard Deviation of r:
(tensor(0.7623), tensor(0.3011))

Max value of r:
tensor(0.8815)

PyTorch Models

import torch # for all things PyTorch
import torch.nn as nn # for torch.nn.Module, the parent object for PyTorch modules
import torch.nn.functional as F # for the activation fucntion

class LeNet(nn.Module):
  """
  Implementation of an abbridged version of LeNet-5
  Typical PyTprch Model:
  - Inherits from torch.nn.Module
  - A model will have an __init__() function, where it instantiates its layers, and loads any artifacts it might need (e.g., an NLP might need a vocab)
  - A model will have a forward() function. This is where the actual computation happens. An input is passed through the network layers and various functions to generate an output
  - Other than that, you can build a model however you want
  """
  def __init__(self):
    super(LeNet,self).__init__()
    # 1 input image channel (black and white), 6 output channels, 5x5 square convolution
    # kernel
    self.conv1 = nn.Conv2d(1,6,5)
    self.conv2 = nn.Conv2d(6,16,5)
    # an affine operation: y = Wx+b
    self.fc1 = nn.Linear(16*5*5,120) # 5*5 for image dimension
    self.fc2 = nn.Linear(120,84)
    self.fc3 = nn.Linear(84,10)
  def forward(self,x):
    # Max Pooling over a (2,2) window
    x = F.max_pool2d(F.relu(self.conv1(x)),(2,2))
    # If the size is a square you can only specify a single number
    x = F.max_pool2d(F.relu(self.conv2(x)),2)
    x = x.view(-1, self.num_flat_features(x))
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x
  def num_flat_features(selx,x):
    size = x.size()[1:] # all dimensions except the batch dimension
    num_features = 1
    for s in size:
      num_features *= s
    return num_features

net = LeNet()
print(net) # What does the object tell us about itself?
"""
A subclass of torch.nn.Module will report the layers it has created and their shapes and parameters. This can provide a handy overview of a model if you wnat to get the gist of its processing.
"""

input = torch.rand(1,1,32,32) # stand-in for a 32x32 black and white image

print('\nImage batch shape:')
print(input.shape)
"""
PyTorch models assume they are working with batches of data
"""

output = net(input) # We don;t call forward() directly
"""
Asking the model for an inference. The output of this call represents the model's confidence that the input represents a particualr digit.
"""
print('\nRaw output:')
print(output)
print(output.shape)

out[7]

LeNet(
(conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
(conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
(fc1): Linear(in_features=400, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc3): Linear(in_features=84, out_features=10, bias=True)
)

Image batch shape:
torch.Size([1, 1, 32, 32])

Raw output:
tensor([[ 0.0841, -0.0161, 0.0597, -0.0240, 0.0773, 0.0904, -0.0287, -0.0777,
0.0152, -0.1638]], grad_fn=<AddmmBackward0>)
torch.Size([1, 10])

Datasets and Dataloaders

Below is a demonstration of how to use one of the ready-to-download, open-acess datasets from TorchVision, how to transform the images for consumption by your model, and how to use the DataLoader to feed batches of data to your model.

%matplotlib inline
import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ToTensor(), # Converts images loaded by Pillow to PyTorch tensors
    # Adjusts the values of the tensor so that their avaerage is zero
    # and their standard deviation is 1.0. Most activation functions have their
    # strongest gradient around 0, so centering data their can speed learning
    # The values passed to the transform are the means (first tuple) and std
    # deviations of the rgb images in the dataset.
    transforms.Normalize(
        (0.4914, 0.4822, 0.4465),
         (0.2470, 0.2435, 0.2616)
    )
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

out[9]

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz

100%|██████████| 170498071/170498071 [00:02<00:00, 60459109.08it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data

This is an example of creating a dataset object in PyTorch. Downloadable datasets (like CIFAR-10 above) are subclasses of torch.utils.data.Dataset. Dataset classes in PyTorch include the downloadable datasets in TorchVision, Torchtext, and TorchAudio, as well as utility dataset classes such as torchvision.datasets.ImageFolder, which will read a folder of labeled images.

When you instantiate dataset, you need to tell it:

the filesystem path to where we want the data to go
whether or not we are using this set for training; most datasets will be split into training/test
Whether we would like to download the dataset if we haven't already
the transformers we want to apply to the data

"""
Give batches of 4 images from trainset, randomize order, and use two
workers to load data
"""
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=4, # 4 images from trainset
    shuffle=True, # randomize order
    num_workers=2 # use two workers to load data
)

out[11]

A Dataset subclass wraps access to the data, and is specialized to the type of data it's serving. The DataLoader knows nothing about the data, but organizes the input tensors served by the Dataset into batches with the parametrs you specify.

import matplotlib.pyplot as plt
import numpy as np

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))


# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

out[13]

WARNING:matplotlib.image:Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).

dog truck horse frog

Training your PyTorch Model

#%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))


# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

criterion = nn.CrossEntropyLoss() # Loss Function
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) # Optimizer

out[15]

Files already downloaded and verified
Files already downloaded and verified
bird ship plane truck

The loss function is a measure of how far from our ideal output the model's prediction was. Cross-entropy loss is a typical loss function for classification models like ours.

The optimizer is what drives the learning.

for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs) # Ask the model for its predictions
        loss = criterion(outputs, labels) # Compute difference between correct and predicted outputs
        loss.backward() # Calculate the gradients that will direct the learning
        optimizer.step() # Perform one leanring step
        # uses the gradients from teh backward() call to nudge the
        # learning weights in the direction that it thinks will reduce the loss

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

out[17]

[1, 2000] loss: 2.186
[1, 4000] loss: 1.854
[1, 6000] loss: 1.649
[1, 8000] loss: 1.563
[1, 10000] loss: 1.516
[1, 12000] loss: 1.472
[2, 2000] loss: 1.412
[2, 4000] loss: 1.379
[2, 6000] loss: 1.357
[2, 8000] loss: 1.319
[2, 10000] loss: 1.309
[2, 12000] loss: 1.293
Finished Training

The code above goes through 2 training epochs - that is, 2 passes over the training dataset. Each pass has an inner loop that iterates over the training data, serving batches of transformed input images and their correct labels.

Zeroing the gradients is an important step. Gradients are accumulated over a batch; if we do not reset them for every batch, they will keep accumulating, which will provide incorrect gradient values, making learning impossible.

correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

out[19]

Accuracy of the network on the 10000 test images: 54 %

Introduction to PyTorch Tensors

Tensors are the central data abstraction in PyTorch. torch.Tensor is an alias for torch.FloatTensor. By default, tensors are populated with 32-bit floating point numbers. Manually seeding before generating random numbers ensures repoducability of results.

import torch
import math
"""
Creating Tensors
"""
# empty() call allocates memort for the tensor, but does not initialize it with
# any values
x = torch.empty((3,4)) # Simplest way to create a tensor
print(x)

zeros = torch.zeros((2,3))
print(zeros)

ones = torch.ones((2,3))
print(ones)

torch.manual_seed(1729)
random = torch.randn((2,3))
print(random)

out[21]

tensor([[-3.1763e+06, 4.4573e-41, -1.4530e+20, 3.2381e-41],
[ 4.2599e-43, 0.0000e+00, nan, nan],
[ 7.7330e+28, 4.4573e-41, 7.7330e+28, 4.4573e-41]])
tensor([[0., 0., 0.],
[0., 0., 0.]])
tensor([[1., 1., 1.],
[1., 1., 1.]])
tensor([[-1.1257, -0.0057, -1.3975],
[ 1.4364, -0.1068, -0.8413]])

Tensor Shapes

torch.*_like() methods lets you create a tensor of the same shape as another tensor. The .shape property of a tensor contains a list of the extent of each dimension of a tensor. Using torch.tensor() is the most straightforward way to create a tensor of you already have data in a Python tuple or list.

Tensor Data Types

You can set the datatype of a tensor multiple ways. When using the factory methods to create a tensor, you can specify the tensor's shape as a series of integer arguments or choose to group them together in a tuple.

Math and Logic with PyTorch Tensors

Arithmetic operations between tensors and scalars, such as addition, subtraction, multiplication, and exponentiation are distributed over every element of the tensor. Similar operations between two tensors also behave as you'd intuitively expect. The same rules for broadcasting in NumPy apply here.

PyTorch tensors have over three hundred operations that can be perofrmed on them.

Most binary operations on tensors will return a third, new tensor. There are ties that you may wish to alter a tensor in place. For this, most of the math functions have a version with an appended underscore (_) that will alter the tensor in place.

Many of the methods and functions receive an out keyword argument that lets you specify a tensor to receive the output - if the tensor has the correct shape and dtype, then this can happen without a new memory allocation.

"""
Tensor Shapes
"""
x = torch.empty((2,2,3))
print(x.shape)
empty_like_x = torch.empty_like(x)
print(empty_like_x.shape)
zeros_like_x = torch.zeros_like(x)
print(zeros_like_x.shape)
ones_like_x = torch.ones_like(x)
print(ones_like_x.shape)
random_like_x = torch.randn_like(x)
print(random_like_x.shape)

"""
Tensor Data Types
"""
a = torch.ones((2,3),dtype=torch.int16)
b = torch.randn((2,3),dtype=torch.float64) * 20
print(b)
c = b.to(torch.int32)
print(c)

"""
Math and Logic with PyTorch Tensors
"""
ones = torch.zeros((2,2)) + 1
twos = ones * 2
threes = (torch.ones((2,2)) * 7 - 1) / 2
fours = twos ** 2
sqrt2s = twos ** 0.5

print(ones)
print(twos)
print(threes)
print(fours )
print(sqrt2s)

powers2 = twos ** torch.tensor([[1,2],[3,4]])
print(powers2)
fives = ones + fours
print(fives)
dozens = threes * fours
print(dozens)

out[23]

torch.Size([2, 2, 3])
torch.Size([2, 2, 3])
torch.Size([2, 2, 3])
torch.Size([2, 2, 3])
torch.Size([2, 2, 3])
tensor([[ 9.7309, -19.2977, -0.2074],
[-36.3183, 4.9187, -23.8042]], dtype=torch.float64)
tensor([[ 9, -19, 0],
[-36, 4, -23]], dtype=torch.int32)
tensor([[1., 1.],
[1., 1.]])
tensor([[2., 2.],
[2., 2.]])
tensor([[3., 3.],
[3., 3.]])
tensor([[4., 4.],
[4., 4.]])
tensor([[1.4142, 1.4142],
[1.4142, 1.4142]])
tensor([[ 2., 4.],
[ 8., 16.]])
tensor([[5., 5.],
[5., 5.]])
tensor([[12., 12.],
[12., 12.]])

b:
tensor([0.0000, 0.7854, 1.5708, 2.3562])
tensor([0.0000, 0.7071, 1.0000, 0.7071])
tensor([0.0000, 0.7071, 1.0000, 0.7071])

"""
More Math with Tensors
"""
a = torch.rand((2,4)) * 2 - 1
# Common Functions
print(torch.clamp(a,-0.5,0.5))
# Comparisons
d = torch.tensor([[1.,2.],[3.,4.]])
e = torch.ones(1,2) # Many comparison operations support broadcasting
print(torch.eq(d,e)) # Returns a Tensor of type bool
# Reductions
print(torch.max(d)) # Returns a single-element tensor
print(torch.max(d).item()) # Extracts the value from the returned tensor
print(torch.unique(torch.tensor([1,2,1,2,1,2,3]))) # Filter unique elements

# vector and linear algebra operations
v1 = torch.tensor([1., 0., 0.])         # x unit vector
v2 = torch.tensor([0., 1., 0.])         # y unit vector
m1 = torch.rand(2, 2)                   # random matrix
m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix
print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1)
print(m1)
m3 = torch.linalg.matmul(m1, m2)
print(m3)                  # 3 times m1
print(torch.linalg.svd(m3))       # singular value decomposition


# Altering Tensor in Place
b = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4])
print('\nb:')
print(b)
print(torch.sin_(b))  # note the underscore
print(b)

a = torch.ones(2, 2)
b = torch.rand(2, 2)

print('Before:')
print(a)
print(b)
print('\nAfter adding:')
print(a.add_(b))
print(a)
print(b)

out[24]

tensor([[-0.4253, 0.5000, -0.5000, 0.4164],
[ 0.5000, 0.3570, 0.5000, 0.5000]])
tensor([[ True, False],
[False, False]])
tensor(4.)
4.0
tensor([1, 2, 3])
tensor([ 0., 0., -1.])
tensor([[0.6644, 0.7673],
[0.4209, 0.3795]])
tensor([[1.9932, 2.3019],
[1.2628, 1.1384]])
torch.return_types.linalg_svd(
U=tensor([[-0.8739, -0.4860],
[-0.4860, 0.8739]]),
S=tensor([3.4826, 0.1831]),
Vh=tensor([[-0.6764, -0.7365],
[ 0.7365, -0.6764]]))

b:
tensor([0.0000, 0.7854, 1.5708, 2.3562])
tensor([0.0000, 0.7071, 1.0000, 0.7071])
tensor([0.0000, 0.7071, 1.0000, 0.7071])
Before:
tensor([[1., 1.],
[1., 1.]])
tensor([[0.9197, 0.9399],
[0.7340, 0.7090]])

After adding:
tensor([[1.9197, 1.9399],
[1.7340, 1.7090]])
tensor([[1.9197, 1.9399],
[1.7340, 1.7090]])
tensor([[0.9197, 0.9399],
[0.7340, 0.7090]])

Copying Tensors

As with any object in Python, assigning a tensor to a variable makes the variable a labvel of the tensor, and does not copy it. If you need a separate copy of the data, you can use the clone() method. An important thing to be aware of when using clone(): if your source tensor has autograd enabled, then so will the clone. If this behavior is not preferable, then you can use .detach().clone().

a = torch.ones(2, 2)
b = a

a[0][1] = 561  # we change a...
print(b)       # ...and b is also altered

b = a.clone()

assert b is not a      # different objects in memory...
print(torch.eq(a, b))  # ...but still with the same contents!

a[0][1] = 561          # a changes...
print(b)               # ...but b is still all ones

Moving to GPU

One of the major advantages of PyTorch is its robust acceleration on CUDA-compatible NVIDIA GPUs.

First check whether a GPU is available, with the is_available() method:

if torch.cuda.is_available():
    print('We have a GPU!')
else:
    print('Sorry, CPU only.')

Once yoy;ve determined that one or more GPUs is available, you need to put the data someplace where the GPU can see it. The GPU has dedictaed memory attacked to it. Whenever you want to perform a computation on a device, you must move all the data needed for that computation to memory accessible by that device. By default, new tensors are created on the CPU, so we have to specify when we want to create our tensor on the GPU with the optional device argument.

When doing computation involving two or more tensors, they all must be on the same device.

if torch.cuda.is_available():
    # Specifying the target device at Tensor creation time
    gpu_rand = torch.rand(2, 2, device='cuda')
    print(gpu_rand)
else:
    print('Sorry, CPU only.')

# More roubust way of specifying the device that will handle Tensors
if torch.cuda.is_available():
    my_device = torch.device('cuda')
else:
    my_device = torch.device('cpu')
print('Device: {}'.format(my_device))

x = torch.rand(2, 2, device=my_device)
print(x)

# Transfering existing Tensor to a certain device
y = torch.rand(2, 2)
y = y.to(my_device)

Manipulating Tensor Shapes

Sometimes you will need to change the shape of your tensor. You can use the unsqueeze() method to add a dimension of extent 1 and the squeeze() method to remove a dimension of extent 1. These methods also have inplace versions with an appended underscore. The reshape() method allows you to more radically change the shape of a Tensor (note that the number of elements must be the same before and after).

NumPy Bridge

Use the torch.from_numpy() and tensor.numpy() methods to convert a NumPy array to a tensor and a tensor to a NumPy array, respectively.

"""
Changing a (3,226,226) tensor to (1,2,226,226) tensor
"""
a = torch.rand(3, 226, 226)
"""
The unsqueeze() method adds a dimension of extent 1. unsqueeze(0) adds it
as a new zeroth dimension - so now you have a batch of one.
"""
b = a.unsqueeze(0)

print(a.shape)
print(b.shape)

a = torch.rand(1, 20)
print(a.shape)
print(a)

b = a.squeeze(0)
print(b.shape)
print(b)

c = torch.rand(2, 2)
print(c.shape)

d = c.squeeze(0)
print(d.shape)

out[26]

torch.Size([3, 226, 226])
torch.Size([1, 3, 226, 226])
torch.Size([1, 20])
tensor([[0.5848, 0.6677, 0.2326, 0.0459, 0.8976, 0.0565, 0.5108, 0.4328, 0.5471,
0.7946, 0.0084, 0.5709, 0.6242, 0.8383, 0.5668, 0.8762, 0.8578, 0.3846,
0.3876, 0.7604]])
torch.Size([20])
tensor([0.5848, 0.6677, 0.2326, 0.0459, 0.8976, 0.0565, 0.5108, 0.4328, 0.5471,
0.7946, 0.0084, 0.5709, 0.6242, 0.8383, 0.5668, 0.8762, 0.8578, 0.3846,
0.3876, 0.7604])
torch.Size([2, 2])
torch.Size([2, 2])

The Fundamentals of Autograd

PyTorch's Autograd feature is part of what make PyTorch flexible and fast for building machine leanring projects. It allows for the rapis and easy computation of multiple partial derivatives (also referred to as gradients) over a complex computation. The operation is central to backpropagation-based neural networks learning.

The power of autograd comes from the fact that it traces your computation dynamically at runtime, meaning that if your model has decision boundaries, or loops whose lengths are not known until runtime, the computation will still be traced correctly, and you'll get correct gradients to drive learning.

Every computed tensor in your PyTorch model carries a history of its input tensors and the function used to create it. Combined with the fact that PyTorch functions meant to act on tensors each have a built-in implementation of computing their own derivatives, this greatly speeds the computation of the local derivatives needed for learning.

Beware that only leaf nodes of the computation have their gradients computed.

Turning Autograd Off and On

You can set the requires_grad property of the tensor. You can temporarily turn autograd on or off using the methods below. When you copy an object with autograd on, you can use the detach() method to detach the clone from the computation history. Note: you should detach an object with autograd on before using it in matplotlib. You should be careful about using in-place operations when using autograd.

a = torch.ones(2, 3, requires_grad=True) * 2
b = torch.ones(2, 3, requires_grad=True) * 3
# If you need autograd temporarily off
with torch.no_grad():
    c2 = a + b

with torch.enable_grad():
    c2 = a + b


# This can also be used as a function or method decorator
@torch.no_grad() # temp off
def add_tensors2(x, y):
    return x + y

@torch.enable_grad() # temp on
def add_tensors2(x, y):
    return x + y

# If you need autograd temporarily on

# %matplotlib inline

import torch

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import math

# Setting requires_grad = True means that in every computation that
# follows, autograd will be accumulating the history of the computation in
# the output tensors of that computation.
a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
print(a)
b = torch.sin(a)
plt.plot(a.detach(), b.detach(),label=r"2 $ \cdot $ sin(a)")


c = 2*b
d = c+1
out = d.sum()

"""
Each grad_fn stored with our tensors allows you to walk the computation
all the way back to its inputs with its next_function property. We can
see below that drilling down this property on d dhows us the
gradient functions for all prior tensors.
"""
print('d:')
print(d.grad_fn)
print(d.grad_fn.next_functions)
print(d.grad_fn.next_functions[0][0].next_functions)
print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions)
print('\nc:')
print(c.grad_fn)
print('\nb:')
print(b.grad_fn)
print('\na:')
print(a.grad_fn)

out.backward()
print(a.grad)

plt.gcf().gca().plot(a.detach(), a.grad.detach(),'-r',label=r"2 $ \cdot $ cos(a)")
plt.legend()
plt.show()

out[28]

tensor([0.0000, 0.2618, 0.5236, 0.7854, 1.0472, 1.3090, 1.5708, 1.8326, 2.0944,
2.3562, 2.6180, 2.8798, 3.1416, 3.4034, 3.6652, 3.9270, 4.1888, 4.4506,
4.7124, 4.9742, 5.2360, 5.4978, 5.7596, 6.0214, 6.2832],
requires_grad=True)
d:
<AddBackward0 object at 0x7ad586b51540>
((<MulBackward0 object at 0x7ad586aeeec0>, 0), (None, 0))
((<SinBackward0 object at 0x7ad586b51540>, 0), (None, 0))
((<AccumulateGrad object at 0x7ad586b50970>, 0),)
()

c:
<MulBackward0 object at 0x7ad586c1cb80>

b:
<SinBackward0 object at 0x7ad586c1cb80>

a:
None
tensor([ 2.0000e+00, 1.9319e+00, 1.7321e+00, 1.4142e+00, 1.0000e+00,
5.1764e-01, -8.7423e-08, -5.1764e-01, -1.0000e+00, -1.4142e+00,
-1.7321e+00, -1.9319e+00, -2.0000e+00, -1.9319e+00, -1.7321e+00,
-1.4142e+00, -1.0000e+00, -5.1764e-01, 2.3850e-08, 5.1764e-01,
1.0000e+00, 1.4142e+00, 1.7321e+00, 1.9319e+00, 2.0000e+00])

Building Models with PyTorch

The torch.nn.Module class is the base class meant to encapsulate behaviors specific to PyTorch Models and their components. One important behavior of torch.nn.Module is registering parameters. If a particular Module subclass has learning weights, these weighs are expressed as instances of torch.nn.Parameter. The Parameter class is a subclass of torch.Tensor, with the special behavior that when they are assigned as attributes of a Module, they are added add to the list of that modules parameters. These parameters may be accessed throught the parameters() method on the Module class.

import torch

class TinyModel(torch.nn.Module):
  """
  Example of a very simple model with two linear layers and an activation
  function
  """
  def __init__(self):
    super(TinyModel,self).__init__()
    self.linear1 = torch.nn.Linear(100,200)
    self.activation = torch.nn.ReLU()
    self.linear2 = torch.nn.Linear(200,10)
    self.softmax = torch.nn.Softmax()
  def forward(self,x):
    x = self.linear1(x)
    x = self.activation(x)
    x = self.linear2(x)
    x = self.softmax(x)
    return x

tinymodel = TinyModel()
print("The model:")
print(tinymodel)

print("\n\nJust one Layer:")
print(tinymodel.linear2)

print('\n\nModel params:')
for param in tinymodel.parameters():
    print(param)

print('\n\nLayer params:')
for param in tinymodel.linear2.parameters():
    print(param)

out[30]

The model:
TinyModel(
(linear1): Linear(in_features=100, out_features=200, bias=True)
(activation): ReLU()
(linear2): Linear(in_features=200, out_features=10, bias=True)
(softmax): Softmax(dim=None)
)

Just one Layer:
Linear(in_features=200, out_features=10, bias=True)

Model params:
Parameter containing:
tensor([[ 0.0069, 0.0788, 0.0563, ..., 0.0156, 0.0763, -0.0016],
[ 0.0587, 0.0940, 0.0106, ..., 0.0511, -0.0811, 0.0419],
[ 0.0628, 0.0282, 0.0127, ..., -0.0207, 0.0359, 0.0552],
...,
[-0.0001, -0.0043, 0.0694, ..., 0.0559, 0.0600, 0.0405],
[-0.0344, 0.0286, -0.0117, ..., 0.0177, 0.0956, 0.0956],
[ 0.0558, 0.0355, 0.0586, ..., 0.0630, 0.0939, -0.0931]],
requires_grad=True)
Parameter containing:
tensor([-0.0733, -0.0336, -0.0033, 0.0733, 0.0946, 0.0478, 0.0013, -0.0225,
0.0679, -0.0497, -0.0829, -0.0479, -0.0822, -0.0515, 0.0556, -0.0902,
-0.0051, 0.0662, -0.0506, -0.0406, -0.0395, -0.0941, -0.0099, -0.0489,
0.0472, -0.0795, -0.0070, 0.0690, -0.0354, -0.0125, 0.0648, -0.0023,
-0.0334, -0.0891, -0.0992, -0.0631, 0.0284, -0.0874, -0.0817, -0.0079,
0.0747, 0.0568, 0.0428, 0.0122, 0.0190, -0.0741, -0.0305, 0.0358,
-0.0296, 0.0087, 0.0860, 0.0414, -0.0061, -0.0903, 0.0740, -0.0909,
0.0022, 0.0557, -0.0944, -0.0893, -0.0984, -0.0713, 0.0185, -0.0274,
0.0054, -0.0528, 0.0463, -0.0959, 0.0292, -0.0076, -0.0911, 0.0502,
-0.0908, 0.0708, 0.0161, -0.0151, -0.0791, -0.0079, 0.0382, 0.0486,
-0.0603, -0.0903, -0.0825, -0.0434, -0.0185, -0.0797, 0.0705, -0.0272,
0.0799, 0.0944, -0.0426, -0.0509, -0.0871, 0.0461, 0.0804, -0.0114,
-0.0222, -0.0261, 0.0885, 0.0127, -0.0229, 0.0218, -0.0749, -0.0888,
0.0670, -0.0506, 0.0056, -0.0899, 0.0578, 0.0782, -0.0854, 0.0336,
0.0254, -0.0366, -0.0463, 0.0032, 0.0131, 0.0954, -0.0813, 0.0317,
0.0680, -0.0306, -0.0885, 0.0001, 0.0134, 0.0996, -0.0360, 0.0143,
0.0920, -0.0210, -0.0400, -0.0229, -0.0985, 0.0872, 0.0945, 0.0128,
0.0828, 0.0636, 0.0386, -0.0371, 0.0170, -0.0054, -0.0108, 0.0478,
0.0294, -0.0136, 0.0965, -0.0807, 0.0076, -0.0612, 0.0710, 0.0553,
-0.0634, -0.0609, 0.0222, -0.0280, -0.0481, 0.0816, 0.0871, -0.0696,
0.0052, -0.0635, -0.0213, 0.0456, -0.0429, 0.0946, -0.0138, 0.0823,
-0.0316, 0.0609, 0.0183, 0.0070, -0.0831, -0.0694, -0.0860, 0.0781,
0.0307, 0.0386, 0.0917, -0.0753, -0.0756, 0.0651, -0.0740, -0.0295,
0.0304, 0.0064, -0.0386, 0.0245, 0.0192, 0.0924, 0.0809, 0.0133,
-0.0693, 0.0220, 0.0365, -0.0054, -0.0526, 0.0450, -0.0906, -0.0366],
requires_grad=True)
Parameter containing:
tensor([[ 0.0521, 0.0543, 0.0354, ..., -0.0539, 0.0545, 0.0063],
[ 0.0197, 0.0307, 0.0652, ..., -0.0424, 0.0206, -0.0555],
[-0.0084, 0.0296, 0.0530, ..., 0.0600, -0.0488, 0.0087],
...,
[ 0.0116, -0.0546, -0.0656, ..., 0.0481, -0.0054, -0.0336],
[ 0.0129, -0.0355, 0.0223, ..., -0.0054, 0.0535, -0.0125],
[ 0.0238, 0.0411, 0.0606, ..., 0.0200, -0.0625, 0.0157]],
requires_grad=True)
Parameter containing:
tensor([-0.0116, -0.0061, 0.0229, -0.0315, 0.0605, -0.0411, 0.0557, -0.0424,
0.0321, 0.0542], requires_grad=True)

Layer params:
Parameter containing:
tensor([[ 0.0521, 0.0543, 0.0354, ..., -0.0539, 0.0545, 0.0063],
[ 0.0197, 0.0307, 0.0652, ..., -0.0424, 0.0206, -0.0555],
[-0.0084, 0.0296, 0.0530, ..., 0.0600, -0.0488, 0.0087],
...,
[ 0.0116, -0.0546, -0.0656, ..., 0.0481, -0.0054, -0.0336],
[ 0.0129, -0.0355, 0.0223, ..., -0.0054, 0.0535, -0.0125],
[ 0.0238, 0.0411, 0.0606, ..., 0.0200, -0.0625, 0.0157]],
requires_grad=True)
Parameter containing:
tensor([-0.0116, -0.0061, 0.0229, -0.0315, 0.0605, -0.0411, 0.0557, -0.0424,
0.0321, 0.0542], requires_grad=True)

Common Layer Types

Linear Layers

The most basic type of neural network layer is a linear or fully connected layer. This is a layer where every input influences every output of the layer to a degree specified by the layer's weights. If a model has m inputs and n outputs, the weights will be an m x n matrix.

If you do the matrix multiplication of x by the linear layer's weights and add the biases, you'll find that you get the output vector y. The default behavior of Parameter is to track gradients with autograd - this differs from Tensor.

lin = torch.nn.Linear(3, 2)
x = torch.rand(1, 3)
print('Input:')
print(x)

print('\n\nWeight and Bias parameters:')
for param in lin.parameters():
    print(param)

y = lin(x)
print('\n\nOutput:')
print(y)

Convolutional Layers

Convolutional layers are built to handle data with a high degree of spatial correlation. They are commonly used in computer vision, where they detect close groupings of features which they compose into higher-level features. They are used in NLP contexts as well.

For an example, see the convolutional layers in the LeNet example above.

The first argument to a convolutional layer is the number of input channels (number of color channels in an image).
The second argument to the constructor is the number of out features - a convolutional layer is like a window that scans over the image, looking for a pattern it recognizes, these patterns are called features, and one parameter of a convolutional layer is the number of features we would like to earn.
The Third argument is the window or kernel size. A scalar value means that you want a sqaure kernel, but you can also specify a rectangular kernel with a tuple.

The output of a convolutional layer is an activation map - a spatial representation of the presence of features in the input tensor.

There are convolutional layers for addressing 1D, 2D, and 3D tensors. You can also specify the stride length in theinput, padding, and more.

Recurrent Layers

Recurrent Neural Networks (or RNNs) are used for sequential data. An RNN maintains a hidden state that acts as a sot of memory for what it has seen in the sequence so far. The internal structure of an RNN layer - or its variants, the LSTM (long short-term memory) and GRU (gated recurrent unit) - is moderately complex.

Transformers

Transformers are multi-purpose networks that have taken over the state of the art in NLP with models like BERT. PyTorch has a Transformer class that allows you to define the overall parameters of a transformer model - the number of attention heads, the number of encoder & decoder layers, dropout and activation functions, etc.

Data Manipulation Layers

There are layers that perform important functions in models, but don't participate in the learning process themselves.

Max Pooling (torch.nn.MaxPool2d()) reduces a tensor by combining cells, and assigning the value of the input cells to the output cell.
*Normalization (torch.nn.BatchNorm1d()) re-center and normalize the output of one layer before feeding it to another. Centering and scaling the intermediate tensors has a number of beneficial effects, such as letting you use higher learning rates without exploding/vanishing gradients.
Dropout Layers (torch.nn.Dropout()) are a tool for encouraging sparse representations in your model - that is, pushing it to do inference with less data.

torch.nn.Module has objects encapsulating all of the major activation functions including ReLU and its many variants.

PyTorch contains a variety of loss functions.

PyTorch TensorBoard Support

PyTorch has TensorBoard support.

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("path_to_log_dir")
# Add an image to teh TensorBoard log dir
writer.add_image("NAME",image)

# add_graph() will trace the sample input through your model,
# and render it as a graph.
writer.add_graph(model, input_values)

# Log Embeddings
# The add_embedding() method will project a set of data onto the three
# dimensions with th highest variance, and display them as an interactive
# 3D chart
writer.add_embedding(features,metadata=class_labels,label_img=images.unsqueeze(1))

Training with PyTorch

The Dataset and DataLoader classes encapsulate the process of pulling your data from storage and exposing it to your training loop in batches. The Dataset is responsible for accessing and processing single instances of data. The DataLoader pulls instances of data from the Dataset, collects them in batches, and returns them for consumption by your training loop. The DataLoader works with all kinds of datasets.

# Create data loaders for our datasets; shuffle for training, not for validation
training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False)

Model Understanding with Captum

Captum is an open source, extensible library for model interprebility built on top of PyTorch. Captum provides state-of-the-art algorithms, including Integrated Gradients, to provide researchers and developers with an easy way to understand which features are contributing to a model's output.

Captum's approach to model interprability is in terms of attributions:

Feature Attribution seeks to explain a particular output in terms of features of the input that generated it. Explaining whether a movie review was positive or negative in terms of certain words in the review is an example of feature attribution.
Layer Attribution examines the activity of a model's hidden layer subsequent to a particular input. Examining the spatially-mapped output of a convolutional layer in response to an input image in an example of layer attribution.
Neuron Attribution is analagous to layer attribution, but focuses on the activity of a single neuron.

Each of the three attribution types has multiple attribution algorithms associated with it.

Gradient-based algorithms calculate the backward gradients of a model output, layer output, or neuron activation with respect to the input. Integrated Gradients (for features), Layer Gradient Activation, and Neuron Conductance are all gradient-based algorithms.
Perturbation-based algorithms examine the changes in the output of a model, layer, or neuron in response to changes in the input. The input perturbations may be directed or random. Occlusion, Feature Ablation, and Feature Permutation are all perturbation-based algorithms.

Visualizing attribution data in ways that realte it easily to the input features being examined can be valuable. captum.attr.visualization module provides helpful functions for visualuzing attributions related to images.

Learning PyTorch

Tensors

Tensors are a specialized data structure taht are very similar to array and matrices. In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model's parameters.

"""
Tensor Initialization
"""
import torch
data = [[1,2], [3,4]]
x_data = torch.tensor(data)
import numpy as np
np_array = np.array(data)
x_np = torch.from_numpy(np_array)
x_ones = torch.ones_like(x_np)
x_rand = torch.rand_like(x_data,dtype=torch.float)
"""
With Random or Constant Values
"""
shape = (2,3,)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)
"""
Tensor Attributes
"""
print(f"Shape of tensor: {rand_tensor.shape}")
print(f"Datatype of tensor: {rand_tensor.dtype}")
print(f"Device tensor is stored on: {rand_tensor.device}")
"""
Tensor Operations
"""
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

"""
Standard NumPy-like Indexing and Slicing
"""
tensor = torch.ones((4,4),device=device)
tensor[:,1] = 0
print(tensor)

"""
Joining Tnesors:

You can use torch.cat to concatenate a sequence of tensors along a given dimension
"""
t1 = torch.cat([tensor,tensor,tensor,tensor],dim=1)
print(t1)

"""
Multiplying Tensors
"""
# This computes the element-wise product
print(f"tensor.mul(tensor) \n {tensor.mul(tensor)} \n")
# Alternative syntax:
print(f"tensor * tensor \n {tensor * tensor}")

print(f"tensor.matmul(tensor.T) \n {tensor.matmul(tensor.T)} \n")
# Alternative syntax:
print(f"tensor @ tensor.T \n {tensor @ tensor.T}")

out[36]

Shape of tensor: torch.Size([2, 3])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu
tensor([[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.]])
tensor([[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]])
tensor.mul(tensor)
tensor([[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.]])

tensor * tensor
tensor([[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.],
[1., 0., 1., 1.]])
tensor.matmul(tensor.T)
tensor([[3., 3., 3., 3.],
[3., 3., 3., 3.],
[3., 3., 3., 3.],
[3., 3., 3., 3.]])

tensor @ tensor.T
tensor([[3., 3., 3., 3.],
[3., 3., 3., 3.],
[3., 3., 3., 3.],
[3., 3., 3., 3.]])

A Gentle Introduction to torch.autograd

torch.autograd is PyTorch's automatic differentiation engine that powers neural network training.

Neural Networks (NNs) are a collection of nested functions that are executed on some input data. These functions are defined by parameters (consisting of weights and biases), which in PyTorch are stored in tensors. Training happens in two steps:

Forward Propagation: In forward prop, the NN makes its best guess about the correct output. It runs the input data through each of its functions to make this guess.
Backward Propagation: In backprop, the NN adjusts its parameters proportionate to the error in its guess. It does this by traversing backwards from the output, collecting the derivatives of the error with respect to the parameters of the functions (gradients), and optimizing the parameters using gradient descent.

import torch
# Loas a pretrained resnet18 model from torchvision
from torchvision.models import resnet18, ResNet18_Weights
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data - torch.rand((1,3,64,64)) # 3 - channel 64 x 64 image batch of 1
labels = torch.rand(1,1000) # Label initialized to some random values
prediction = model(data) # forward pass
loss = (prediction - labels).sum() # Calculate loss
# The backward() method kicks off backpropagation.
# Autograd calculates and stroes the gradientrs for each model parameter in the parameter's .grad attribute
loss.backward() # Backward pass

# Load an optimizer - in this case SGD with a learning rate of 0.01 and a momentum of 0.9
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# Call step() to initiate gradient descent. The optimizer adjusts
# each parameter by its gradient stored in .grad
optim.step()

out[38]

Learning PyTorch with Examples

Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The forward function computes output Tensors from input Tensors. The backward function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In PyTorch, you can easily define your own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward dunctions.

In TensorFlow, packages like Keras provide higher-level abstractions over raw computational graphs that are useful for building neural networks. In PyTorch, the nn package serves this same purpose. The nn packages defines a set of Modules, which are roughly equivalent to neural network layers. A module receives input Tensors and computes output Tensors, but may hold internal state such as Tensors containing learnable parameters. The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

The optim package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.

Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for those cases you can define yourown Modules by subclassing nn.Module and defining a forward which recieves input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

import math
import torch

class LegendrePolynomial3(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return 0.5 * (5 * input ** 3 - 3 * input)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss with respect to the output, and we need to compute the gradient of
        the loss with respect to the input.
        """
        input, = ctx.saved_tensors
        return grad_output * 1.5 * (5 * input ** 2 - 1)

class Polynomial3(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate four parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must
        return a Tensor of output data. We can use Modules defined in the
        constructor as well as arbitrary operators on Tensors.
        """
        return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'

out[40]

What is torch.nn really?

PyTorch provides the elegantly designed modules and classes torch.nn , torch.optim , Dataset , and DataLoader to help you create and train neural networks. In order to fully utilize their power and customize them for your problem, you need to really understand exactly what they're doing. To develop this understanding, we will first train basic neural net on the MNIST data set without using any features from these models; we will initially only use the most basic PyTorch tensor functionality. Then, we will incrementally add one feature from torch.nn, torch.optim, Dataset, or DataLoader at a time, showing exactly what each piece does, and how it works to make the code either more concise, or more flexible.

from pathlib import Path
import requests

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True,exist_ok=True)

URL = "https://github.com/pytorch/tutorials/raw/main/_static/"
FILENAME = "mnist.pkl.gz"

if not (PATH / FILENAME).exists():
  content = requests.get(URL + FILENAME).content
  (PATH / FILENAME).open("wb").write(content)

out[42]

"""
The dataset is in numpy array format, and has been stored using pickle
"""
import pickle
import gzip

with gzip.open((PATH/FILENAME).as_posix(),"rb") as f:
  ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")

"""
Each image is 28 x 28, and is being stored as flattened row of length 784.
"""

from matplotlib import pyplot
import numpy as np

pyplot.imshow(x_train[0].reshape((28, 28)), cmap="gray")
# ``pyplot.show()`` only if not on Colab
try:
    import google.colab
except ImportError:
    pyplot.show()
print("x_train shape:",x_train.shape)

"""
PyTorch uses torch.tensor, rather than numpy arrays, so we need to convert our
data
"""
import torch

x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid)
)
n, c = x_train.shape
print("x_train shape (after transformation):",x_train.shape)
print("y_train min, y_train max:",y_train.min(),",",y_train.max())

out[43]

x_train shape: (50000, 784)
x_train shape (after transformation): torch.Size([50000, 784])
y_train min, y_train max: tensor(0) , tensor(9)

import math
# Xavier Initialization (http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)
weights = torch.randn(784,10) / math.sqrt(784)
# Setting requires_grad after initialization, since we don;t want that step
# included in the gradient
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)

def log_softmax(x):
  """
  Activation function
  """
  return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
  """
  Matrix multiplication and broadcasted addition
  to create a simple linear model
  """
  return log_softmax(xb @ weights + bias)

bs = 64  # batch size

xb = x_train[0:bs]  # a mini-batch from x
# Call the function on the data - forward pass
preds = model(xb)  # predictions
print(preds[0], preds.shape)

out[44]

tensor([-2.6933, -2.2950, -2.3172, -2.5605, -2.7853, -2.0275, -2.3917, -1.8987,
-1.9486, -2.5421], grad_fn=<SelectBackward0>) torch.Size([64, 10])

def nll(input, target):
  """
  Non negative log-likelihood to use as the loss function
  """
  return -input[range(target.shape[0]), target].mean()

loss_func = nll
yb = y_train[0:bs]
print(loss_func(preds, yb))

def accuracy(out, yb):
  """
  Function to calculate the accuracy of the model. For each
  prediction, if the index with the largest value matches the target
  value, then the prediction was correct
  """
  preds = torch.argmax(out, dim=1)
  return (preds == yb).float().mean()

print(accuracy(preds, yb)) # Accuracy of the model

out[45]

tensor(2.3345, grad_fn=<NegBackward0>)
tensor(0.2031)

# from IPython.core.debugger import set_trace
"""
Running a training loop.
"""

lr = 0.5  # learning rate
epochs = 2  # how many epochs to train for

for epoch in range(epochs):
  for i in range((n - 1) // bs + 1):
    #         set_trace()
    """
    Selecting a mini-batch of data
    """
    start_i = i * bs
    end_i = start_i + bs
    xb = x_train[start_i:end_i]
    yb = y_train[start_i:end_i]
    # Making predictions - the forward pass
    pred = model(xb)
    # Calculating the loss
    loss = loss_func(pred, yb)
    # Update the gradients of the model
    loss.backward()

    with torch.no_grad():
      weights -= weights.grad * lr # Updating weights
      bias -= bias.grad * lr # Updating biases
      """
      Setting weight gradient to zero so that we are ready for next training
      loop. Otherwise, the gradients would record a running tally of all the
      operations that has happened (loss.backward() adds the gradients to whatever is already stored, rather than replacing them).
      """
      weights.grad.zero_()
      bias.grad.zero_()

out[46]

"""
Refactor code to take advantage of PyTorch;s nn classes to make the code more concise and flexible
"""
import torch.nn.functional as F
"""
Activation and loss functions can be found in the torch.nn.functional submodule (which is generally imported into namespace F by convention).

If you are using negative log likelihood and log softmax activation, then PyTorch provides a single function F.cross_entropy that combines the two.
"""
loss_func = F.cross_entropy
def model(xb):
  return xb @ weights + bias
print(loss_func(model(xb), yb), accuracy(model(xb), yb))

out[47]

tensor(2.3345, grad_fn=<NllLossBackward0>) tensor(0.2031)

"""
Refactour using nn.Module and nn.Parameter, for a cleaner and more concise
training loop.
"""

from torch import nn

class Mnist_Logistic(nn.Module):
  def __init__(self):
    super().__init__()
    self.weights = nn.Parameter(torch.randn(784,10)  / math.sqrt(784) )
    self.bias = nn.Parameter(torch.zeros(10))

  def forward(self,xb):
    return xb @ weights + bias

model = Mnist_Logistic()
print(loss_func(model(xb), yb))

out[48]

tensor(2.3345, grad_fn=<NllLossBackward0>)

"""
Now it is easier to update model parameters:
"""
lr = 0.01
with torch.no_grad():
  for p in model.parameters(): p -= p.grad * lr
  model.zero_grad()


def fit():
  """
  Wrap our training loop in a fit function so we can run it again later
  """
  for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
      """
      Getting Batch
      """
      start_i = i * bs
      end_i = start_i + bs
      xb = x_train[start_i:end_i]
      yb = y_train[start_i:end_i]
      """ Prediction """
      pred = model(xb)
      """ Loss """
      loss = loss_func(pred, yb)
      """ Calculate gradients """
      loss.backward()
      """ Update parameters with gradients """
      with torch.no_grad():
        for p in model.parameters():
          p -= p.grad * lr
          model.zero_grad
print(loss_func(model(xb), yb))

out[49]

User Comments

There are currently no comments for this article.