Module 5 — Neural Networks & Deep Learning intermediate 28 min
Introduction to PyTorch
Why PyTorch?
PyTorch is the #1 deep learning framework in research and increasingly in production. It’s used by:
- OpenAI (ChatGPT)
- Meta AI (LLaMA)
- Google DeepMind (many models)
- Most academic ML papers
Why people love it:
- Pythonic — feels natural, uses Python control flow
- Dynamic computation graphs — easy to debug
- Autograd — automatic differentiation
- Excellent GPU support
# Sanity-check the installation: import PyTorch and report its version string.
import torch

print(torch.__version__)
Tensors — PyTorch’s Core Data Structure
A tensor is like a NumPy array, but:
- Can run on GPUs
- Supports automatic differentiation
import torch
import numpy as np
# Tensors of increasing rank; .shape reports a torch.Size (a tuple subclass).
# 0D tensor (scalar) — a single value; its shape is empty
scalar = torch.tensor(3.14)
print(scalar.shape) # torch.Size([])
# 1D tensor (vector) — shape is (length,)
v = torch.tensor([1.0, 2.0, 3.0])
print(v.shape) # torch.Size([3])
# 2D tensor (matrix) — shape is (rows, columns)
m = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(m.shape) # torch.Size([2, 2])
# 3D tensor — e.g. a batch of grayscale images: (batch, height, width)
batch = torch.rand(32, 28, 28) # 32 images, 28×28 pixels
print(batch.shape) # torch.Size([32, 28, 28])
Creating Tensors
# Ways to construct tensors: from existing NumPy data, from constant/random
# factories, and from evenly-spaced ranges.

# NumPy interop — from_numpy shares the underlying buffer with `arr`
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr)

# Constant-filled and random factories
torch.zeros(3, 4)        # all zeros
torch.ones(2, 5)         # all ones
torch.rand(3, 3)         # uniform samples in [0, 1)
torch.randn(3, 3)        # samples from the standard normal N(0, 1)

# Evenly spaced values
torch.arange(0, 10, 2)   # [0, 2, 4, 6, 8] — step-based, end exclusive
torch.linspace(0, 1, 5)  # [0.00, 0.25, 0.50, 0.75, 1.00] — count-based, end inclusive
Tensor Operations
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
# Arithmetic — operators are elementwise by default
print(a + b) # tensor([5., 7., 9.])
print(a * b) # tensor([ 4., 10., 18.]) — elementwise product, NOT a dot product
print(torch.dot(a, b)) # tensor(32.) — dot product: 1*4 + 2*5 + 3*6
# Matrix multiplication — inner dims must match: (3,4) @ (4,5) -> (3,5)
A = torch.rand(3, 4)
B = torch.rand(4, 5)
C = A @ B # or torch.mm(A, B)
print(C.shape) # torch.Size([3, 5])
# Aggregation — reduces the whole tensor to a 0D scalar tensor
print(a.sum()) # tensor(6.)
print(a.mean()) # tensor(2.)
print(a.max()) # tensor(3.)
Reshaping
# Reshaping: the same 12 values can be laid out in different shapes,
# as long as the total element count is preserved.
x = torch.arange(12, dtype=torch.float32)
print(x.shape) # torch.Size([12])

x_2d = x.view(3, 4)            # zero-copy reinterpretation — shares storage with x
x_new = x.reshape(4, 3)        # like view, but silently copies when layout demands it
x_flat = x.flatten()           # collapse back to 1D

# Insert or drop a size-1 dimension (handy when a model expects a batch axis)
x = torch.rand(3, 4)
x_3d = torch.unsqueeze(x, 0)   # torch.Size([1, 3, 4])
x_2d = torch.squeeze(x_3d, 0)  # torch.Size([3, 4])
GPU / CPU
# Pick the best available device — "cuda" only if an NVIDIA GPU is visible
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")
# Move a tensor to the chosen device; .to() returns a copy on that device
x = torch.rand(1000, 1000)
x_gpu = x.to(device)
# Move back to CPU (e.g., for NumPy conversion or visualization)
x_cpu = x_gpu.cpu()
Autograd — Automatic Differentiation ✨
PyTorch tracks operations on tensors to automatically compute gradients:
# Create a leaf tensor with gradient tracking enabled
x = torch.tensor(3.0, requires_grad=True)
# Compute a function of x — autograd records every operation applied to it
y = x ** 2 + 2 * x + 1 # y = x² + 2x + 1
# Backpropagate from y: fills in .grad for every tracked leaf tensor
y.backward()
# dy/dx = 2x + 2 = 2*3 + 2 = 8
print(x.grad) # tensor(8.) — the gradient lands in x.grad
Why This Matters
# One manual training step: forward pass, loss, backward pass, gradient-descent
# update. Shows why autograd matters — gradients for every parameter come free.
W = torch.randn(5, 3, requires_grad=True)  # weights: 5 features -> 3 classes
b = torch.randn(3, requires_grad=True)     # per-class bias
x = torch.rand(10, 5)                      # batch of 10 samples, 5 features
y_true = torch.randint(0, 3, (10,))        # true labels, integers in {0, 1, 2}

# Forward pass: linear scores (logits), then cross-entropy loss
z = x @ W + b
loss = torch.nn.functional.cross_entropy(z, y_true)
print(f"Loss: {loss.item():.4f}")

# Backward pass — PyTorch computes ALL gradients automatically
loss.backward()
print(f"Gradient of W: {W.grad.shape}")  # same shape as W
print(f"Gradient of b: {b.grad.shape}")  # same shape as b

# Update weights (simple gradient descent). no_grad() stops autograd from
# recording the update itself as part of a computation graph.
with torch.no_grad():
    W -= 0.01 * W.grad
    b -= 0.01 * b.grad

    # Zero gradients for the next step (important! — .backward() ADDS to
    # .grad rather than overwriting it, so stale gradients would accumulate)
    W.grad.zero_()
    b.grad.zero_()
nn.Module — Building Networks Cleanly
import torch.nn as nn
class SimpleNetwork(nn.Module):
    """A minimal two-layer fully connected network: Linear -> ReLU -> Linear.

    Outputs raw logits (no softmax) — pair with ``nn.CrossEntropyLoss``,
    which applies log-softmax internally.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a (batch, input_size) tensor to (batch, output_size) logits."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
# Instantiate for MNIST-sized input: 784 pixels -> 256 hidden -> 10 classes
model = SimpleNetwork(784, 256, 10)
print(model) # nn.Module provides this readable summary for free:
# SimpleNetwork(
# (fc1): Linear(in_features=784, out_features=256, bias=True)
# (relu): ReLU()
# (fc2): Linear(in_features=256, out_features=10, bias=True)
# )
# Count parameters — numel() is the element count of each weight/bias tensor
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
# Forward pass — calling the model invokes forward() (plus any hooks)
dummy_input = torch.rand(32, 784) # batch of 32 flattened 28×28 images
output = model(dummy_input)
print(f"Output shape: {output.shape}") # torch.Size([32, 10])
Optimizers
Instead of manually updating weights, use an optimizer:
# Adam keeps a per-parameter adaptive learning rate; lr=0.001 is a common default
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Other optimizers: SGD, AdaGrad (torch.optim.Adagrad), RMSProp (torch.optim.RMSprop)
# Training step — assumes `loss` was just computed in a forward pass
loss.backward() # compute gradients for every parameter the optimizer manages
optimizer.step() # update weights using the stored .grad values
optimizer.zero_grad() # reset gradients — .backward() accumulates, not overwrites
Why must we call `optimizer.zero_grad()` once per training step (whether right after `optimizer.step()`, as above, or before the next `loss.backward()`)? What would happen to the gradients if we skipped it?