Module 5 — Neural Networks & Deep Learning intermediate 28 min
Introduction to PyTorch
Why PyTorch?
PyTorch is the #1 deep learning framework in research and increasingly in production. It’s used by:
- OpenAI (ChatGPT)
- Meta AI (LLaMA)
- Google DeepMind (many models)
- Most academic ML papers
Why people love it:
- Pythonic — feels natural, uses Python control flow
- Dynamic computation graphs — easy to debug
- Autograd — automatic differentiation
- Excellent GPU support
# Sanity-check the installation: import PyTorch and report its version string.
import torch

print(torch.__version__)
Tensors — PyTorch’s Core Data Structure
A tensor is like a NumPy array, but:
- Can run on GPUs
- Supports automatic differentiation
import torch
import numpy as np
# Tensors of increasing rank; .shape reports a torch.Size (a tuple subclass).
# 0D tensor (scalar) — a single value; its shape is empty
scalar = torch.tensor(3.14)
print(scalar.shape) # torch.Size([])
# 1D tensor (vector) — shape is (length,)
v = torch.tensor([1.0, 2.0, 3.0])
print(v.shape) # torch.Size([3])
# 2D tensor (matrix) — shape is (rows, columns)
m = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(m.shape) # torch.Size([2, 2])
# 3D tensor — e.g. a batch of grayscale images: (batch, height, width)
batch = torch.rand(32, 28, 28) # 32 images, 28×28 pixels
print(batch.shape) # torch.Size([32, 28, 28])
Creating Tensors
# Ways to construct tensors: from existing NumPy data, from constant/random
# factories, and from evenly-spaced ranges.

# NumPy interop — from_numpy shares the underlying buffer with `arr`
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr)

# Constant-filled and random factories
torch.zeros(3, 4)        # all zeros
torch.ones(2, 5)         # all ones
torch.rand(3, 3)         # uniform samples in [0, 1)
torch.randn(3, 3)        # samples from the standard normal N(0, 1)

# Evenly spaced values
torch.arange(0, 10, 2)   # [0, 2, 4, 6, 8] — step-based, end exclusive
torch.linspace(0, 1, 5)  # [0.00, 0.25, 0.50, 0.75, 1.00] — count-based, end inclusive
Tensor Operations
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
# Arithmetic — operators are elementwise by default
print(a + b) # tensor([5., 7., 9.])
print(a * b) # tensor([ 4., 10., 18.]) — elementwise product, NOT a dot product
print(torch.dot(a, b)) # tensor(32.) — dot product: 1*4 + 2*5 + 3*6
# Matrix multiplication — inner dims must match: (3,4) @ (4,5) -> (3,5)
A = torch.rand(3, 4)
B = torch.rand(4, 5)
C = A @ B # or torch.mm(A, B)
print(C.shape) # torch.Size([3, 5])
# Aggregation — reduces the whole tensor to a 0D scalar tensor
print(a.sum()) # tensor(6.)
print(a.mean()) # tensor(2.)
print(a.max()) # tensor(3.)
Reshaping
# Reshaping: the same 12 values can be laid out in different shapes,
# as long as the total element count is preserved.
x = torch.arange(12, dtype=torch.float32)
print(x.shape) # torch.Size([12])

x_2d = x.view(3, 4)            # zero-copy reinterpretation — shares storage with x
x_new = x.reshape(4, 3)        # like view, but silently copies when layout demands it
x_flat = x.flatten()           # collapse back to 1D

# Insert or drop a size-1 dimension (handy when a model expects a batch axis)
x = torch.rand(3, 4)
x_3d = torch.unsqueeze(x, 0)   # torch.Size([1, 3, 4])
x_2d = torch.squeeze(x_3d, 0)  # torch.Size([3, 4])
GPU / CPU
# Pick the best available device — "cuda" only if an NVIDIA GPU is visible
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")
# Move a tensor to the chosen device; .to() returns a copy on that device
x = torch.rand(1000, 1000)
x_gpu = x.to(device)
# Move back to CPU (e.g., for NumPy conversion or visualization)
x_cpu = x_gpu.cpu()
Autograd — Automatic Differentiation ✨
PyTorch tracks operations on tensors to automatically compute gradients:
# Create a leaf tensor with gradient tracking enabled
x = torch.tensor(3.0, requires_grad=True)
# Compute a function of x — autograd records every operation applied to it
y = x ** 2 + 2 * x + 1 # y = x² + 2x + 1
# Backpropagate from y: fills in .grad for every tracked leaf tensor
y.backward()
# dy/dx = 2x + 2 = 2*3 + 2 = 8
print(x.grad) # tensor(8.) — the gradient lands in x.grad
Why This Matters
# One manual training step: forward pass, loss, backward pass, gradient-descent
# update. Shows why autograd matters — gradients for every parameter come free.
W = torch.randn(5, 3, requires_grad=True)  # weights: 5 features -> 3 classes
b = torch.randn(3, requires_grad=True)     # per-class bias
x = torch.rand(10, 5)                      # batch of 10 samples, 5 features
y_true = torch.randint(0, 3, (10,))        # true labels, integers in {0, 1, 2}

# Forward pass: linear scores (logits), then cross-entropy loss
z = x @ W + b
loss = torch.nn.functional.cross_entropy(z, y_true)
print(f"Loss: {loss.item():.4f}")

# Backward pass — PyTorch computes ALL gradients automatically
loss.backward()
print(f"Gradient of W: {W.grad.shape}")  # same shape as W
print(f"Gradient of b: {b.grad.shape}")  # same shape as b

# Update weights (simple gradient descent). no_grad() stops autograd from
# recording the update itself as part of a computation graph.
with torch.no_grad():
    W -= 0.01 * W.grad
    b -= 0.01 * b.grad

    # Zero gradients for the next step (important! — .backward() ADDS to
    # .grad rather than overwriting it, so stale gradients would accumulate)
    W.grad.zero_()
    b.grad.zero_()
nn.Module — Building Networks Cleanly
import torch.nn as nn
class SimpleNetwork(nn.Module):
    """A minimal two-layer fully connected network: Linear -> ReLU -> Linear.

    Outputs raw logits (no softmax) — pair with ``nn.CrossEntropyLoss``,
    which applies log-softmax internally.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a (batch, input_size) tensor to (batch, output_size) logits."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
# Instantiate for MNIST-sized input: 784 pixels -> 256 hidden -> 10 classes
model = SimpleNetwork(784, 256, 10)
print(model) # nn.Module provides this readable summary for free:
# SimpleNetwork(
# (fc1): Linear(in_features=784, out_features=256, bias=True)
# (relu): ReLU()
# (fc2): Linear(in_features=256, out_features=10, bias=True)
# )
# Count parameters — numel() is the element count of each weight/bias tensor
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
# Forward pass — calling the model invokes forward() (plus any hooks)
dummy_input = torch.rand(32, 784) # batch of 32 flattened 28×28 images
output = model(dummy_input)
print(f"Output shape: {output.shape}") # torch.Size([32, 10])
Optimizers
Instead of manually updating weights, use an optimizer:
# Adam keeps a per-parameter adaptive learning rate; lr=0.001 is a common default
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Other optimizers: SGD, AdaGrad (torch.optim.Adagrad), RMSProp (torch.optim.RMSprop)
# Training step — assumes `loss` was just computed in a forward pass
loss.backward() # compute gradients for every parameter the optimizer manages
optimizer.step() # update weights using the stored .grad values
optimizer.zero_grad() # reset gradients — .backward() accumulates, not overwrites
Why must we call `optimizer.zero_grad()` once per training step (whether right after `optimizer.step()`, as above, or before the next `loss.backward()`)? What would happen to the gradients if we skipped it?