Module 7 — Computer Vision · Intermediate · 35 min
Convolutional Neural Networks
Why Not Just Use a Regular Neural Network?
A regular (fully connected) network treats an image as a flat list of pixels:
224 × 224 × 3 = 150,528 inputs
First hidden layer = 1024 neurons
Parameters in first layer = 150,528 × 1024 = 154 MILLION!
Problems:
- Too many parameters → overfits, uses too much memory
- Ignores spatial structure — it treats pixel (50,50) as no more related to its immediate neighbor (51,50) than to the far-away pixel (0,0)
CNNs solve both problems with shared filters.
The Convolution Operation
A filter (or kernel) is a small matrix that slides across the image, detecting local patterns:
Image (6×6): 3×3 Filter (horizontal edge detector): Result (4×4 feature map):
┌─────────────────┐ ┌───────────┐ ┌─────────────────┐
│ 1 2 0 0 1 2│ │ -1 -1 -1 │ │  1  2  5  0 │
│ 0 1 2 1 0 1│ * │ 0 0 0 │ = │  0 -1  0  2 │
│ 1 0 3 1 2 0│ │ 1 1 1 │ │ -1 -1 -3  0 │
│ 2 1 0 2 1 1│ └───────────┘ │ -1 -1 -2 -1 │
│ 0 2 1 0 2 1│ └─────────────────┘
│ 1 1 0 1 0 2│
└─────────────────┘
(Each result entry is the sum of the 3×3 window's bottom row minus its top row — large values mark horizontal edges.)
The filter “looks at” a 3×3 region, computes a dot product, and slides to the next position.
Key insight: the filter weights are shared across all positions → far fewer parameters!
CNN Layers
import torch
import torch.nn as nn
# A single convolutional layer: 3 input channels -> 32 learned filters,
# each filter 3x3, sliding 1 pixel at a time, with a 1-pixel border pad.
conv = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)

# Spatial output size: out = floor((in + 2*padding - kernel_size) / stride) + 1
# Here: floor((32 + 2*1 - 3) / 1) + 1 = 32, so padding=1 preserves the size.
x = torch.rand(1, 3, 32, 32)  # one RGB image, 32x32
out = conv(x)
print(out.shape)  # torch.Size([1, 32, 32, 32]) — 32 feature maps

# Max pooling halves the spatial resolution while keeping, in each 2x2
# window, only the strongest activation.
pool = nn.MaxPool2d(kernel_size=2, stride=2)
out_pooled = pool(out)
print(out_pooled.shape)  # torch.Size([1, 32, 16, 16]) — halved!
Build a CNN for CIFAR-10
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# --- Architecture ---
class SimpleCNN(nn.Module):
    """Small VGG-style CNN for 32x32 images (e.g. CIFAR-10).

    Three convolutional blocks (32 -> 64 -> 128 channels), each halving the
    spatial resolution, followed by global average pooling and a small MLP
    classification head.

    Args:
        num_classes: Number of output classes (size of the final logits).
        in_channels: Number of input image channels — 3 for RGB (the
            default, backward compatible), 1 for grayscale.
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        # Feature extractor: Conv -> BatchNorm -> ReLU stacks, with MaxPool
        # to downsample and Dropout2d to drop whole feature maps (stronger
        # regularization than per-element dropout for conv activations).
        self.features = nn.Sequential(
            # Block 1: in_channels -> 32 channels, 32x32 -> 16x16
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 32x32 -> 16x16
            nn.Dropout2d(0.25),
            # Block 2: 32 -> 64 channels, 16x16 -> 8x8
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 16x16 -> 8x8
            nn.Dropout2d(0.25),
            # Block 3: 64 -> 128 channels, 8x8 -> 4x4
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 8x8 -> 4x4
        )
        # Classifier head. Global average pooling collapses each feature map
        # to a single number, so the Linear layer size is independent of the
        # input resolution.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # 128x4x4 -> 128x1x1
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self.features(x)
        x = self.classifier(x)
        return x
# Instantiate the model and inspect it.
model = SimpleCNN(num_classes=10)
print(model)

# Count parameters. NOTE: the architecture above has 175,658 parameters
# (the previously quoted "~450,000" was incorrect) — still tiny vs 154M!
total_params = sum(p.numel() for p in model.parameters())
print(f"\nTotal parameters: {total_params:,}")  # 175,658 — tiny vs 154M!

# Test forward pass: a batch of 4 RGB 32x32 images -> one logit per class.
x = torch.rand(4, 3, 32, 32)
out = model(x)
print(f"Output shape: {out.shape}")  # (4, 10) — 10 class scores per image
Training Loop
# --- Data ---
# Train-time augmentation: random flips and padded crops teach the model
# invariance to mirroring and small translations. Normalize uses the
# standard per-channel CIFAR-10 mean/std so inputs are roughly zero-mean,
# unit-variance.
transform_train = transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, padding=4),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
# Test pipeline: NO augmentation — only tensor conversion + normalization.
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
# Downloads CIFAR-10 to ./data on first run.
train_set = datasets.CIFAR10("./data", train=True, download=True, transform=transform_train)
test_set = datasets.CIFAR10("./data", train=False, download=True, transform=transform_test)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)
# --- Training ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SimpleCNN().to(device)
# AdamW decouples weight decay from the gradient update; cosine annealing
# smoothly decays the learning rate toward 0 over T_max=30 epochs.
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)
criterion = nn.CrossEntropyLoss()  # expects raw logits + integer class labels
def train_epoch(model, loader, optimizer, criterion, device=None):
    """Run one full training pass over `loader`.

    Args:
        model: Network to train (put into train mode for BatchNorm/Dropout).
        loader: Iterable of (images, labels) batches.
        optimizer: Optimizer stepping `model`'s parameters.
        criterion: Loss taking (logits, labels).
        device: Device to move each batch to. Defaults to the device of the
            model's own parameters, so existing 4-argument calls behave the
            same as before (the model was already moved to the target device).

    Returns:
        Tuple (mean per-batch loss, accuracy over the whole epoch).
    """
    if device is None:
        device = next(model.parameters()).device
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # .item() extracts a Python float, detaching from the autograd graph.
        running_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)
    return running_loss / len(loader), correct / total
def eval_epoch(model, loader, criterion, device=None):
    """Evaluate `model` over `loader` without updating weights.

    Args:
        model: Network to evaluate (put into eval mode so BatchNorm uses
            running statistics and Dropout is disabled).
        loader: Iterable of (images, labels) batches.
        criterion: Loss taking (logits, labels).
        device: Device to move each batch to. Defaults to the device of the
            model's own parameters, keeping existing 3-argument calls intact.

    Returns:
        Tuple (mean per-batch loss, accuracy over the whole set).
    """
    if device is None:
        device = next(model.parameters()).device
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():  # no autograd bookkeeping needed for evaluation
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
    return running_loss / len(loader), correct / total
# Train for 30 epochs, logging every 5th epoch to keep the output short.
log_every = 5
for epoch in range(1, 31):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = eval_epoch(model, test_loader, criterion)
    scheduler.step()  # cosine LR decay, stepped once per epoch
    if epoch % log_every == 0:
        print(f"Epoch {epoch:2d} | "
              f"Train {train_loss:.3f}/{train_acc:.1%} | "
              f"Val {val_loss:.3f}/{val_acc:.1%}")
# Sample output (the :.1% format prints accuracies as percentages):
# Epoch  5 | Train 1.112/60.3% | Val 1.089/61.2%
# Epoch 10 | Train 0.821/71.2% | Val 0.842/70.4%
# Epoch 20 | Train 0.612/79.0% | Val 0.701/76.2%
# Epoch 30 | Train 0.503/82.7% | Val 0.676/77.5% (~77.5% test accuracy!)
What Does Each Layer Learn?
Layer 1 Conv filters: Low-level features
- Horizontal edges
- Vertical edges
- Color gradients
Layer 2 Conv filters: Mid-level features
- Corners
- Curves
- Textures (fur, feathers, wheels)
Layer 3 Conv filters: High-level features
- Eyes, noses, faces
- Wheels, windows, handles
- Abstract task-specific patterns
This hierarchical feature learning is why CNNs are so powerful.
Why does Max Pooling (taking the maximum value in each region) help CNN training?