You're offline — showing cached content
Module 7 — Computer Vision intermediate 35 min

Convolutional Neural Networks

Why Not Just Use a Regular Neural Network?

A regular (fully connected) network treats an image as a flat list of pixels:

224 × 224 × 3 = 150,528 inputs
First hidden layer = 1024 neurons
Parameters in first layer = 150,528 × 1024 = 154 MILLION!

Problems:

  • Too many parameters → overfits, uses too much memory
  • Ignores spatial structure — to a fully connected layer, pixel (50,50) is no more related to its neighbor (51,50) than to the far-away pixel (0,0)

CNNs solve both problems with shared filters.


The Convolution Operation

A filter (or kernel) is a small matrix that slides across the image, detecting local patterns:

Image (6×6):           3×3 Filter (edge detector):     Result (4×4 feature map):
┌─────────────────┐    ┌───────────┐                   ┌─────────────────┐
│  1  2  0  0  1  2│   │ -1 -1 -1 │                   │ -3  0  1  0    │
│  0  1  2  1  0  1│ * │  0  0  0 │         =          │  2  3 -1  1    │
│  1  0  3  1  2  0│   │  1  1  1 │                   │  0  1  2 -1    │
│  2  1  0  2  1  1│   └───────────┘                   │  1  2  0  1    │
│  0  2  1  0  2  1│                                   └─────────────────┘
│  1  1  0  1  0  2│
└─────────────────┘

The filter “looks at” a 3×3 region, computes a dot product, and slides to the next position.

Key insight: the filter weights are shared across all positions → far fewer parameters!


CNN Layers

import torch
import torch.nn as nn

# A single convolutional layer: 32 learned 3x3 filters over an RGB image.
conv = nn.Conv2d(
    in_channels=3,    # RGB input
    out_channels=32,  # learn 32 distinct filters
    kernel_size=3,    # each filter covers a 3x3 window
    stride=1,         # slide one pixel at a time
    padding=1,        # 1-pixel border keeps height/width unchanged
)

# Spatial output size:
#   out = floor((in + 2*padding - kernel_size) / stride) + 1
# Here: floor((32 + 2 - 3) / 1) + 1 = 32, so the size is preserved.

x = torch.rand(1, 3, 32, 32)           # batch of 1 RGB image, 32x32
out = conv(x)
print(out.shape)  # torch.Size([1, 32, 32, 32])  — one 32x32 map per filter

# Max pooling keeps only the strongest activation in each 2x2 region,
# halving the spatial resolution while retaining the dominant feature.
pool = nn.MaxPool2d(kernel_size=2, stride=2)
out_pooled = pool(out)
print(out_pooled.shape)  # torch.Size([1, 32, 16, 16])  — halved!

Build a CNN for CIFAR-10

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# --- Architecture ---
class SimpleCNN(nn.Module):
    """Small VGG-style CNN for 32x32 RGB images (e.g. CIFAR-10).

    Three convolutional blocks shrink 32x32 -> 4x4 while widening the
    channels 3 -> 128; global average pooling plus a small MLP then
    produces the per-class scores.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def double_conv(cin, cout):
            # Two 3x3 conv + BatchNorm + ReLU stages at the same resolution.
            return [
                nn.Conv2d(cin, cout, kernel_size=3, padding=1),
                nn.BatchNorm2d(cout),
                nn.ReLU(inplace=True),
                nn.Conv2d(cout, cout, kernel_size=3, padding=1),
                nn.BatchNorm2d(cout),
                nn.ReLU(inplace=True),
            ]

        # Feature extractor — every MaxPool2d(2) halves height and width.
        self.features = nn.Sequential(
            # Block 1: 3 -> 32 channels, 32x32 -> 16x16
            *double_conv(3, 32),
            nn.MaxPool2d(2),
            nn.Dropout2d(0.25),
            # Block 2: 32 -> 64 channels, 16x16 -> 8x8
            *double_conv(32, 64),
            nn.MaxPool2d(2),
            nn.Dropout2d(0.25),
            # Block 3: 64 -> 128 channels, 8x8 -> 4x4
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )

        # Classifier head: global average pool -> small MLP -> class scores.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),   # 128x4x4 -> 128x1x1 (global average pool)
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Map a (N, 3, 32, 32) batch to (N, num_classes) logits."""
        return self.classifier(self.features(x))

# Instantiate the model and print its layer structure.
model = SimpleCNN(num_classes=10)
print(model)

# Count trainable parameters (actual total for this architecture: 175,658).
total_params = sum(p.numel() for p in model.parameters())
print(f"\nTotal parameters: {total_params:,}")  # ~176,000 — tiny vs the 154M of the dense net!

# Sanity-check the forward pass: a batch of 4 images -> 4 rows of class scores.
x = torch.rand(4, 3, 32, 32)
out = model(x)
print(f"Output shape: {out.shape}")  # (4, 10) — 10 class scores per image

Training Loop

# --- Data ---
# CIFAR-10 per-channel mean / std, shared by both pipelines so the
# train and test inputs are normalized identically.
_CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline adds light augmentation: random flip + padded random crop.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
])
# Evaluation pipeline is deterministic: tensor conversion + normalize only.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD),
])

train_set = datasets.CIFAR10("./data", train=True, download=True, transform=transform_train)
test_set = datasets.CIFAR10("./data", train=False, download=True, transform=transform_test)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)

# --- Training setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SimpleCNN().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)  # cosine decay over 30 epochs
criterion = nn.CrossEntropyLoss()

def train_epoch(model, loader, optimizer, criterion):
    """Run one training pass over `loader`; return (mean batch loss, accuracy).

    Note: tensors are moved to the module-level `device`.
    """
    model.train()  # enable dropout and batch-norm statistics updates
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)
        # Standard step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Accumulate statistics for epoch-level reporting.
        running_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)
    return running_loss / len(loader), correct / total

def eval_epoch(model, loader, criterion):
    """Evaluate over `loader` with gradients disabled; return (mean batch loss, accuracy)."""
    model.eval()  # disable dropout; batch norm uses running statistics
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
    return running_loss / len(loader), correct / total

# Train for 30 epochs (matching the scheduler's T_max), logging every 5th.
for epoch in range(1, 31):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = eval_epoch(model, test_loader, criterion)
    scheduler.step()  # advance the cosine LR schedule once per epoch
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d} | "
              f"Train {train_loss:.3f}/{train_acc:.1%} | "
              f"Val {val_loss:.3f}/{val_acc:.1%}")
# Example output from one run:
# Epoch  5 | Train 1.112/0.603 | Val 1.089/0.612
# Epoch 10 | Train 0.821/0.712 | Val 0.842/0.704
# Epoch 20 | Train 0.612/0.790 | Val 0.701/0.762
# Epoch 30 | Train 0.503/0.827 | Val 0.676/0.775   (~77.5% test accuracy!)

What Does Each Layer Learn?

Layer 1 Conv filters:    Low-level features
  - Horizontal edges
  - Vertical edges
  - Color gradients

Layer 2 Conv filters:    Mid-level features
  - Corners
  - Curves
  - Textures (fur, feathers, wheels)

Layer 3 Conv filters:    High-level features
  - Eyes, noses, faces
  - Wheels, windows, handles
  - Abstract task-specific patterns

This hierarchical feature learning is why CNNs are so powerful.

Knowledge Check

Why does Max Pooling (taking the maximum value in each region) help CNN training?