Module 7 — Computer Vision intermediate 25 min
Image Data Basics
How Computers See Images
When you look at a photograph, you see colors and scenes. A computer sees a grid of numbers — a 2D grid for a grayscale image, and a 3D grid (one 2D grid per color channel) for a color image.
A 4×4 grayscale image: A 4×4 RGB image (3 channels):
┌─────────────────┐ Channel 0 (Red) Channel 1 (Green) Channel 2 (Blue)
│ 200 45 189 112 │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ 30 255 67 88 │ │ 255 0 0 │ │ 0 255 0 │ │ 0 0 255 │
│ 150 90 200 34 │ │ 128 128 128 │ │ 64 64 64 │ │ 0 0 0 │
│ 10 178 55 220 │ └──────────────┘ └──────────────┘ └──────────────┘
└─────────────────┘
Shape: (4, 4) Shape: (3, 4, 4) [channels, height, width] — this is PyTorch's channels-first layout; note that PIL/NumPy use (height, width, channels) instead, as you'll see below.
Loading Images with PIL
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Read the image from disk and inspect its basic properties.
photo = Image.open("cat.jpg")
print(f"Size: {photo.size}")  # (width, height) in pixels, e.g. (640, 480)
print(f"Mode: {photo.mode}")  # "RGB", "L" (grayscale), "RGBA", etc.

# Convert to a NumPy array — note the (H, W, C) channel-last layout.
pixels = np.array(photo)
print(f"Shape: {pixels.shape}")  # (480, 640, 3) — height, width, channels
print(f"Dtype: {pixels.dtype}")  # uint8
print(f"Range: {pixels.min()} – {pixels.max()}")  # 0 – 255

# Show the original next to each color channel rendered in its own colormap.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
axes[0].imshow(photo)
axes[0].set_title("Original")
channel_info = zip(["Red", "Green", "Blue"], ["Reds", "Greens", "Blues"])
for idx, (label, cmap) in enumerate(channel_info):
    axes[idx + 1].imshow(pixels[:, :, idx], cmap=cmap)
    axes[idx + 1].set_title(f"{label} Channel")
for axis in axes:
    axis.axis("off")
plt.tight_layout()
plt.show()
Images as PyTorch Tensors
PyTorch uses (C, H, W) format — channels first.
import torch
from torchvision import transforms
from PIL import Image

# Standard preprocessing pipeline: resize → float tensor → ImageNet normalization.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # 224×224 is the canonical input size for many CNNs
    transforms.ToTensor(),          # PIL [0,255] → float32 tensor [0.0, 1.0], (C, H, W)
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # per-channel ImageNet mean
        std=[0.229, 0.224, 0.225],   # per-channel ImageNet std
    ),
])

# convert("RGB") guards against grayscale or RGBA inputs.
source = Image.open("cat.jpg").convert("RGB")
sample = preprocess(source)
print(f"Tensor shape: {sample.shape}")  # torch.Size([3, 224, 224])
print(f"Tensor dtype: {sample.dtype}")  # torch.float32
print(f"Value range: {sample.min():.2f} – {sample.max():.2f}")  # approx -2.1 – 2.6

# Models consume batches, so prepend a batch dimension of size 1.
batch = sample.unsqueeze(0)
print(f"Batch shape: {batch.shape}")  # torch.Size([1, 3, 224, 224])
Why These Numbers for Normalize?
# ImageNet statistics — computed over the 1.2 million training images.
# Every model pretrained on ImageNet expects inputs normalized with these!
IMAGENET_MEAN = [0.485, 0.456, 0.406]  # R, G, B channel means
IMAGENET_STD = [0.229, 0.224, 0.225]   # R, G, B channel stds


def denormalize(tensor: torch.Tensor) -> torch.Tensor:
    """Undo ImageNet normalization so a tensor can be visualized.

    Accepts a (3, H, W) image or an (N, 3, H, W) batch — the (3, 1, 1)
    statistics broadcast across a leading batch dimension. Returns a NEW
    tensor with values clamped to [0, 1]; the input is not modified.
    """
    # new_tensor() keeps the stats on the input's device and dtype;
    # torch.tensor(...) would always allocate CPU float32 and fail for
    # CUDA or half-precision inputs.
    mean = tensor.new_tensor(IMAGENET_MEAN).view(3, 1, 1)
    std = tensor.new_tensor(IMAGENET_STD).view(3, 1, 1)
    return (tensor * std + mean).clamp(0, 1)
Data Augmentation
Data augmentation is the technique of randomly transforming training images to create more variety, which helps prevent overfitting.
from torchvision import transforms

# ImageNet normalization is identical for both splits, so build it once.
# (Normalize is stateless — sharing one instance is safe.)
_imagenet_normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                           [0.229, 0.224, 0.225])

# Training transforms: augment! Random perturbations create variety and
# help prevent overfitting.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.7, 1.0)),  # random crop, 70–100% of area
    transforms.RandomHorizontalFlip(p=0.5),               # mirror 50% of the time
    transforms.RandomRotation(15),                        # rotate within ±15 degrees
    transforms.ColorJitter(brightness=0.3,
                           contrast=0.3,
                           saturation=0.3,
                           hue=0.1),
    transforms.RandomGrayscale(p=0.05),                   # occasionally drop color
    transforms.ToTensor(),
    _imagenet_normalize,
])

# Validation transforms: deterministic — NO augmentation, just resize,
# center crop, and normalize.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    _imagenet_normalize,
])
Loading a Dataset with ImageFolder
The standard folder structure:
data/
train/
cats/ ← one folder per class
cat001.jpg
cat002.jpg
dogs/
dog001.jpg
val/
cats/
cat100.jpg
dogs/
dog100.jpg
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# ImageFolder infers one class per sub-directory (sorted alphabetically)
# and pairs every image with its folder's class index.
train_dataset = ImageFolder(root="data/train", transform=train_transforms)
val_dataset = ImageFolder(root="data/val", transform=val_transforms)
print(f"Training samples: {len(train_dataset)}")
print(f"Classes: {train_dataset.classes}")             # ['cats', 'dogs']
print(f"Class → index: {train_dataset.class_to_idx}")  # {'cats': 0, 'dogs': 1}

# Shuffle only the training split; 4 worker processes load batches in parallel.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# Pull one batch to sanity-check the shapes the model will receive.
batch_images, batch_labels = next(iter(train_loader))
print(f"Batch images: {batch_images.shape}")  # torch.Size([32, 3, 224, 224])
print(f"Batch labels: {batch_labels.shape}")  # torch.Size([32])
Using Built-in Datasets
For learning, use CIFAR-10 (60,000 images, 10 classes):
from torchvision.datasets import CIFAR10

# CIFAR-10 has its own per-channel statistics (computed over its train split),
# distinct from the ImageNet values used above.
simple_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])

# download=True fetches the archive on first run and caches it under ./data.
train_set = CIFAR10(root="./data", train=True, download=True, transform=simple_transform)
test_set = CIFAR10(root="./data", train=False, download=True, transform=simple_transform)
print(f"Train: {len(train_set)} images")  # 50000
print(f"Test: {len(test_set)} images")  # 10000

# Label i in the dataset corresponds to classes[i].
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
A color image loaded with torchvision becomes a tensor of shape (3, 224, 224). What do those three numbers mean?