Module 7 — Computer Vision intermediate 25 min
Image Data Basics
How Computers See Images
When you look at a photograph, you see colors and scenes. A computer sees a grid of numbers — a 2D grid for a grayscale image, and a 3D grid (one 2D grid per color channel) for a color image.
A 4×4 grayscale image: A 4×4 RGB image (3 channels):
┌─────────────────┐ Channel 0 (Red) Channel 1 (Green) Channel 2 (Blue)
│ 200 45 189 112 │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ 30 255 67 88 │ │ 255 0 0 │ │ 0 255 0 │ │ 0 0 255 │
│ 150 90 200 34 │ │ 128 128 128 │ │ 64 64 64 │ │ 0 0 0 │
│ 10 178 55 220 │ └──────────────┘ └──────────────┘ └──────────────┘
└─────────────────┘
Shape: (4, 4) Shape: (3, 4, 4) [channels, height, width] — this is PyTorch's channels-first layout; note that PIL/NumPy use (height, width, channels) instead, as you'll see below.
Loading Images with PIL
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Read the image from disk and inspect its basic properties.
photo = Image.open("cat.jpg")
print(f"Size: {photo.size}")  # (width, height) in pixels, e.g. (640, 480)
print(f"Mode: {photo.mode}")  # "RGB", "L" (grayscale), "RGBA", etc.

# Convert to a NumPy array — note the (H, W, C) channel-last layout.
pixels = np.array(photo)
print(f"Shape: {pixels.shape}")  # (480, 640, 3) — height, width, channels
print(f"Dtype: {pixels.dtype}")  # uint8
print(f"Range: {pixels.min()} – {pixels.max()}")  # 0 – 255

# Show the original next to each color channel rendered in its own colormap.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
axes[0].imshow(photo)
axes[0].set_title("Original")
channel_info = zip(["Red", "Green", "Blue"], ["Reds", "Greens", "Blues"])
for idx, (label, cmap) in enumerate(channel_info):
    axes[idx + 1].imshow(pixels[:, :, idx], cmap=cmap)
    axes[idx + 1].set_title(f"{label} Channel")
for axis in axes:
    axis.axis("off")
plt.tight_layout()
plt.show()
Images as PyTorch Tensors
PyTorch uses (C, H, W) format — channels first.
import torch
from torchvision import transforms
from PIL import Image

# Standard preprocessing pipeline: resize → float tensor → ImageNet normalization.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # 224×224 is the canonical input size for many CNNs
    transforms.ToTensor(),          # PIL [0,255] → float32 tensor [0.0, 1.0], (C, H, W)
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # per-channel ImageNet mean
        std=[0.229, 0.224, 0.225],   # per-channel ImageNet std
    ),
])

# convert("RGB") guards against grayscale or RGBA inputs.
source = Image.open("cat.jpg").convert("RGB")
sample = preprocess(source)
print(f"Tensor shape: {sample.shape}")  # torch.Size([3, 224, 224])
print(f"Tensor dtype: {sample.dtype}")  # torch.float32
print(f"Value range: {sample.min():.2f} – {sample.max():.2f}")  # approx -2.1 – 2.6

# Models consume batches, so prepend a batch dimension of size 1.
batch = sample.unsqueeze(0)
print(f"Batch shape: {batch.shape}")  # torch.Size([1, 3, 224, 224])
Why These Numbers for Normalize?
# ImageNet statistics — computed over the 1.2 million training images.
# Every model pretrained on ImageNet expects inputs normalized with these!
IMAGENET_MEAN = [0.485, 0.456, 0.406]  # R, G, B channel means
IMAGENET_STD = [0.229, 0.224, 0.225]   # R, G, B channel stds


def denormalize(tensor: torch.Tensor) -> torch.Tensor:
    """Undo ImageNet normalization so a tensor can be visualized.

    Accepts a (3, H, W) image or an (N, 3, H, W) batch — the (3, 1, 1)
    statistics broadcast across a leading batch dimension. Returns a NEW
    tensor with values clamped to [0, 1]; the input is not modified.
    """
    # new_tensor() keeps the stats on the input's device and dtype;
    # torch.tensor(...) would always allocate CPU float32 and fail for
    # CUDA or half-precision inputs.
    mean = tensor.new_tensor(IMAGENET_MEAN).view(3, 1, 1)
    std = tensor.new_tensor(IMAGENET_STD).view(3, 1, 1)
    return (tensor * std + mean).clamp(0, 1)
Data Augmentation
Data augmentation is the technique of randomly transforming training images to create more variety, which helps prevent overfitting.
from torchvision import transforms

# ImageNet normalization is identical for both splits, so build it once.
# (Normalize is stateless — sharing one instance is safe.)
_imagenet_normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                           [0.229, 0.224, 0.225])

# Training transforms: augment! Random perturbations create variety and
# help prevent overfitting.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.7, 1.0)),  # random crop, 70–100% of area
    transforms.RandomHorizontalFlip(p=0.5),               # mirror 50% of the time
    transforms.RandomRotation(15),                        # rotate within ±15 degrees
    transforms.ColorJitter(brightness=0.3,
                           contrast=0.3,
                           saturation=0.3,
                           hue=0.1),
    transforms.RandomGrayscale(p=0.05),                   # occasionally drop color
    transforms.ToTensor(),
    _imagenet_normalize,
])

# Validation transforms: deterministic — NO augmentation, just resize,
# center crop, and normalize.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    _imagenet_normalize,
])
Loading a Dataset with ImageFolder
The standard folder structure:
data/
train/
cats/ ← one folder per class
cat001.jpg
cat002.jpg
dogs/
dog001.jpg
val/
cats/
cat100.jpg
dogs/
dog100.jpg
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# ImageFolder infers one class per sub-directory (sorted alphabetically)
# and pairs every image with its folder's class index.
train_dataset = ImageFolder(root="data/train", transform=train_transforms)
val_dataset = ImageFolder(root="data/val", transform=val_transforms)
print(f"Training samples: {len(train_dataset)}")
print(f"Classes: {train_dataset.classes}")             # ['cats', 'dogs']
print(f"Class → index: {train_dataset.class_to_idx}")  # {'cats': 0, 'dogs': 1}

# Shuffle only the training split; 4 worker processes load batches in parallel.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# Pull one batch to sanity-check the shapes the model will receive.
batch_images, batch_labels = next(iter(train_loader))
print(f"Batch images: {batch_images.shape}")  # torch.Size([32, 3, 224, 224])
print(f"Batch labels: {batch_labels.shape}")  # torch.Size([32])
Using Built-in Datasets
For learning, use CIFAR-10 (60,000 images, 10 classes):
from torchvision.datasets import CIFAR10

# CIFAR-10 has its own per-channel statistics (computed over its train split),
# distinct from the ImageNet values used above.
simple_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])

# download=True fetches the archive on first run and caches it under ./data.
train_set = CIFAR10(root="./data", train=True, download=True, transform=simple_transform)
test_set = CIFAR10(root="./data", train=False, download=True, transform=simple_transform)
print(f"Train: {len(train_set)} images")  # 50000
print(f"Test: {len(test_set)} images")  # 10000

# Label i in the dataset corresponds to classes[i].
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
A color image loaded with torchvision becomes a tensor of shape (3, 224, 224). What do those three numbers mean?