Module 7 — Computer Vision (Advanced) — 30 min
Object Detection Basics
Classification vs. Detection
| Task | Question | Output |
|---|---|---|
| Classification | What is in this image? | “cat” (one label) |
| Localization | What is it AND where? | “cat” + one bounding box |
| Object Detection | What objects AND where are all of them? | Multiple labels + multiple boxes |
| Segmentation | Which pixels belong to which object? | A class label for every pixel (a mask) |
Bounding Boxes
A bounding box defines the rectangle around an object:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
def draw_boxes(image_path, boxes, labels, scores=None):
    """Display an image with labeled bounding boxes drawn on top.

    Args:
        image_path: path to the image file to annotate.
        boxes: list of [x_min, y_min, x_max, y_max] in pixel coordinates.
        labels: list of class-name strings, one per box.
        scores: optional list of confidence floats, one per box.
    """
    pixels = np.array(Image.open(image_path))
    fig, ax = plt.subplots(1, figsize=(10, 8))
    ax.imshow(pixels)
    palette = plt.cm.tab10.colors
    for idx, (coords, name) in enumerate(zip(boxes, labels)):
        left, top, right, bottom = coords
        box_color = palette[idx % len(palette)]
        # Outline rectangle for this detection.
        ax.add_patch(patches.Rectangle(
            (left, top), right - left, bottom - top,
            linewidth=2, edgecolor=box_color, facecolor="none"))
        # Caption: label plus confidence (when scores were provided),
        # anchored just above the top-left corner of the box.
        caption = name
        if scores:
            caption += f" {scores[idx]:.0%}"
        ax.text(left, top - 5, caption,
                color="white", fontsize=10, fontweight="bold",
                bbox=dict(facecolor=box_color, alpha=0.8, pad=2))
    ax.axis("off")
    plt.tight_layout()
    plt.show()
# Example: draw two hand-specified detections on one image
boxes = [[50, 30, 200, 180], [220, 50, 380, 200]]  # [x_min, y_min, x_max, y_max] each
labels = ["cat", "dog"]
scores = [0.97, 0.88]  # one confidence value per box
draw_boxes("pets.jpg", boxes, labels, scores)
Intersection over Union (IoU)
IoU measures how much a predicted box overlaps with the ground truth box:
def compute_iou(box1, box2):
    """Compute Intersection over Union of two axis-aligned boxes.

    Both boxes: [x_min, y_min, x_max, y_max]
    Returns IoU in [0, 1].
    """
    # Overlap rectangle; clamped to 0 when the boxes do not intersect.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    overlap = overlap_w * overlap_h
    # Union = sum of the two areas minus the double-counted overlap.
    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_a + area_b - overlap
    # Guard against degenerate (zero-area) boxes.
    return overlap / union if union > 0 else 0.0
# Test: IoU between a predicted box and its ground-truth box
predicted = [50, 50, 200, 200]
ground_truth = [60, 40, 210, 195]
iou = compute_iou(predicted, ground_truth)
print(f"IoU: {iou:.3f}")  # ≈ 0.798 — good match! (intersection 20300 / union 25450)
# Decision threshold
# IoU ≥ 0.5 → "correct detection" (PASCAL VOC standard)
# IoU ≥ 0.75 → stricter standard (COCO challenge)
YOLO — You Only Look Once
Two-stage detectors (like Faster R-CNN):
- Propose candidate regions of interest (~2000 via selective search in the original R-CNN; Faster R-CNN learns proposals with a Region Proposal Network)
- Classify each region → Accurate but slow (~5 fps)
YOLO (single-stage):
- Divides image into a grid (e.g., 13×13)
- Each grid cell predicts boxes + class simultaneously in one forward pass → Very fast (30–80 fps for YOLO v5/v8)
Image divided into 13×13 = 169 grid cells
Each cell predicts:
- B bounding boxes (5 numbers each: x, y, w, h, confidence)
- C class probabilities
Output tensor: 13 × 13 × (B*5 + C)
Faster R-CNN with torchvision
PyTorch ships a Faster R-CNN pretrained on COCO's 80 object classes (note: its label IDs span 0–90, because the original COCO category numbering has gaps):
import torch
from torchvision import models
from torchvision.transforms import v2 as T
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
# COCO class names, indexed by the label IDs torchvision's pretrained
# detection models emit. IMPORTANT: those IDs are the *original* COCO
# category IDs (1–90 with gaps), so this list must have 91 slots with
# "N/A" placeholders at the unused IDs — a contiguous 80-class list
# would misalign every name from ID 13 ("stop sign") onward.
COCO_CLASSES = [
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "N/A",
    "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
    "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "N/A", "backpack",
    "umbrella", "N/A", "N/A", "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "N/A", "wine glass",
    "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "couch", "potted plant", "bed", "N/A", "dining table", "N/A",
    "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard",
    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "N/A", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]
# Load model (downloads weights on first run, then serves them from the local cache)
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()  # inference mode: disables training-only behavior such as dropout
# Preprocess
def load_image_as_tensor(path):
    """Open an image file as RGB and return a float32 tensor scaled to [0, 1]."""
    to_tensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale=True)])
    rgb = Image.open(path).convert("RGB")
    return to_tensor(rgb)
img_tensor = load_image_as_tensor("street.jpg")
# Run detection. no_grad: inference only, skip autograd bookkeeping.
# The model takes a *list* of image tensors (a batch of one here).
with torch.no_grad():
    predictions = model([img_tensor])
pred = predictions[0]  # prediction dict for the first (only) image
print(f"Detected {len(pred['boxes'])} objects")
for box, label, score in zip(pred['boxes'], pred['labels'], pred['scores']):
    if score >= 0.7:  # confidence threshold
        name = COCO_CLASSES[label.item()]
        print(f"  {name:15s} conf={score:.2%} box=[{box[0]:.0f},{box[1]:.0f},{box[2]:.0f},{box[3]:.0f}]")
Visualizing Detections
def show_detections(img_path, predictions, threshold=0.7):
    """Plot a detection model's boxes and labels over the original image.

    Args:
        img_path: path of the image that was fed to the model.
        predictions: list of per-image prediction dicts as returned by the
            model; only the first entry is visualized.
        threshold: minimum confidence score for a detection to be drawn.
    """
    frame = np.array(Image.open(img_path).convert("RGB"))
    fig, ax = plt.subplots(1, figsize=(12, 9))
    ax.imshow(frame)
    detection = predictions[0]
    palette = plt.cm.Set2.colors
    triples = zip(detection['boxes'], detection['labels'], detection['scores'])
    for box, label, score in triples:
        if score >= threshold:
            x1, y1, x2, y2 = box.int().tolist()
            class_id = label.item()
            class_name = COCO_CLASSES[class_id]
            shade = palette[class_id % len(palette)]
            # Outline first, then the caption just above the top-left corner.
            ax.add_patch(patches.Rectangle(
                (x1, y1), x2 - x1, y2 - y1,
                linewidth=2, edgecolor=shade, facecolor="none"
            ))
            ax.text(x1, y1 - 6, f"{class_name} {score:.0%}",
                    color="white", fontsize=9, fontweight="bold",
                    bbox=dict(facecolor=shade, alpha=0.85, pad=2, linewidth=0))
    ax.axis("off")
    plt.title(f"Faster R-CNN Detections (threshold={threshold})")
    plt.tight_layout()
    plt.show()
# Visualize the Faster R-CNN predictions computed for street.jpg above.
show_detections("street.jpg", predictions, threshold=0.7)
Running YOLOv8 (Ultralytics)
For real-time speed, use Ultralytics YOLO:
pip install ultralytics
from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # n=nano (fastest), s/m/l/x = larger/more accurate
# Detect in image; conf is the minimum confidence for a box to be kept
results = model("street.jpg", conf=0.4)
results[0].show()  # pop-up with annotated image
results[0].save("output.jpg")  # save to file
# Print detections (one result object per input image; boxes are its detections)
for box in results[0].boxes:
    cls = results[0].names[int(box.cls)]  # map numeric class id -> class name
    conf = float(box.conf)
    xyxy = box.xyxy[0].tolist()  # [x_min, y_min, x_max, y_max] in pixels
    print(f"{cls:15s} conf={conf:.2%} box={[round(v) for v in xyxy]}")
# Detect in video (real-time); save=True writes an annotated copy to disk
results = model("traffic_video.mp4", save=True, conf=0.4)
An object detection model predicts a bounding box with IoU = 0.3 vs. the ground truth. What does this mean? (Hint: compare against the IoU ≥ 0.5 PASCAL VOC threshold above — would this count as a correct detection?)