Module 7 — Computer Vision (Advanced) — 30 min
Object Detection Basics
Classification vs. Detection
| Task | Question | Output |
|---|---|---|
| Classification | What is in this image? | “cat” (one label) |
| Localization | What is it AND where? | “cat” + one bounding box |
| Object Detection | What objects AND where are all of them? | Multiple labels + multiple boxes |
| Segmentation | Which pixels belong to which object? | A class label for every pixel (a mask) |
Bounding Boxes
A bounding box defines the rectangle around an object:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
def draw_boxes(image_path, boxes, labels, scores=None):
    """Display an image with labeled bounding boxes drawn on top.

    Args:
        image_path: path to the image file to annotate.
        boxes: list of [x_min, y_min, x_max, y_max] in pixel coordinates.
        labels: list of class-name strings, one per box.
        scores: optional list of confidence floats, one per box.
    """
    pixels = np.array(Image.open(image_path))
    fig, ax = plt.subplots(1, figsize=(10, 8))
    ax.imshow(pixels)
    palette = plt.cm.tab10.colors
    for idx, (coords, name) in enumerate(zip(boxes, labels)):
        left, top, right, bottom = coords
        box_color = palette[idx % len(palette)]
        # Outline rectangle for this detection.
        ax.add_patch(patches.Rectangle(
            (left, top), right - left, bottom - top,
            linewidth=2, edgecolor=box_color, facecolor="none"))
        # Caption: label plus confidence (when scores were provided),
        # anchored just above the top-left corner of the box.
        caption = name
        if scores:
            caption += f" {scores[idx]:.0%}"
        ax.text(left, top - 5, caption,
                color="white", fontsize=10, fontweight="bold",
                bbox=dict(facecolor=box_color, alpha=0.8, pad=2))
    ax.axis("off")
    plt.tight_layout()
    plt.show()
# Example: draw two hand-specified detections on one image
boxes = [[50, 30, 200, 180], [220, 50, 380, 200]]  # [x_min, y_min, x_max, y_max] each
labels = ["cat", "dog"]
scores = [0.97, 0.88]  # one confidence value per box
draw_boxes("pets.jpg", boxes, labels, scores)
Intersection over Union (IoU)
IoU measures how much a predicted box overlaps with the ground truth box:
def compute_iou(box1, box2):
    """Compute Intersection over Union of two axis-aligned boxes.

    Both boxes: [x_min, y_min, x_max, y_max]
    Returns IoU in [0, 1].
    """
    # Overlap rectangle; clamped to 0 when the boxes do not intersect.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    overlap = overlap_w * overlap_h
    # Union = sum of the two areas minus the double-counted overlap.
    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_a + area_b - overlap
    # Guard against degenerate (zero-area) boxes.
    return overlap / union if union > 0 else 0.0
# Test: IoU between a predicted box and its ground-truth box
predicted = [50, 50, 200, 200]
ground_truth = [60, 40, 210, 195]
iou = compute_iou(predicted, ground_truth)
print(f"IoU: {iou:.3f}")  # ≈ 0.798 — good match! (intersection 20300 / union 25450)
# Decision threshold
# IoU ≥ 0.5 → "correct detection" (PASCAL VOC standard)
# IoU ≥ 0.75 → stricter standard (COCO challenge)
YOLO — You Only Look Once
Two-stage detectors (like Faster R-CNN):
- Propose candidate regions of interest (~2000 via selective search in the original R-CNN; Faster R-CNN learns proposals with a Region Proposal Network)
- Classify each region → Accurate but slow (~5 fps)
YOLO (single-stage):
- Divides image into a grid (e.g., 13×13)
- Each grid cell predicts boxes + class simultaneously in one forward pass → Very fast (30–80 fps for YOLO v5/v8)
Image divided into 13×13 = 169 grid cells
Each cell predicts:
- B bounding boxes (5 numbers each: x, y, w, h, confidence)
- C class probabilities
Output tensor: 13 × 13 × (B*5 + C)
Faster R-CNN with torchvision
PyTorch ships a Faster R-CNN pretrained on COCO's 80 object classes (note: its label IDs span 0–90, because the original COCO category numbering has gaps):
import torch
from torchvision import models
from torchvision.transforms import v2 as T
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
# COCO class names, indexed by the label IDs torchvision's pretrained
# detection models emit. IMPORTANT: those IDs are the *original* COCO
# category IDs (1–90 with gaps), so this list must have 91 slots with
# "N/A" placeholders at the unused IDs — a contiguous 80-class list
# would misalign every name from ID 13 ("stop sign") onward.
COCO_CLASSES = [
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "N/A",
    "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
    "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "N/A", "backpack",
    "umbrella", "N/A", "N/A", "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "N/A", "wine glass",
    "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "couch", "potted plant", "bed", "N/A", "dining table", "N/A",
    "N/A", "toilet", "N/A", "tv", "laptop", "mouse", "remote", "keyboard",
    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "N/A", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]
# Load model (downloads weights on first run, then serves them from the local cache)
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()  # inference mode: disables training-only behavior such as dropout
# Preprocess
def load_image_as_tensor(path):
    """Open an image file as RGB and return a float32 tensor scaled to [0, 1]."""
    to_tensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale=True)])
    rgb = Image.open(path).convert("RGB")
    return to_tensor(rgb)
img_tensor = load_image_as_tensor("street.jpg")
# Run detection. no_grad: inference only, skip autograd bookkeeping.
# The model takes a *list* of image tensors (a batch of one here).
with torch.no_grad():
    predictions = model([img_tensor])
pred = predictions[0]  # prediction dict for the first (only) image
print(f"Detected {len(pred['boxes'])} objects")
for box, label, score in zip(pred['boxes'], pred['labels'], pred['scores']):
    if score >= 0.7:  # confidence threshold
        name = COCO_CLASSES[label.item()]
        print(f"  {name:15s} conf={score:.2%} box=[{box[0]:.0f},{box[1]:.0f},{box[2]:.0f},{box[3]:.0f}]")
Visualizing Detections
def show_detections(img_path, predictions, threshold=0.7):
    """Plot a detection model's boxes and labels over the original image.

    Args:
        img_path: path of the image that was fed to the model.
        predictions: list of per-image prediction dicts as returned by the
            model; only the first entry is visualized.
        threshold: minimum confidence score for a detection to be drawn.
    """
    frame = np.array(Image.open(img_path).convert("RGB"))
    fig, ax = plt.subplots(1, figsize=(12, 9))
    ax.imshow(frame)
    detection = predictions[0]
    palette = plt.cm.Set2.colors
    triples = zip(detection['boxes'], detection['labels'], detection['scores'])
    for box, label, score in triples:
        if score >= threshold:
            x1, y1, x2, y2 = box.int().tolist()
            class_id = label.item()
            class_name = COCO_CLASSES[class_id]
            shade = palette[class_id % len(palette)]
            # Outline first, then the caption just above the top-left corner.
            ax.add_patch(patches.Rectangle(
                (x1, y1), x2 - x1, y2 - y1,
                linewidth=2, edgecolor=shade, facecolor="none"
            ))
            ax.text(x1, y1 - 6, f"{class_name} {score:.0%}",
                    color="white", fontsize=9, fontweight="bold",
                    bbox=dict(facecolor=shade, alpha=0.85, pad=2, linewidth=0))
    ax.axis("off")
    plt.title(f"Faster R-CNN Detections (threshold={threshold})")
    plt.tight_layout()
    plt.show()
# Visualize the Faster R-CNN predictions computed for street.jpg above.
show_detections("street.jpg", predictions, threshold=0.7)
Running YOLOv8 (Ultralytics)
For real-time speed, use Ultralytics YOLO:
pip install ultralytics
from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # n=nano (fastest), s/m/l/x = larger/more accurate
# Detect in image; conf is the minimum confidence for a box to be kept
results = model("street.jpg", conf=0.4)
results[0].show()  # pop-up with annotated image
results[0].save("output.jpg")  # save to file
# Print detections (one result object per input image; boxes are its detections)
for box in results[0].boxes:
    cls = results[0].names[int(box.cls)]  # map numeric class id -> class name
    conf = float(box.conf)
    xyxy = box.xyxy[0].tolist()  # [x_min, y_min, x_max, y_max] in pixels
    print(f"{cls:15s} conf={conf:.2%} box={[round(v) for v in xyxy]}")
# Detect in video (real-time); save=True writes an annotated copy to disk
results = model("traffic_video.mp4", save=True, conf=0.4)
An object detection model predicts a bounding box with IoU = 0.3 vs. the ground truth. What does this mean? (Hint: compare against the IoU ≥ 0.5 PASCAL VOC threshold above — would this count as a correct detection?)