Module 6 — NLP & Large Language Models

Using HuggingFace

The HuggingFace Ecosystem

HuggingFace provides:

  • Hub: 500,000+ pretrained models (text, image, audio)
  • Transformers library: Unified API for all those models
  • Datasets library: 50,000+ ready-to-use datasets
  • Trainer: High-level training loop
pip install transformers datasets evaluate accelerate
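
As a first taste of the Datasets library, one call downloads a ready-to-use corpus (a quick sketch; the "imdb" movie-review dataset is a public Hub dataset, chosen here purely as an illustration):

from datasets import load_dataset

# Download a ready-to-use dataset from the Hub
imdb = load_dataset("imdb")
print(imdb)  # DatasetDict with 'train', 'test' and 'unsupervised' splits
print(imdb["train"][0]["text"][:100])  # first 100 characters of a review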

The pipeline() API — 5 Minutes to NLP

The simplest way to use a model: one function call and you have a working system.

from transformers import pipeline

# Sentiment Analysis
classifier = pipeline("sentiment-analysis")
results = classifier([
    "I love machine learning!",
    "This tutorial is too complicated.",
    "The results are surprisingly good."
])
for r in results:
    print(f"{r['label']:10s}  {r['score']:.2%}")
# POSITIVE    99.8%
# NEGATIVE    97.4%
# POSITIVE    88.3%

Other Pipeline Tasks

# Named Entity Recognition
ner = pipeline("ner", aggregation_strategy="simple")  # merges subword tokens into entities (replaces the deprecated grouped_entities=True)
ner("Barack Obama was born in Honolulu, Hawaii and worked at Harvard.")
# [{'entity_group': 'PER', 'word': 'Barack Obama', 'score': 0.9995},
#  {'entity_group': 'LOC', 'word': 'Honolulu', 'score': 0.9981},
#  {'entity_group': 'LOC', 'word': 'Hawaii', 'score': 0.9976},
#  {'entity_group': 'ORG', 'word': 'Harvard', 'score': 0.9912}]

# Text Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
article = """
Machine learning is a branch of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
It focuses on developing computer programs that can access data and use it to
learn for themselves. The process begins with observations or data.
"""
summary = summarizer(article, max_length=50, min_length=10)
print(summary[0]['summary_text'])

# Translation
translator = pipeline("translation_en_to_fr")
print(translator("Hello, how are you today?")[0]['translation_text'])

# Text Generation
generator = pipeline("text-generation", model="gpt2")
output = generator("Once upon a time in the world of AI,", max_new_tokens=50)
print(output[0]['generated_text'])

# Question Answering
qa = pipeline("question-answering")
context = "Python was created by Guido van Rossum and first released in 1991."
print(qa(question="Who created Python?", context=context)['answer'])
# Guido van Rossum
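
Each task downloads a default model when none is specified, and that default can change between library versions. For reproducible results, pin the model explicitly and, optionally, the device (a sketch; the model id below is the usual English sentiment checkpoint):

# Pin an explicit model and device instead of relying on task defaults
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1,  # -1 = CPU; 0 = first GPU
)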

Under the Hood: Tokenizer + Model

Every pipeline is just a Tokenizer + Model:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenization
text = "I really enjoyed this course!"
inputs = tokenizer(text, return_tensors="pt")
print(inputs)
# {'input_ids': tensor([[...]]),
#  'attention_mask': tensor([[...]])}
print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
# ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'course', '!', '[SEP]']

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
probs = torch.softmax(logits, dim=-1)
predicted = torch.argmax(probs)
print(f"Class: {model.config.id2label[predicted.item()]}")  # POSITIVE
print(f"Score: {probs[0][predicted].item():.2%}")           # 99.7%

Getting Embeddings

Use a model as a feature extractor (no classification head):

from transformers import AutoTokenizer, AutoModel
import torch

def mean_pooling(model_output, attention_mask):
    """Average token embeddings, ignoring padding."""
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    return (token_embeddings * mask).sum(1) / mask.sum(1)

# sentence-transformers models are trained specifically to produce sentence embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

sentences = [
    "Machine learning is fascinating",
    "AI and deep learning are amazing",
    "I love hiking in the mountains",
]

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

embeddings = mean_pooling(outputs, inputs['attention_mask'])
print(embeddings.shape)  # (3, 384)

# Cosine similarity
from torch.nn.functional import cosine_similarity
e = embeddings
print(f"ML vs AI:      {cosine_similarity(e[0:1], e[1:2]).item():.3f}")  # ~0.82 (similar!)
print(f"ML vs hiking:  {cosine_similarity(e[0:1], e[2:3]).item():.3f}")  # ~0.11 (different)

Fine-tuning a Model

Fine-tuning means taking a pretrained model and training it for a few more epochs on your specific task, usually with a small labeled dataset.

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
import evaluate
import numpy as np

# --- 1. Your dataset ---
data = {
    "text": [
        "This product is amazing!",
        "Worst purchase ever.",
        "Pretty good, would recommend.",
        "Total waste of money.",
        "Absolutely love it!",
        "Doesn't work at all.",
    ],
    "label": [1, 0, 1, 0, 1, 0],
}
dataset = Dataset.from_dict(data)
split = dataset.train_test_split(test_size=0.33, seed=42)
train_ds, eval_ds = split["train"], split["test"]

# --- 2. Tokenize ---
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
eval_ds  = eval_ds.map(tokenize, batched=True)

# --- 3. Model ---
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# --- 4. Metrics ---
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# --- 5. Training ---
args = TrainingArguments(
    output_dir="./sentiment-model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # renamed to eval_strategy in recent transformers releases
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,  # newer transformers versions use processing_class instead
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

# --- 6. Save & Load ---
trainer.save_model("./my-sentiment-model")

# Later...
from transformers import pipeline
clf = pipeline("sentiment-analysis", model="./my-sentiment-model")
print(clf("Absolutely fantastic service!"))
# [{'label': 'POSITIVE', 'score': 0.9981}]

Feature Extraction vs. Fine-tuning

Approach            When to use                                    Pros                 Cons
Pipeline only       Standard tasks (sentiment, NER, translation)   Zero code, fast      Can't customize
Feature extraction  Need embeddings for a downstream task          No GPU needed, fast  Less task-specific
Fine-tuning         Custom domain / labels                         Best accuracy        Needs labeled data + GPU
Train from scratch  Very unusual domain (e.g., DNA sequences)      Full control         Needs massive data
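
To make the feature-extraction row concrete: freeze the embedding model and train a lightweight classifier on top (a sketch; assumes scikit-learn is installed, and the four texts and labels are toy data):

from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def embed(texts):
    """Frozen mean-pooled embeddings (same idea as in the section above)."""
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state
    mask = enc["attention_mask"].unsqueeze(-1).float()
    return ((hidden * mask).sum(1) / mask.sum(1)).numpy()

texts = ["Amazing!", "Terrible.", "Loved it!", "Awful experience."]
labels = [1, 0, 1, 0]  # toy labels: 1 = positive, 0 = negative
clf = LogisticRegression().fit(embed(texts), labels)
print(clf.predict(embed(["Really enjoyed this!"])))  # likely [1]
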
Knowledge Check

When you run `pipeline('sentiment-analysis')` without specifying a model, HuggingFace automatically downloads...