# Module 6 — NLP & Large Language Models (Advanced · 30 min)
## Using HuggingFace
### The HuggingFace Ecosystem
HuggingFace provides:
- Hub: 500,000+ pretrained models (text, image, audio)
- Transformers library: Unified API for all those models
- Datasets library: 50,000+ ready-to-use datasets
- Trainer: High-level training loop
```bash
pip install transformers datasets evaluate accelerate
```
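As a quick taste of the Datasets library mentioned above (a minimal sketch; `imdb` is just one of the many hosted datasets), loading data is a one-liner:

```python
from datasets import load_dataset

# Download a dataset from the Hub (cached locally after the first call)
imdb = load_dataset("imdb")
print(imdb)  # DatasetDict with its available splits
print(imdb["train"][0]["text"][:100])  # first 100 chars of the first review
```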
### The pipeline() API — 5 Minutes to NLP
The simplest way to use a model. One function call, ready to use.
```python
from transformers import pipeline

# Sentiment Analysis
classifier = pipeline("sentiment-analysis")

results = classifier([
    "I love machine learning!",
    "This tutorial is too complicated.",
    "The results are surprisingly good.",
])

for r in results:
    print(f"{r['label']:10s} {r['score']:.2%}")
# POSITIVE   99.8%
# NEGATIVE   97.4%
# POSITIVE   88.3%
```
### Other Pipeline Tasks
```python
# Named Entity Recognition
ner = pipeline("ner", grouped_entities=True)
ner("Barack Obama was born in Honolulu, Hawaii and worked at Harvard.")
# [{'entity_group': 'PER', 'word': 'Barack Obama', 'score': 0.9995},
#  {'entity_group': 'LOC', 'word': 'Honolulu',     'score': 0.9981},
#  {'entity_group': 'LOC', 'word': 'Hawaii',       'score': 0.9976},
#  {'entity_group': 'ORG', 'word': 'Harvard',      'score': 0.9912}]

# Text Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
article = """
Machine learning is a branch of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
It focuses on developing computer programs that can access data and use it to
learn for themselves. The process begins with observations or data.
"""
summary = summarizer(article, max_length=50, min_length=10)
print(summary[0]['summary_text'])

# Translation
translator = pipeline("translation_en_to_fr")
print(translator("Hello, how are you today?")[0]['translation_text'])

# Text Generation
generator = pipeline("text-generation", model="gpt2")
output = generator("Once upon a time in the world of AI,", max_new_tokens=50)
print(output[0]['generated_text'])

# Question Answering
qa = pipeline("question-answering")
context = "Python was created by Guido van Rossum and first released in 1991."
print(qa(question="Who created Python?", context=context)['answer'])
# Guido van Rossum
```
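All of these run on the CPU by default. A minimal sketch of GPU usage (assuming a CUDA-capable PyTorch install): pass `device` to move the whole pipeline onto a GPU:

```python
import torch
from transformers import pipeline

# device=0 -> first CUDA GPU, device=-1 -> CPU (the default)
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("sentiment-analysis", device=device)
```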
### Under the Hood: Tokenizer + Model
Every pipeline is just a Tokenizer + Model:
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenization
text = "I really enjoyed this course!"
inputs = tokenizer(text, return_tensors="pt")
print(inputs)
# {'input_ids': tensor([[...]]),
#  'attention_mask': tensor([[...]])}

print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
# ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'course', '!', '[SEP]']

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
probs = torch.softmax(logits, dim=-1)
predicted = torch.argmax(probs, dim=-1).item()
print(f"Class: {model.config.id2label[predicted]}")  # POSITIVE
print(f"Score: {probs[0, predicted].item():.2%}")    # 99.7%
```
### Getting Embeddings
Use a model as a feature extractor (no classification head):
```python
from transformers import AutoTokenizer, AutoModel
import torch

def mean_pooling(model_output, attention_mask):
    """Average token embeddings, ignoring padding."""
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    return (token_embeddings * mask).sum(1) / mask.sum(1)

# sentence-transformers checkpoints give strong sentence embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

sentences = [
    "Machine learning is fascinating",
    "AI and deep learning are amazing",
    "I love hiking in the mountains",
]

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

embeddings = mean_pooling(outputs, inputs['attention_mask'])
print(embeddings.shape)  # torch.Size([3, 384])

# Cosine similarity
from torch.nn.functional import cosine_similarity

e = embeddings
print(f"ML vs AI:     {cosine_similarity(e[0:1], e[1:2]).item():.3f}")  # ~0.82 (similar!)
print(f"ML vs hiking: {cosine_similarity(e[0:1], e[2:3]).item():.3f}")  # ~0.11 (different)
```
### Fine-tuning a Model
Fine-tuning = take a pretrained model and train it for a few more epochs on your specific task, using a small labeled dataset.
```python
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
import evaluate
import numpy as np

# --- 1. Your dataset ---
data = {
    "text": [
        "This product is amazing!",
        "Worst purchase ever.",
        "Pretty good, would recommend.",
        "Total waste of money.",
        "Absolutely love it!",
        "Doesn't work at all.",
    ],
    "label": [1, 0, 1, 0, 1, 0],
}
dataset = Dataset.from_dict(data)
split = dataset.train_test_split(test_size=0.33, seed=42)
train_ds, eval_ds = split["train"], split["test"]

# --- 2. Tokenize ---
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
eval_ds = eval_ds.map(tokenize, batched=True)

# --- 3. Model ---
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# --- 4. Metrics ---
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# --- 5. Training ---
args = TrainingArguments(
    output_dir="./sentiment-model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer releases
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()
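
# Optional extra step (not part of the original recipe): evaluate the best
# checkpoint on the held-out split; load_best_model_at_end=True restored it
print(trainer.evaluate())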

# --- 6. Save & Load ---
trainer.save_model("./my-sentiment-model")

# Later...
from transformers import pipeline

clf = pipeline("sentiment-analysis", model="./my-sentiment-model")
print(clf("Absolutely fantastic service!"))
# [{'label': 'POSITIVE', 'score': 0.9981}]
```
### Feature Extraction vs. Fine-tuning
| Approach | When to use | Pros | Cons |
|---|---|---|---|
| Pipeline only | Standard tasks (sentiment, NER, translation) | Zero code, fast | Can’t customize |
| Feature extraction | Need embeddings for downstream task | No GPU needed, fast | Less task-specific |
| Fine-tuning | Custom domain / labels | Best accuracy | Needs labeled data + GPU |
| Train from scratch | Very unusual domain (e.g., DNA sequences) | Full control | Needs massive data |
When you run `pipeline('sentiment-analysis')` without specifying a model, HuggingFace automatically downloads a default checkpoint for that task (for sentiment analysis, the same `distilbert-base-uncased-finetuned-sst-2-english` used above). Defaults can change between library versions, so pin the model explicitly when you need reproducible results:
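```python
from transformers import pipeline

# Pin an exact checkpoint instead of relying on the task default
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
```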