Text Preprocessing
Why Text Preprocessing?
Computers don’t understand words — they understand numbers. Before feeding text to any ML model, we need to convert it to a numerical representation. But raw text is noisy: different capitalization, punctuation, irrelevant words, morphological variants.
Preprocessing pipeline:
Raw text → Clean → Tokenize → Normalize → Vectorize → Numbers ✓
Step 1: Basic Cleaning
import re


def clean_text(text):
    """Normalize raw text for NLP: lowercase, then strip HTML tags, URLs,
    and non-letter characters, finally collapsing runs of whitespace."""
    # Ordered (pattern, replacement) passes applied after lowercasing.
    substitutions = [
        (r"<[^>]+>", ""),      # HTML tags
        (r"http\S+", ""),      # URLs
        (r"[^a-zA-Z\s]", ""),  # anything that is not a letter or whitespace
        (r"\s+", " "),         # squeeze whitespace to single spaces
    ]
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


raw = "Check out https://example.com!! It's AMAZING 🎉 <p>buy now</p>"
print(clean_text(raw))
# "check out its amazing buy now"
Step 2: Tokenization
Breaking text into individual tokens (usually words):
# Simplest possible tokenizer: str.split() with no arguments
# breaks on any run of whitespace.
text = "machine learning is fascinating and powerful"
tokens = text.split()
print(tokens)
# ['machine', 'learning', 'is', 'fascinating', 'and', 'powerful']
# Better: use NLTK — its tokenizers handle punctuation and sentence boundaries.
import nltk

# NOTE(review): recent NLTK releases may additionally require the
# "punkt_tab" resource for word_tokenize — confirm against your version.
nltk.download("punkt", quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Natural Language Processing is amazing. Let's learn it!"
word_tokens = word_tokenize(text)
sent_tokens = sent_tokenize(text)
print(word_tokens)
# ['Natural', 'Language', 'Processing', 'is', 'amazing', '.', 'Let', "'s", 'learn', 'it', '!']
# (the Treebank tokenizer splits contractions: "Let's" -> 'Let' + "'s")
print(sent_tokens)
# ['Natural Language Processing is amazing.', "Let's learn it!"]
Step 3: Removing Stopwords
“Stopwords” are common words (the, is, a, and) that carry little meaning:
from nltk.corpus import stopwords

# Fetch the English stopword list once; a set gives O(1) membership tests.
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

tokens = word_tokenize("the quick brown fox jumps over the lazy dog")
# Compare case-insensitively: the stopword list is all lowercase.
filtered = [word for word in tokens if word.lower() not in stop_words]
print(filtered)
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Step 4: Stemming & Lemmatization
Reducing words to their base form:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNetLemmatizer needs TWO corpora on modern NLTK: "wordnet" (the lemma
# database) and "omw-1.4" (Open Multilingual Wordnet) — without the latter,
# lemmatize() raises a LookupError on a fresh install.
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "runs", "ran", "easily", "fairly", "studies", "studying"]
for w in words:
    stemmed = stemmer.stem(w)
    lemma = lemmatizer.lemmatize(w, pos="v")  # pos="v": lemmatize as a verb
    print(f"{w:12s} → stem: {stemmed:10s} | lemma: {lemma}")
# running → stem: run | lemma: run
# runs → stem: run | lemma: run
# ran → stem: ran | lemma: run
# easily → stem: easili | lemma: easily (needs POS)
# studies → stem: studi | lemma: study
Stemming — crude, cuts off word endings (fast but imprecise)
Lemmatization — uses vocabulary to find actual root word (slower but better)
Step 5: Text to Numbers
Bag of Words (BoW)
Count how many times each word appears:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "I love machine learning",
    "Machine learning is fascinating",
    "I love Python programming",
]

# CountVectorizer lowercases, tokenizes, and counts term occurrences.
# (Its default token pattern keeps only tokens of 2+ characters, so "I"
# never becomes a feature.)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Render the sparse document-term matrix as a labelled table.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(df)
# fascinating is learning love machine programming python
# 0 0 0 1 1 1 0 0
# 1 1 1 1 0 1 0 0
# 2 0 0 0 1 0 1 1
TF-IDF — Term Frequency-Inverse Document Frequency
TF-IDF weights words by how important they are (common words get lower weight):
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, English stopwords removed, vocabulary capped at 5000.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))
X = tfidf.fit_transform(corpus)
print(X.shape)  # (3, num_features)

# Report the three highest-weighted terms in each document.
feature_names = tfidf.get_feature_names_out()
for i, _ in enumerate(corpus):
    row = X[i].toarray().ravel()
    top_idx = row.argsort()[-3:][::-1]  # indices of the 3 largest scores, descending
    print(f"Doc {i}: {', '.join(feature_names[top_idx])}")
Text Classification Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

# Load a 3-class slice of the 20 Newsgroups corpus. Headers, footers, and
# quoted replies are stripped because they can leak the label.
cats = ["sci.space", "comp.graphics", "rec.sport.hockey"]
data = fetch_20newsgroups(categories=cats, remove=("headers", "footers", "quotes"))

# Hold out 20% for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Single estimator chaining TF-IDF features into a linear classifier,
# so vectorization is fit only on the training split.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000, stop_words="english", ngram_range=(1, 2))),
    ("classifier", LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, y_train)

accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2%}")  # usually ~95%

# Classify previously unseen text through the same pipeline.
sample = ["The space shuttle launch was a success!"]
pred = pipeline.predict(sample)[0]
print(f"Predicted category: {data.target_names[pred]}")  # sci.space
Why does TF-IDF give lower scores to words like 'the' and 'is' that appear in almost every document?