Text Preprocessing
Why Text Preprocessing?
Computers don’t understand words — they understand numbers. Before feeding text to any ML model, we need to convert it to a numerical representation. But raw text is noisy: different capitalization, punctuation, irrelevant words, morphological variants.
Preprocessing pipeline:
Raw text → Clean → Tokenize → Normalize → Vectorize → Numbers ✓
Step 1: Basic Cleaning
import re


def clean_text(text):
    """Normalize raw text for NLP: lowercase, then strip HTML tags, URLs,
    and non-letter characters, finally collapsing runs of whitespace."""
    # Ordered (pattern, replacement) passes applied after lowercasing.
    substitutions = [
        (r"<[^>]+>", ""),      # HTML tags
        (r"http\S+", ""),      # URLs
        (r"[^a-zA-Z\s]", ""),  # anything that is not a letter or whitespace
        (r"\s+", " "),         # squeeze whitespace to single spaces
    ]
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


raw = "Check out https://example.com!! It's AMAZING 🎉 <p>buy now</p>"
print(clean_text(raw))
# "check out its amazing buy now"
Step 2: Tokenization
Breaking text into individual tokens (usually words):
# Simplest possible tokenizer: str.split() with no arguments
# breaks on any run of whitespace.
text = "machine learning is fascinating and powerful"
tokens = text.split()
print(tokens)
# ['machine', 'learning', 'is', 'fascinating', 'and', 'powerful']
# Better: use NLTK — its tokenizers handle punctuation and sentence boundaries.
import nltk

# NOTE(review): recent NLTK releases may additionally require the
# "punkt_tab" resource for word_tokenize — confirm against your version.
nltk.download("punkt", quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Natural Language Processing is amazing. Let's learn it!"
word_tokens = word_tokenize(text)
sent_tokens = sent_tokenize(text)
print(word_tokens)
# ['Natural', 'Language', 'Processing', 'is', 'amazing', '.', 'Let', "'s", 'learn', 'it', '!']
# (the Treebank tokenizer splits contractions: "Let's" -> 'Let' + "'s")
print(sent_tokens)
# ['Natural Language Processing is amazing.', "Let's learn it!"]
Step 3: Removing Stopwords
“Stopwords” are common words (the, is, a, and) that carry little meaning:
from nltk.corpus import stopwords

# Fetch the English stopword list once; a set gives O(1) membership tests.
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

tokens = word_tokenize("the quick brown fox jumps over the lazy dog")
# Compare case-insensitively: the stopword list is all lowercase.
filtered = [word for word in tokens if word.lower() not in stop_words]
print(filtered)
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Step 4: Stemming & Lemmatization
Reducing words to their base form:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNetLemmatizer needs TWO corpora on modern NLTK: "wordnet" (the lemma
# database) and "omw-1.4" (Open Multilingual Wordnet) — without the latter,
# lemmatize() raises a LookupError on a fresh install.
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = ["running", "runs", "ran", "easily", "fairly", "studies", "studying"]
for w in words:
    stemmed = stemmer.stem(w)
    lemma = lemmatizer.lemmatize(w, pos="v")  # pos="v": lemmatize as a verb
    print(f"{w:12s} → stem: {stemmed:10s} | lemma: {lemma}")
# running → stem: run | lemma: run
# runs → stem: run | lemma: run
# ran → stem: ran | lemma: run
# easily → stem: easili | lemma: easily (needs POS)
# studies → stem: studi | lemma: study
Stemming — crude, cuts off word endings (fast but imprecise)
Lemmatization — uses vocabulary to find actual root word (slower but better)
Step 5: Text to Numbers
Bag of Words (BoW)
Count how many times each word appears:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "I love machine learning",
    "Machine learning is fascinating",
    "I love Python programming",
]

# CountVectorizer lowercases, tokenizes, and counts term occurrences.
# (Its default token pattern keeps only tokens of 2+ characters, so "I"
# never becomes a feature.)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Render the sparse document-term matrix as a labelled table.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(df)
# fascinating is learning love machine programming python
# 0 0 0 1 1 1 0 0
# 1 1 1 1 0 1 0 0
# 2 0 0 0 1 0 1 1
TF-IDF — Term Frequency-Inverse Document Frequency
TF-IDF weights words by how important they are (common words get lower weight):
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, English stopwords removed, vocabulary capped at 5000.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))
X = tfidf.fit_transform(corpus)
print(X.shape)  # (3, num_features)

# Report the three highest-weighted terms in each document.
feature_names = tfidf.get_feature_names_out()
for i, _ in enumerate(corpus):
    row = X[i].toarray().ravel()
    top_idx = row.argsort()[-3:][::-1]  # indices of the 3 largest scores, descending
    print(f"Doc {i}: {', '.join(feature_names[top_idx])}")
Text Classification Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

# Load a 3-class slice of the 20 Newsgroups corpus. Headers, footers, and
# quoted replies are stripped because they can leak the label.
cats = ["sci.space", "comp.graphics", "rec.sport.hockey"]
data = fetch_20newsgroups(categories=cats, remove=("headers", "footers", "quotes"))

# Hold out 20% for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Single estimator chaining TF-IDF features into a linear classifier,
# so vectorization is fit only on the training split.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000, stop_words="english", ngram_range=(1, 2))),
    ("classifier", LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, y_train)

accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2%}")  # usually ~95%

# Classify previously unseen text through the same pipeline.
sample = ["The space shuttle launch was a success!"]
pred = pipeline.predict(sample)[0]
print(f"Predicted category: {data.target_names[pred]}")  # sci.space
Why does TF-IDF give lower scores to words like 'the' and 'is' that appear in almost every document?