%%HTML
<link rel="stylesheet" type="text/css" href="../utils/custom.css">
Foundations in Data Science and Machine Learning¶
Module 6: Natural Language Processing¶
Malka Guillot¶

Table of Contents¶
- Prologue
- Dictionary methods
- Tokenization
- Measures of document distances
- Topic models
- Embeddings
Prologue¶
Motivation¶
Much of economic research has relied on structured data
- Usually stored in a relational database
- Sometimes called relational data
- Easily mapped into specific fields
Research involving unstructured data is on the rise
- Text, images/videos, audio recordings, ... → treasure troves for (social science) researchers
- Such data have long required analysis by humans
- With machine learning and AI, we now have tools to work with vast quantities of such data
[Motivation] The rise of text data¶
This trend is in large part due to the digitization of our societies.
The digital era generates considerable amounts of text.
- Social media and internet queries
- Wikipedia, online newspapers, TV transcripts
- Digitized books, speeches, laws
It is matched with a similar increase in computational resources.
- Moore’s law = processing power of computers doubles every two years (since the 70s!)
Natural language processing¶
Natural language processing is a data-driven approach to the analysis of text documents.
Applications in your everyday life:
- Search engines, translation services, spam detection
Applications in social science:
- Measuring economic policy uncertainty, news sentiment, racial and misogynistic bias, political and economic narratives, speech polarization
- Predicting protests, GDP growth, financial market fluctuations
This course¶
Focus on natural language processing in applied economic research
Contents:
- Dictionary-based methods, measures of text distance, topic models, embeddings, supervised learning
Why is this useful for economic research?
- Measure economic/political/social concepts in texts
- New variables
- “Old” variables in new ways (e.g., more easily/flexibly)
- Use text-based variables as regressors or outcomes
- Assess the real-world impacts of language on government and the economy.
- In particular: new avenues to assess the relationship between the economy/politics and language
A special characteristic of text data: high dimensionality¶
Text is very high-dimensional
Sample of documents, each $n_L$ words long, drawn from a vocabulary of $n_V$ words.
The unique representation of each document has dimension $n_V^{n_L}$.
- For example: take a sample of 30-word Twitter messages using only the one thousand most common words in the English language
- $\rightarrow$ Dimensionality $= 1000^{30} = 10^{90}$
“Text as Data” by Gentzkow, Kelly, Taddy (2017)¶
Summarize the analysis in three steps:
- Convert raw text $D$ to numerical array $\mathbf{C}$
- The elements of $\mathbf{C}$ are counts over tokens (words or phrases)
- Map $\mathbf{C}$ to predicted values $\mathbf{\hat V}$ of unknown outcomes $\mathbf{V}$
- Learn $\mathbf{\hat V(C)}$ using machine learning
- Supervised learning: for some labeled $C_i$ and $V_i$
- Unsupervised learning: topics/dimensions just from $\mathbf{C}$
- Use $\hat V$ for subsequent descriptive or causal analysis
Corpora¶
Text data is a sequence of characters, called documents. The set of documents is the corpus, which we will call $D$.
Text data is unstructured:
- Relevant/needed information is mixed with (lots of) irrelevant, unneeded information
All text data approaches throw away some information:
- Challenge: retaining the valuable information
Tokenization and dimension reduction:
- Transform an unstructured corpus $D$ into a usable matrix $X$
What counts as a document?¶
The unit of analysis (the “document”) varies depending on the application:
- Needs to be fine enough to fit the relevant metadata variation
- More often than not, we care about metadata!
- Should not be finer than necessary – to avoid high-dimensionality without relevant empirical variation
What should we use as the document here?
- Predicting whether a judge is right-wing or left-wing in partisan ideology, from their written opinions
- Predicting whether parliamentary speeches become more emotive in the run-up to an election
Setup the data¶
20 Newsgroups dataset from sklearn¶
We use as an example the 20 Newsgroups dataset (from sklearn), a collection of about 20,000 newsgroup (message forum) documents.
from sklearn.datasets import fetch_20newsgroups

# load the training split (11,314 documents)
data = fetch_20newsgroups(subset='train')
W, y = data['data'], data['target']
n_samples = y.shape[0]
n_samples
11314
y: news story categories
W: a set of documents
y[:10] # news story categories
array([ 7, 4, 4, 1, 14, 16, 13, 3, 2, 4])
One document¶
doc = W[0] # first document (news story)
doc[:300]
"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be "
Store the data in a pandas dataframe¶
import pandas as pd

df = pd.DataFrame(W, columns=['text'])
df['topic'] = y
df.head()
| | text | topic |
|---|---|---|
| 0 | From: lerxst@wam.umd.edu (where's my thing)\nS... | 7 |
| 1 | From: guykuo@carson.u.washington.edu (Guy Kuo)... | 4 |
| 2 | From: twillis@ec.ecn.purdue.edu (Thomas E Will... | 4 |
| 3 | From: jgreen@amber (Joe Green)\nSubject: Re: W... | 1 |
| 4 | From: jcm@head-cfa.harvard.edu (Jonathan McDow... | 14 |
Dictionary methods¶
Dictionary methods¶
Dictionary-based text methods
- use a pre-selected list of words or phrases to analyze a corpus
- Regular expressions are the workhorse for this task
Corpus-specific dictionaries: counting sets of words or phrases across documents
- (e.g., the number of times a judge says “justice” vs. “efficiency”; see the sketch below)
General dictionaries: WordNet, LIWC, MFD, etc.
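A minimal sketch of such a corpus-specific dictionary count with regular expressions, using the df DataFrame built above; the term list and the new column names are illustrative only.
# sketch: count pre-selected dictionary terms in each document (terms and
# column names are illustrative, not part of any published dictionary)
import re

dictionary = ['justice', 'efficiency']          # pre-selected terms
patterns = {w: re.compile(rf'\b{w}\b', re.IGNORECASE) for w in dictionary}

for w, pat in patterns.items():
    # number of occurrences of the term in each document
    df[f'n_{w}'] = df['text'].apply(lambda t: len(pat.findall(t)))

df[['n_justice', 'n_efficiency']].sum()         # corpus-wide counts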
Example: dictionary methods¶
Baker, Bloom, and Davis (QJE 2016), “Measuring Policy Uncertainty”¶
For each newspaper on each day since 1985, tag each article mentioning:
- Uncertainty word
- Economy word
- Policy word (e.g., “legislation”, “regulation”)
Then, normalize the resulting article counts by the total number of newspaper articles that month

WordNet¶
- English word database: 118K nouns, 12K verbs, 22K adjectives, 5K adverbs
- Synonym sets (synsets) are a group of near-synonyms, plus a gloss (definition)
- Also contains information on antonyms (opposites), holonyms/meronyms (part-whole)
- Nouns are organized in a categorical hierarchy (hence “WordNet”)
- “hypernym” – the higher category that a word is a member of
- “hyponyms” – members of the category identified by a word
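A quick look-up example for the WordNet structure described above, via nltk (this assumes the wordnet corpus has been downloaded, as done later in this notebook with nltk.download('wordnet')).
# inspect the first synset for "car" (assumes nltk's wordnet data is installed)
from nltk.corpus import wordnet as wn

syn = wn.synsets('car')[0]     # first synset for "car"
print(syn.lemma_names())       # near-synonyms in the synset
print(syn.definition())        # the gloss
print(syn.hypernyms())         # the higher category ("is-a" parent)
print(syn.hyponyms()[:3])      # some members of the category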
General dictionaries¶
Function words (e.g. for, rather, than)
- Also called stopwords (often removed)
- Can be used to get at non-topical dimensions and identify authors
LIWC (pronounced “Luke”): Linguistic Inquiry and Word Counts
- 2300 words
- 70 lists of category-relevant words, e.g. “emotion”, “cognition”, “work”, “family”, “positive”, “negative”, etc.
Mohammad and Turney (2011)
- 10,000 words coded along four emotional dimensions: joy–sadness, anger–fear, trust–disgust, anticipation–surprise
Warriner et al. (2013)
- 14,000 words coded along three emotional dimensions: valence, arousal, dominance
Sentiment Analysis¶
- Extract a “tone” dimension – positive, negative, neutral
- Dictionaries are extensively used for sentiment analysis:
- Let $(w_i, s_i)$ be pairs of words $w_i$ and their associated sentiment scores $s_i \in [-1, 1]$, e.g., (“perfect”, 0.8), (“awful”, -0.9)
- The sentiment score of a phrase $j$ made of $K$ tokens is then the average (a toy example follows this list):
$$ s_j = \frac{1}{K}\sum_{i=1}^{K}s_i$$
- The standard lexicon-based approach fails easily: e.g., “good” versus “not good” versus “not very good”
- The huggingface model hub has a number of transformer-based sentiment models
- Off-the-shelf scores may be trained on specific and/or biased corpora
- For example, online data
- May not work for other data, e.g., parliamentary speeches, legal texts...
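A toy illustration of the lexicon-based formula above: average the scores of a phrase's tokens. The word scores are hypothetical, and unmatched tokens simply score 0 here; the second example shows the negation failure mentioned above.
# toy lexicon scoring (hypothetical scores; unmatched tokens count as 0)
lexicon = {'perfect': 0.8, 'awful': -0.9, 'good': 0.5}

def lexicon_score(phrase):
    tokens = phrase.lower().split()
    return sum(lexicon.get(t, 0.0) for t in tokens) / len(tokens)

print(lexicon_score("the movie was perfect"))    # clearly positive
print(lexicon_score("the movie was not good"))   # negation is missed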
Using the VADER SentimentIntensityAnalyzer from nltk¶
#!pip install nltk
import nltk
# Download the lexicon
nltk.download("vader_lexicon")
[nltk_data] Downloading package vader_lexicon to [nltk_data] /Users/malka/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
True
# Import the lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()
# Example
sentence = "VADER is pretty good at identifying the underlying sentiment of a text!"
print(sent_analyzer.polarity_scores(sentence))
{'neg': 0.0, 'neu': 0.585, 'pos': 0.415, 'compound': 0.75}
For the news document:¶
sent_analyzer = SentimentIntensityAnalyzer()
polarity = sent_analyzer.polarity_scores(doc)
print(polarity)
{'neg': 0.012, 'neu': 0.916, 'pos': 0.072, 'compound': 0.807}
Applying the sentiment analysis to the DataFrame¶
dfs = df.sample(frac=.2) # sample 20% of the dataset
# apply compound sentiment score to data-frame
def get_sentiment(snippet):
    return sent_analyzer.polarity_scores(snippet)['compound']
dfs['sentiment'] = dfs['text'].apply(get_sentiment)
dfs.sort_values('sentiment',inplace=True)
[x[60:150] for x in dfs[-5:]['text']] # print beginning of most positive documents
['CLINTON: AM Press Briefing by Dee Dee Myers -- 4.15.93\nOrganization: Project GNU, Free Sof', ' Newsletter, Part 2/4\nReply-To: david@stat.com (David Dodell)\nDistribution: world\nOrganiza', "CLINTON: President's Remarks at Town Hall Meeting\nOrganization: MIT Artificial Intelligenc", 'Final Public Dragon Magazine Update (Last chance for public bids)\nKeywords: Dragon Magazin', 'CLINTON: Background BRiefing in Vancouver 4.4.93\nOrganization: Project GNU, Free Software ']
NLP “bias” is statistical bias¶
- Sentiment scores that are trained on annotated datasets also learn from the correlated non-sentiment information
- Supervised sentiment models are confounded by correlated language factors

- For example, a model trained on movie reviews may learn that “good” is positive and “bad” is negative
- But it may also learn that “good” is more likely to be used in reviews of comedies, and “bad” in reviews of horror movies
(We already had this problem)¶
Supervised models (classifiers, regressors) learn features that are correlated with the label being annotated
Unsupervised models (topic models, word embeddings) learn correlations between topics/contexts
Dictionary methods, while having other limitations, mitigate this problem
- The researcher intentionally “regularizes” out spurious confounders with the targeted language dimension
- Helps explain why economists often still use dictionary methods...
Tokenization¶
Tokenization¶
A major goal of tokenization is to produce features that are
- Predictive in the learning task
- Interpretable by human investigators
- Tractable enough to be easy to work with
- Two broad approaches:
- Convert documents to vectors, usually frequency distributions over pre-processed $N$-grams
- Convert documents to sequences of tokens as inputs to sequential models (e.g., BERT, GPT, etc.)
A standard tokenization pipeline¶

Source: 'Natural Language Processing with Python', Loper, Klein, and Bird, Chapter 3.
Example text for tokenization¶
text = "Marie Curie was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in 2 scientific fields. Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first married couple to win the Nobel Prize and launching the Curie family legacy of 5 Nobel Prizes."
1. Pre-processing text¶
A key piece of the “art” of text analysis is deciding what data to throw out
- Uninformative data add noise and reduce statistical precision
- They are also computationally costly
Pre-processing choices can affect down-stream results, especially in unsupervised learning tasks (Denny and Spirling, 2018)
- Some features are more interpretable: “taxes are” / “are high” vs “taxes are high”
Capitalization¶
Removing capitalization is a standard corpus normalization technique
- Usually, the capitalized and non-capitalized versions of a word are equivalent – e.g., words showing up capitalized at the beginning of a sentence
- In those cases, capitalization is uninformative
For some tasks, capitalization is important
- Required for sentence splitting, part-of-speech tagging, named entity recognition, syntactic/semantic parsing
text_lower = text.lower() # go to lower-case
text_lower
'marie curie was the first woman to win a nobel prize, the first person to win a nobel prize twice, and the only person to win a nobel prize in 2 scientific fields. her husband, pierre curie, was a co-winner of her first nobel prize, making them the first married couple to win the nobel prize and launching the curie family legacy of 5 nobel prizes.'
Remove punctuation?¶
Inclusion of punctuation depends on the task:
- If one vectorizes the document as a bag of words or bag of N-grams, punctuation won’t be needed
- Like capitalization, punctuation is needed for annotations (sentence splitting, parts of speech, syntax, roles, etc.) or for text generators
# recipe for fast punctuation removal
from string import punctuation
punc_remover = str.maketrans('','',punctuation)
text_nopunc = text_lower.translate(punc_remover)
print(text_nopunc)
marie curie was the first woman to win a nobel prize the first person to win a nobel prize twice and the only person to win a nobel prize in 2 scientific fields her husband pierre curie was a cowinner of her first nobel prize making them the first married couple to win the nobel prize and launching the curie family legacy of 5 nobel prizes
Drop numbers?¶
Whether to keep numbers also depends on the task:
- Like punctuation, numbers can be dropped or replaced with a placeholder token (e.g., #); see the number-normalization steps below
Stemming/lemmatizing¶
Stemming: reducing words to their root form by crude suffix-stripping rules
- e.g., “running” → “run”, “taxation” → “taxat”
- Porter stemmer, Snowball stemmer, Lancaster stemmer
Lemmatizing: reducing words to their dictionary form (lemma)
- e.g., “running” → “run”, “better” → “good” (as an adjective)
- WordNet lemmatizer, spaCy lemmatizer

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english') # snowball stemmer, english
# remake list of tokens, replace with stemmed versions
tokens_stemmed = [stemmer.stem(t) for t in ['tax','taxes','taxed','taxation']]
print(tokens_stemmed)
['tax', 'tax', 'tax', 'taxat']
stemmer = SnowballStemmer('german') # snowball stemmer, german
print(stemmer.stem("Autobahnen"))
autobahn
Lemmatization with WordNetLemmatizer from nltk¶
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /Users/malka/nltk_data... [nltk_data] Package wordnet is already up-to-date!
True
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
[wnl.lemmatize(c) for c in ['corporation', 'corporations', 'corporate']]
['corporation', 'corporation', 'corporate']
Pre-processing function (homemade)¶
from string import punctuation
translator = str.maketrans('', '', punctuation)

from nltk.corpus import stopwords
# nltk.download('stopwords')  # run once if the stopwords corpus is missing
stoplist = set(stopwords.words('english'))

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

def normalize_text(doc):
    "Input doc and return a clean list of tokens"
    doc = doc.replace('\r', ' ').replace('\n', ' ')
    lower = doc.lower()                               # all lower case
    nopunc = lower.translate(translator)              # remove punctuation
    words = nopunc.split()                            # split into tokens
    nostop = [w for w in words if w not in stoplist]  # remove stopwords
    no_numbers = [w if not w.isdigit() else '#' for w in nostop]  # normalize numbers
    stemmed = [stemmer.stem(w) for w in no_numbers]   # stem each word
    return stemmed
Applying the pre-processing function to the DataFrame¶
df['tokens_cleaned'] = df['text'].apply(normalize_text)
df['tokens_cleaned'].head(5)
0 [lerxstwamumdedu, where, thing, subject, car, ... 1 [guykuocarsonuwashingtonedu, guy, kuo, subject... 2 [twillisececnpurdueedu, thoma, e, willi, subje... 3 [jgreenamb, joe, green, subject, weitek, p9000... 4 [jcmheadcfaharvardedu, jonathan, mcdowel, subj... Name: tokens_cleaned, dtype: object
Pre-processing function (readymade)¶
Shortcut: gensim.utils.simple_preprocess.
from gensim.utils import simple_preprocess
print(simple_preprocess(text))
['marie', 'curie', 'was', 'the', 'first', 'woman', 'to', 'win', 'nobel', 'prize', 'the', 'first', 'person', 'to', 'win', 'nobel', 'prize', 'twice', 'and', 'the', 'only', 'person', 'to', 'win', 'nobel', 'prize', 'in', 'scientific', 'fields', 'her', 'husband', 'pierre', 'curie', 'was', 'co', 'winner', 'of', 'her', 'first', 'nobel', 'prize', 'making', 'them', 'the', 'first', 'married', 'couple', 'to', 'win', 'the', 'nobel', 'prize', 'and', 'launching', 'the', 'curie', 'family', 'legacy', 'of', 'nobel', 'prizes']
df['tokens_simple'] = df['text'].apply(simple_preprocess)
df['tokens_simple'].head(5)
0 [from, lerxst, wam, umd, edu, where, my, thing... 1 [from, guykuo, carson, washington, edu, guy, k... 2 [from, twillis, ec, ecn, purdue, edu, thomas, ... 3 [from, jgreen, amber, joe, green, subject, re,... 4 [from, jcm, head, cfa, harvard, edu, jonathan,... Name: tokens_simple, dtype: object
2. Count and frequencies¶
Tokens¶
Token $=$ the most basic unit of representation in a text
A token is a sequence of characters that we want to treat as a group
- Usually, a word
- But could be a phrase, a number, a punctuation mark, etc.
- $N$-grams: sequences of $N$ tokens
- Moving window: for instance, “hello world, i am online now” becomes “(hello world), (world i), (i am), (am online), (online now)”
- Or learn a vocabulary of phrases and tokenize those: “Liège University” → “liege_university”
tokens = text_nopunc.split() # splits a string on white space
print(tokens)
['marie', 'curie', 'was', 'the', 'first', 'woman', 'to', 'win', 'a', 'nobel', 'prize', 'the', 'first', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'twice', 'and', 'the', 'only', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'in', '2', 'scientific', 'fields', 'her', 'husband', 'pierre', 'curie', 'was', 'a', 'cowinner', 'of', 'her', 'first', 'nobel', 'prize', 'making', 'them', 'the', 'first', 'married', 'couple', 'to', 'win', 'the', 'nobel', 'prize', 'and', 'launching', 'the', 'curie', 'family', 'legacy', 'of', '5', 'nobel', 'prizes']
Removing numbers¶
# remove numbers (keep if not a digit)
no_numbers = [t for t in tokens if not t.isdigit()]
print(no_numbers)
['marie', 'curie', 'was', 'the', 'first', 'woman', 'to', 'win', 'a', 'nobel', 'prize', 'the', 'first', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'twice', 'and', 'the', 'only', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'in', 'scientific', 'fields', 'her', 'husband', 'pierre', 'curie', 'was', 'a', 'cowinner', 'of', 'her', 'first', 'nobel', 'prize', 'making', 'them', 'the', 'first', 'married', 'couple', 'to', 'win', 'the', 'nobel', 'prize', 'and', 'launching', 'the', 'curie', 'family', 'legacy', 'of', 'nobel', 'prizes']
# keep if not a digit, else replace with "#"
norm_numbers = [t if not t.isdigit() else '#' for t in tokens]
print(norm_numbers)
['marie', 'curie', 'was', 'the', 'first', 'woman', 'to', 'win', 'a', 'nobel', 'prize', 'the', 'first', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'twice', 'and', 'the', 'only', 'person', 'to', 'win', 'a', 'nobel', 'prize', 'in', '#', 'scientific', 'fields', 'her', 'husband', 'pierre', 'curie', 'was', 'a', 'cowinner', 'of', 'her', 'first', 'nobel', 'prize', 'making', 'them', 'the', 'first', 'married', 'couple', 'to', 'win', 'the', 'nobel', 'prize', 'and', 'launching', 'the', 'curie', 'family', 'legacy', 'of', '#', 'nobel', 'prizes']
Removing stopwords¶
from nltk.corpus import stopwords # Stopwords
stoplist = stopwords.words('english')
# keep if not a stopword
nostop = [t for t in norm_numbers if t not in stoplist]
print(nostop)
['marie', 'curie', 'first', 'woman', 'win', 'nobel', 'prize', 'first', 'person', 'win', 'nobel', 'prize', 'twice', 'person', 'win', 'nobel', 'prize', '#', 'scientific', 'fields', 'husband', 'pierre', 'curie', 'cowinner', 'first', 'nobel', 'prize', 'making', 'first', 'married', 'couple', 'win', 'nobel', 'prize', 'launching', 'curie', 'family', 'legacy', '#', 'nobel', 'prizes']
# Counter is a quick pure-python solution.
from collections import Counter
freqs = Counter(tokens)
freqs.most_common()[:10]
[('the', 6), ('nobel', 6), ('prize', 5), ('first', 4), ('to', 4), ('win', 4), ('a', 4), ('curie', 3), ('was', 2), ('person', 2)]
3. N-grams¶
- N-grams are phrases, sequences of words up to length N.
- Bigrams, trigrams, quadgrams, etc.
N-grams and high dimensionality¶
- N-grams will blow up the feature space:
- Thus, filtering out uninformative N-grams is necessary
- The right number of features depends on the application
- I have gotten good performance with, e.g., 2,000 features
- For supervised learning tasks, a decent “rule of thumb” is to build a vocabulary of 60K features, then use feature selection to get down to 10K (see the sketch below)
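A minimal sketch of that 60K-then-10K rule of thumb, assuming the labeled corpus (df['text'], y) loaded above; the exact numbers and settings are illustrative only.
# sketch: build a large n-gram vocabulary, then select the 10K n-grams
# most associated with the labels (illustrative settings)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

big_vec = CountVectorizer(max_features=60000, ngram_range=(1, 2),
                          stop_words='english')
X_big = big_vec.fit_transform(df['text'])

selector = SelectKBest(chi2, k=10000)   # keep n-grams most associated with y
X_sel = selector.fit_transform(X_big, y)
X_sel.shape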
from nltk import ngrams
from collections import Counter
# get n-gram counts for the first ~50 documents
grams = []
for i, row in df.iterrows():
    tokens = row['text'].lower().split()   # whitespace tokenization
    for n in range(2, 4):
        grams += list(ngrams(tokens, n))   # bigrams and trigrams
    if i > 50:
        break
Counter(grams).most_common()[:8] # most frequent n-grams
[(('of', 'the'), 41), (('subject:', 're:'), 37), (('in', 'the'), 33), (('to', 'the'), 27), (('i', 'am'), 21), (('i', 'have'), 21), (('to', 'be'), 19), (('on', 'the'), 18)]
4. Parts of speech¶
Parts of speech (POS) tags provide useful word categories corresponding to their functions in sentences
- Eight main parts of speech: verb (VB), noun (NN), pronoun (PR), adjective (JJ), adverb (RB), determiner (DT), preposition (IN), conjunction (CC).
POS vary in their informativeness for various functions
- For categorizing topics, nouns are usually most important
- For sentiment, adjectives are usually most important
One can count POS tags as features – e.g., using more adjectives, or using more passive verbs
POS n-gram frequencies (e.g., NN, NV, VN, ...), like function words, are good stylistic features for authorship detection
- Not biased by topics/content
Install spaCy and download the model¶
pip install spacy
python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm')
Parts of speech tagging with spaCy¶
dfs = df.sample(10)
dfs['doc'] = dfs['text'].apply(nlp)
doc = dfs['doc'].iloc[0]
for token in doc[:10]:
    print(f"Token: {token.text}, POS: {token.pos_}")
Token: From, POS: ADP Token: :, POS: PUNCT Token: wcd82671@uxa.cso.uiuc.edu, POS: PROPN Token: (, POS: PUNCT Token: daniel, POS: PROPN Token: warren, POS: PROPN Token: c, POS: PROPN Token: ), POS: PUNCT Token: , POS: SPACE Token: Subject, POS: NOUN
5. Named Entity Recognition¶
Refers to the task of identifying named entities such as “December 1903” and “Pierre Curie”, which can be used as tokens
Detecting the type requires a trained model (e.g. spaCy)
- Common types: persons, organizations, locations, dates, etc.

# spaCy noun chunks (entity types proper come from doc.ents; see below)
chunks = list(nlp(df['text'].iloc[10]).noun_chunks)
chunks[:20]
[irwin@cmptrc.lonestar.org, (Irwin Arnstein, Subject, Recommendation, Duc Summary, What, it, Distribution, usa, Sat, May 1993 05:00:00 GMT Organization, CompuTrac Inc., Richardson TX Keywords, Ducati, GTS, I, a line, a Ducati 900GTS 1978 model, the clock, paint]
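For named-entity types proper (PERSON, ORG, DATE, ...), one can use doc.ents instead of noun_chunks; a quick look at the same document, assuming the nlp model loaded above.
# sketch: named entities with their types for one document
ner_doc = nlp(df['text'].iloc[10])
for ent in list(ner_doc.ents)[:10]:
    print(ent.text, ent.label_)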
Bag-of-words representation¶
The most common way to represent text data $D$ (i.e., a corpus) is as a matrix $X$ of token counts
- Each row is a document, each column is a token
- The value in each cell is the count of that token in that document
More generally, “bag-of-terms” representation refers to counts over any informative features – e.g. N-grams, syntax features, etc.
scikit-learn's CountVectorizer¶
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(min_df=0.001,        # keep terms in at least 0.1% of docs
                      max_df=0.8,          # drop terms in more than 80% of docs
                      max_features=1000,
                      stop_words='english',
                      ngram_range=(1, 3))  # words, bigrams, and trigrams
X = vec.fit_transform(df['text'])
# save the vectors
# pd.to_pickle(X,'X.pkl')
# save the vectorizer
# (so you can transform other documents, also for the vocab)
#pd.to_pickle(vec, 'vec-3grams-1.pkl')
X
<11314x1000 sparse matrix of type '<class 'numpy.int64'>' with 526707 stored elements in Compressed Sparse Row format>
Counts and frequencies¶
- Document counts: number of documents where a token appears
- Term counts: number of total appearances of a token in corpus
- Term frequency: $$\text{TF}(w, d) = \frac{\text{count of } w \text{ in document } d}{\text{total tokens in document } d}$$
Building a vocabulary¶
- An important featurization step is to build a vocabulary of words:
- Compute (document) frequencies for all words
- Inspect low-frequency words and determine a minimum document threshold
- For instance: 10 documents, or .25% of documents
- Can also impose more complex thresholds, e.g.:
- Appears twice in at least 20 documents
- Appears in at least 3 documents in at least 5 years
- Assign numerical identifiers to tokens to increase speed and reduce disk usage
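A small sketch for inspecting document frequencies from the CountVectorizer output above, to help pick a minimum-document threshold (this assumes a recent scikit-learn with get_feature_names_out).
# sketch: document frequency of each term in the current vocabulary
import numpy as np

doc_freq = np.asarray((X > 0).sum(axis=0)).ravel()   # docs containing each term
vocab = np.asarray(vec.get_feature_names_out())

rarest = np.argsort(doc_freq)[:10]                   # 10 least frequent terms
list(zip(vocab[rarest], doc_freq[rarest]))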
TF-IDF (Term Frequency-Inverse Document Frequency) weighting¶
TF-IDF: “term frequency × inverse document frequency”
The formula for word $w$ in document $k$: $$\text{TF-IDF}(w, k) = \frac{\text{count of } w \text{ in } k}{\text{total tokens in } k} \times \log\left(\frac{\text{number of documents in } D}{\text{number of documents where } w \text{ appears}}\right)$$
The formula up-weights relatively rare words that do not appear in all documents
- These words are probably more distinctive of topics or differences between documents
scikit-learn’s TfidfVectorizer¶
# the tf-idf vectorizer up-weights rare/distinctive words
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.001,
                        max_df=0.9,
                        max_features=1000,
                        stop_words='english',
                        use_idf=True,        # the new piece
                        ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['text'])
#pd.to_pickle(X_tfidf,'X_tfidf.pkl')
X_tfidf
<11314x1000 sparse matrix of type '<class 'numpy.float64'>' with 521387 stored elements in Compressed Sparse Row format>
Measures of document distances¶
In economics, we often want to compare documents (broadly defined) to one another
- For instance, how close is a political speech to that of the party leader?
Now, we focus on methods designed to measure document distance/proximity
Almost all content from this lecture can be framed as measuring document distance in some way
- all "text representations" can be used to measure document distance
Document-term matrix¶
The document-term matrix $\mathbf{X}$ is a matrix where
- Each row $d$ corresponds to a document
- Each column corresponds to a term (word or token).
A matrix entry $\mathbf{X}_{[d,w]}$ quantifies the strength of association between a document $d$ and a word $w$,
- generally its count or frequency
| Document | Word1 | Word2 | Word3 | Word4 |
|---|---|---|---|---|
| Doc1 | 2 | 1 | 0 | 1 |
| Doc2 | 0 | 3 | 1 | 0 |
| Doc3 | 1 | 0 | 4 | 2 |
Each row $\mathbf{X}_{[d,:]}$ is a document vector: the distribution of that document over terms
- These vectors have a spatial interpretation
- $\rightarrow$ geometric distances between document vectors reflect semantic distances between documents in terms of shared terms
Each column $\mathbf{X}_{[:,w]}$ is a term vector: the distribution of that term over documents
- These vectors also have a spatial interpretation
- $\rightarrow$ geometric distances between term vectors reflect semantic distances between words in terms of showing up in the same documents
Cosine similarity¶
Each document is represented by
- a vector $\mathbf{x}_{d}$, e.g., token counts or TF-IDF frequencies
- Similar documents have similar vectors
Can measure similarity between documents $i$ and $j$ by the cosine of the angle between $\mathbf{x_i}$ and $\mathbf{x_j}$
- With perfectly collinear documents (that is, $\mathbf{x_i} = \alpha \mathbf{x_j}$ , $\alpha > 0$), $\cos(0) = 1$
- For orthogonal documents (no words in common), $\cos(\pi/2) = 0$
Cosine similarity is computable as the normalized dot product of the two vectors: $$\text{cosine similarity}(\mathbf{x_i}, \mathbf{x_j}) = \frac{\mathbf{x_i} \cdot \mathbf{x_j}}{||\mathbf{x_i}|| \cdot ||\mathbf{x_j}||}$$
# compute pair-wise similarities between all documents in the corpus
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(X[:100])
sim.shape
(100, 100)
sim[:4,:4]
array([[1. , 0.20384233, 0.15095711, 0.19219753], [0.20384233, 1. , 0.12569587, 0.1608558 ], [0.15095711, 0.12569587, 1. , 0.16531366], [0.19219753, 0.1608558 , 0.16531366, 1. ]])
# TF-IDF Similarity
tsim = cosine_similarity(X_tfidf[:100])
tsim[:4,:4]
array([[1. , 0.05129256, 0.08901433, 0.06064389], [0.05129256, 1. , 0.07497709, 0.03570566], [0.08901433, 0.07497709, 1. , 0.09077347], [0.06064389, 0.03570566, 0.09077347, 1. ]])
Clustering¶
k-means clustering¶
Method to partition the observations (documents) into $k$ clusters $ S_1, S_2, \ldots, S_k $:
- Each cluster is represented by its centroid $ \mu_i $
- Each document is assigned to the cluster with the closest centroid
- $k$ (number of clusters) is the only hyperparameter
Algorithm:
- Initialize cluster centroids randomly
- Shift them around to minimize the sum of within-cluster squared distances (features should be standardized): $$\arg\min_{S_1, \ldots, S_k}\sum_{i=1}^k\sum_{x \in S_i}||x - \mu_i||^2$$
- Repeat until convergence

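A minimal k-means sketch on the TF-IDF matrix from above; the number of clusters is chosen arbitrarily here for illustration.
# sketch: k-means on the TF-IDF document vectors
from sklearn.cluster import KMeans

km = KMeans(n_clusters=20, n_init=10, random_state=0)
clusters = km.fit_predict(X_tfidf)

# terms with the largest weight in each centroid, for the first 3 clusters
terms = tfidf.get_feature_names_out()
for i in range(3):
    top = km.cluster_centers_[i].argsort()[::-1][:8]
    print(i, [terms[j] for j in top])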
Other clustering algorithms¶
“k-medoid” clustering uses L1 distance rather than Euclidean distance
- Produces each cluster’s “medoid” (a central, actually observed data point) instead of its “centroid” (mean vector)
- Less sensitive to outliers
- The medoid can be used as a representative data point
DBSCAN defines clusters as continuous regions of high density
- Detects and excludes outliers automatically
Agglomerative (hierarchical) clustering makes nested clusters
Final Notes on $\mathbf{X}$¶
Each row $\mathbf{X}_{[d,:]}$ is a document vector: the distribution of that document over terms
Each column $\mathbf{X}_{[:,w]}$ is a term vector: the distribution of that term over documents
The same methods we used on the rows can be used on the columns:
- Apply cosine similarity to the columns to compare words (rather than documents)
- Apply $k$-means clustering to the columns to get clusters of similar words (rather than clusters of documents)
Topic models¶
Topic models¶
Summarize unstructured text
Use words within the document to infer the subject
Interpretable
A stylized example¶
A corpus of documents
- Doc 1: guns zombies biohazard win lose...
- Doc 2: player lose score survival...
- Doc 3: zombies survival congress...
- Doc 4: ...
- Doc 100000: congress welfare constitution guns...
What are the topics in these documents?
Zombies: guns, zombies, biohazard, survival
Sports: player, win, score, lose
Politics: welfare, congress, constitution, guns
How does it work?
Topic models¶
Topic models infer latent topics in the corpus:
- Documents as distributions over topics
- Topics as distributions over words
The number of topics $K$ is a hyperparameter chosen by the researcher.
In the original models, formally, $\mathbf{W}$ is decomposed into two matrices: $$\mathbf{W} \approx \mathbf{\Theta}\,\mathbf{B}^T$$ where $\mathbf{W}\in \mathbb{R}^{D\times V}$ is the document-term matrix, $\mathbf{\Theta}\in \mathbb{R}^{D\times K}$ is the document-topic matrix, and $\mathbf{B}\in \mathbb{R}^{V\times K}$ is the topic-term matrix
Latent Dirichlet Allocation (LDA)¶
The most popular topic model
Each document is a mixture of topics
Each topic is a mixture of words
The model is generative:
- For each document, draw a distribution over topics
- For each word position in the document, draw a topic from that distribution over topics
- Then draw the word from the chosen topic’s distribution over words
Using an LDA model¶
Once trained, one can easily get topic proportions for a corpus
For any document – doesn’t have to be in training corpus
The main topic is the highest-probability topic
Documents with the highest share in a topic work as representative documents for the topic
One can use the topic proportions as variables in a social science analysis
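A minimal LDA sketch with scikit-learn, using the count matrix X and the vectorizer vec from above; the number of topics is chosen arbitrarily for illustration.
# sketch: fit LDA and inspect the document-topic and topic-term matrices
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, random_state=0)
theta = lda.fit_transform(X)          # document-topic proportions (D x K)

# top words per topic (rows of the topic-term matrix B)
terms = vec.get_feature_names_out()
for k, topic in enumerate(lda.components_[:3]):
    print(k, [terms[i] for i in topic.argsort()[::-1][:8]])

theta[0].argmax()                     # main topic of the first document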
... to predict macroeconomic variables.¶

Word embeddings¶
Text classifiers¶
- produce $\hat y_i=f(\mathbf{x_i}, \hat\theta)$, a vector of predicted probabilities across classes for each document $i$
- $\hat y_i$ is a vector of class probabilities, i.e., a compressed representation of the text features $\mathbf{x_i}$
- $\mathbf{x_i}$: the vector of features is itself a compressed representation of the document
- the learned parameters $\hat\theta$ can be understood as a compressed representation of the data
- $\hat\theta$ contains information about the training corpus, the text features, and the outcomes
Limitations of bag-of-words representations¶
Until now, $\mathbf{x_i}$ has been a “bag-of-words” representation.
Bag-of-words representations disregard syntax
- “The cat ate the mouse.” versus “The mouse ate the cat.”
- $\rightarrow$ These two sentences have the same bag-of-words representation
Bag-of-words representations disregard semantic proximity between words
- “hi” and “hello” are completely distinct features for predicting whether a message is greeting somebody
- “economics” and “sociology” are distinct features for predicting whether a message is about the social sciences
Can we estimate text features that capture semantic proximity?¶
Word embeddings¶
- Fancy word, old concept
- Vector representation of a word (we have already seen count-vectorizer, tf-idf)
- What we mean by word embedding is that we are embedding a categorical entity into a vector space
Language in context (and vice-versa)¶
Neighboring words provide us with additional information to interpret a word’s meaning
In other words, word co-occurrences capture context
This information is useful for machine learning applications
- For example, document classification, machine translation, syntax prediction, machine comprehension, etc.
Best known word embeddings model: Word2Vec¶
Word2Vec reformulates learning word co-occurrences as two prediction tasks:
- Continuous Bag of Words (CBOW): Given its context words, predict a focus word
- Skipgram: Given a focus word, predict all its context words
In both cases, the model yields a low-dimensional, dense vector representation for each word in the vocabulary
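A minimal word2vec sketch with gensim (version 4 or later), trained on the simple-preprocessed tokens from earlier; the hyperparameters are illustrative only.
# sketch: train skipgram word2vec on the tokenized newsgroup posts
from gensim.models import Word2Vec

w2v = Word2Vec(sentences=df['tokens_simple'].tolist(),  # list of token lists
               vector_size=100,   # embedding dimension
               window=5,          # context window size
               min_count=5,       # drop rare words
               sg=1)              # 1 = skipgram, 0 = CBOW

w2v.wv.most_similar('car', topn=5)  # nearest words by cosine similarity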
Distance between texts¶
- With embeddings, we can use linear algebra to understand relationships between words
- In particular, words that are geometrically close to each other are similar
- The standard metric for comparing vectors is cosine similarity
$$\text{cosine similarity}(x_i, x_j) = \frac{x_i \cdot x_j}{||x_i|| \cdot ||x_j||}$$
- When vectors are normalized to unit length, cosine similarity is:
- Simply the dot product of the two vectors
- Monotonically related to the Euclidean distance (so you can use that, too)
Distance between texts¶

Visualizing embeddings¶
One can also visualize the resulting embedding space by projecting it on a two-dimensional space
Three commonly used techniques are:
- Principal Component Analysis (PCA)
- t-distributed stochastic neighbor embedding (t-SNE)
- Uniform Manifold Approximation and Projection (UMAP)
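A small sketch projecting a few word vectors to 2D with PCA, assuming the w2v model trained above; the word list is illustrative.
# sketch: 2D PCA projection of selected word vectors
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

words = [w for w in ['car', 'bike', 'god', 'government', 'game', 'team']
         if w in w2v.wv]                              # keep in-vocabulary words
coords = PCA(n_components=2).fit_transform([w2v.wv[w] for w in words])

plt.scatter(coords[:, 0], coords[:, 1])
for w, (x0, x1) in zip(words, coords):
    plt.annotate(w, (x0, x1))
plt.show()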
Basic arithmetic often carries meaning¶
Word2vec algebra can depict conceptual, analogical relationships between words.
- e.g., $\overrightarrow{\text{king}} - \overrightarrow{\text{man}} + \overrightarrow{\text{woman}} \approx \overrightarrow{\text{queen}}$

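The same analogy written as a query on the model trained above, provided all three words made it into the vocabulary; on a small corpus the result can be noisy, while large pre-trained vectors give the textbook answer.
# sketch: king - man + woman as a most_similar query
w2v.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)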
Some refinements¶
The main assumption behind word2vec is that context words are exchangeable
In other words, the ordering of words is not accounted for
Recent models relax this assumption; they are called transformers...
... and consistently outperform previous language models on various tasks
Pros and cons of embeddings¶
Pros:
- Many pre-trained models for different languages are freely available online
- Many packages to train models from scratch or fine-tune existing models to a specific corpus
- Often, they provide sizable gains in prediction accuracy
Cons:
- Clear loss of interpretability relative to bag-of-words
- Neighbouring words are not the only forms of context
- Often critiqued as “stochastic parrots” (Bender et al., 2021)