Apply PCA on the Word2Vec Embeddings | Analyze Climate_fever Dataset Using Word2Vec Embeddings

Jun 23, 2022 · 2 min read

Installing Datasets Library

!pip install datasets

Import Necessary Packages

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import pandas as pd
from datasets import load_dataset
import csv
import re
import nltk
from gensim.models import Word2Vec
import numpy as np
from sklearn.decomposition import PCA
from nltk.corpus import stopwords

# Load the `test` split of the climate_fever dataset.
dataset = load_dataset("climate_fever", split='test')

# Only the claim texts are used in the rest of the walkthrough.
text_data = dataset['claim']

# One-column frame of claims; no column name is given, so the column is
# addressed as df[0] further down.
df = pd.DataFrame(text_data)
df.head()

output:

Apply PCA to the Word2Vec embeddings. Note the train, val, and test sets when embedding with dimensionality-reduction methods.

from gensim.models import Word2Vec, FastText
import pandas as pd
# NOTE(review): this line was truncated to `from sklearn_model`, which is a
# syntax error; presumably the train/test split helper was meant — confirm.
from sklearn.model_selection import train_test_split
import re
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import numpy as np
import warnings
# Suppresses every warning, including deprecation notices from the libraries
# used below.
warnings.filterwarnings('ignore')

# Split sizes: 800 training, 100 test and 100 validation claims.
tx = 800
ts = 100
vl = 100

# Take the first tx + ts + vl claims and tokenize them. Word2Vec expects each
# sentence to be a list of tokens; a raw string would be iterated character by
# character and produce a character-level vocabulary.
sentences = [
    re.findall(r"\w+", claim.lower())
    for claim in df[0].iloc[:tx + ts + vl]
]

# Consecutive, non-overlapping slices. Slice ends are absolute positions, so
# they must be running totals rather than the size of each split
# (sentences[800:100] and sentences[100:100] are both empty).
test_end = tx + ts
val_end = test_end + vl
train, test, val = (
    sentences[:tx],
    sentences[tx:test_end],
    sentences[test_end:val_end],
)

# Train a 5-dimensional Word2Vec model on the training split.
# NOTE: `size=` and `wv.vocab` are the gensim 3.x spellings used throughout
# this file; gensim >= 4 renamed them to `vector_size=` and `wv.key_to_index`.
w2v = Word2Vec(train, min_count=1, size = 5)

print(w2v)

# Vocabulary entries, in the same order as the rows of X below, so they can
# be used to label the projected points.
words = list(w2v.wv.vocab)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[words]
len(X)

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

output:

--2021-03-09 09:52:42-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.50.46 Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.50.46|:443... connected. HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable The file is already fully retrieved; nothing to do.

!pip install eigpca

Use a scree plot to find the best dimensionality using the PCA subspace. In practice, people usually use either train or val sets for the scree plot but here, use the val set.

x = np.array(df[0])
# NOTE(review): `x` is not used again, and this reads the earlier matrix `X`
# rather than `x` — presumably `len(x)` was intended; confirm.
len(X)

# Alias the import so eigpca's class does not replace the scikit-learn `PCA`
# name that other cells call with `n_components=` and `fit_transform`.
from eigpca import PCA as EigPCA

# Word2Vec model trained on the validation split (gensim 3.x keyword `size`).
w2v = Word2Vec(val, min_count=1, size = 5)

print(w2v)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[w2v.wv.vocab]

pca = EigPCA()
pca.fit(X)

# Scree plots of the fitted decomposition: eigenvalues ("eig") and,
# presumably, proportion of variance ("pov") — see the eigpca README.
pca.plot(y="eig")
pca.plot(y="pov")

# Alias the import so eigpca's class does not replace the scikit-learn `PCA`
# name that other cells call with `n_components=` and `fit_transform`.
from eigpca import PCA as EigPCA

# Word2Vec model trained on the training split (gensim 3.x keyword `size`).
w2v = Word2Vec(train, min_count=1, size = 5)

print(w2v)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[w2v.wv.vocab]

pca = EigPCA()
pca.fit(X)

# Scree plots of the fitted decomposition: eigenvalues ("eig") and,
# presumably, proportion of variance ("pov") — see the eigpca README.
pca.plot(y="eig")
pca.plot(y="pov")

# Alias the import so eigpca's class does not replace the scikit-learn `PCA`
# name that other cells call with `n_components=` and `fit_transform`.
from eigpca import PCA as EigPCA

# Word2Vec model trained on the test split (gensim 3.x keyword `size`).
w2v = Word2Vec(test, min_count=1, size = 5)

print(w2v)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[w2v.wv.vocab]

pca = EigPCA()
pca.fit(X)

# Scree plots of the fitted decomposition: eigenvalues ("eig") and,
# presumably, proportion of variance ("pov") — see the eigpca README.
pca.plot(y="eig")
pca.plot(y="pov")

Output:

# The preceding cells import eigpca's class under the name `PCA`; re-import
# the scikit-learn class, which takes `n_components` and offers
# `fit_transform` as used below.
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec model trained on the test split (gensim 3.x keyword `size`).
w2v = Word2Vec(test, min_count=1, size = 5)

print(w2v)

# Vocabulary entries, in the same order as the rows of X.
words = list(w2v.wv.vocab)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[words]
len(X)

# Keep the first four principal components.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Create the default pairplot: one scatter panel per pair of components.
sns.pairplot(pd.DataFrame(result))

# NOTE(review): this cell repeats the previous pair plot on `test` verbatim;
# presumably one of the two was meant for `val` — confirm.

# Earlier cells import eigpca's class under the name `PCA`; re-import the
# scikit-learn class, which takes `n_components` and offers `fit_transform`
# as used below.
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec model trained on the test split (gensim 3.x keyword `size`).
w2v = Word2Vec(test, min_count=1, size = 5)

print(w2v)

# Vocabulary entries, in the same order as the rows of X.
words = list(w2v.wv.vocab)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[words]
len(X)

# Keep the first four principal components.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Create the default pairplot: one scatter panel per pair of components.
sns.pairplot(pd.DataFrame(result))

Visualize the first four dimensions of this subspace using a pairs plot. Discuss the PCA embedding applied to the Word2Vec embeddings.

# Use the scikit-learn PCA here (the scree-plot cells bind `PCA` to eigpca's
# class).
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec model trained on the training split (gensim 3.x keyword `size`).
w2v = Word2Vec(train, min_count=1, size = 5)

print(w2v)

# Vocabulary entries, in the same order as the rows of X.
words = list(w2v.wv.vocab)

# Look the vectors up through `w2v.wv`: indexing the model object itself
# (`w2v[...]`) is deprecated in gensim 3.x and removed in 4.0.
X = w2v.wv[words]
len(X)

# Keep the first four principal components.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Create the default pairplot: one scatter panel per pair of components.
sns.pairplot(pd.DataFrame(result))

Output:

Compare the PCA embeddings with Word2Vec embeddings. Use cosine similarity/dissimilarity for comparisons and discussions.

# Pairwise cosine similarity between PCA-space rows and the original
# Word2Vec rows.
from sklearn.metrics.pairwise import cosine_similarity

# Five components requested; assumes X still holds the 5-dimensional vectors
# from the preceding cell, so no dimension is dropped — confirm.
p = PCA(n_components=5)
pcaEM = p.fit_transform(X)

# Entry (i, j) compares row i of pcaEM with row j of X.
cosine_similarity(pcaEM, Y=X)

Output:

array([[ 0.47246924, 0.5533322 , 0.69968647, ..., 0.3520948 , 0.2265005 , 0.39633623], [-0.18453637, -0.37230664, -0.21567692, ..., -0.07461087, -0.43031302, 0.04274582], [ 0.3912312 , 0.12304249, 0.19217059, ..., 0.49092332, 0.21944408, 0.5939499 ], ..., [-0.6996586 , -0.7926788 , -0.89047426, ..., -0.3704218 , -0.3495899 , -0.6576486 ], [-0.7633835 , -0.85069066, -0.927511 , ..., -0.39605126, -0.42877147, -0.72114724], [-0.6238885 , -0.75254095, -0.8371487 , ..., -0.2576178 , -0.2643652 , -0.57529074]], dtype=float32)