Installing Datasets Library
# Install the `datasets` package that provides load_dataset (IPython/Colab shell escape).
!pip install datasets
Import Necessary Packages
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import pandas as pd
from datasets import load_dataset
import csv
import re
import nltk
from gensim.models import Word2Vec
import numpy as np
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
# Fetch the test split of the "climate_fever" dataset and pull out its claims.
dataset = load_dataset("climate_fever", split='test')
text_data = dataset['claim']

# One-column frame of the claim texts; later cells read it back as df[0].
df = pd.DataFrame(text_data)
df.head()
output:
Apply PCA to the Word2Vec embeddings. Note how the train, val, and test sets are used when embedding with dimensionality reduction methods.
# Libraries used by the PCA-on-Word2Vec section.
from gensim.models import Word2Vec, FastText
import pandas as pd
# NOTE(review): the original line here read `from sklearn_model`, a truncated
# import that is a SyntaxError as written. The intended module cannot be
# recovered from this file; restore the complete import if it is still needed.
import re
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import numpy as np
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Keep at most the first 1000 claims and tokenize each one on whitespace.
# Word2Vec expects every sentence to be a list of tokens; a raw string would
# be iterated character by character and yield a character-level vocabulary.
sentences = [claim.split() for claim in df[0].iloc[:1000]]
c = len(sentences)  # number of sentences actually collected

# Sizes of the train / test / validation partitions.
tx = 800
ts = 100
vl = 100

# Turn the sizes into cumulative boundaries so the three partitions are
# consecutive and non-overlapping. Slicing with the raw sizes
# (sentences[800:100], sentences[100:100]) left test and val empty.
test_end = tx + ts
val_end = test_end + vl
train = sentences[:tx]
test = sentences[tx:test_end]
val = sentences[test_end:val_end]
# Fit a 5-dimensional Word2Vec model on the training split.
# NOTE(review): `size=` and `wv.vocab` are gensim 3.x spellings; confirm the
# installed gensim version before upgrading.
w2v = Word2Vec(train, min_count=1, size=5)
print(w2v)

# One embedding row per vocabulary entry, in vocabulary order.
words = list(w2v.wv.vocab)
X = w2v.wv[words]
len(X)

# Project the embeddings onto their first two principal components.
# (No plot is drawn in this cell; `result` and `words` are left for later use.)
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# Download the GoogleNews-vectors-negative300 archive (IPython shell escape).
# `-c` continues a partial download instead of starting over.
# NOTE(review): no code visible in this file loads the archive; confirm it is still needed.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
output:
--2021-03-09 09:52:42-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.50.46 Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.50.46|:443... connected. HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable The file is already fully retrieved; nothing to do.
# Install eigpca, whose PCA class is used for the plots below (IPython shell escape).
!pip install eigpca
Use a scree plot to find the best dimensionality using the PCA subspace. In practice, people usually use either train or val sets for the scree plot but here, use the val set.
# Column 0 of df (the claim texts) as a NumPy array.
x = np.array(df[0])
len(X)

# Import eigpca's PCA under an alias. Its class has the same name as
# sklearn.decomposition.PCA, and a plain `from eigpca import PCA` rebinds the
# module-level `PCA`, so later `PCA(n_components=...)` calls (the scikit-learn
# signature) would hit the wrong class.
from eigpca import PCA as EigPCA

# Word2Vec embeddings of the validation split, one row per vocabulary entry.
w2v = Word2Vec(val, min_count=1, size=5)
print(w2v)
X = w2v.wv[list(w2v.wv.vocab)]

# Fit eigpca and draw its two diagnostic plots.
# Presumably "eig" plots eigenvalues (scree plot) and "pov" the proportion of
# variance; confirm against the eigpca documentation.
pca = EigPCA()
pca.fit(X)
pca.plot(y="eig")
pca.plot(y="pov")
# Import eigpca's PCA under an alias so it does not rebind the module-level
# `PCA`, which other cells call with the scikit-learn signature
# `PCA(n_components=...)`.
from eigpca import PCA as EigPCA

# Word2Vec embeddings of the training split, one row per vocabulary entry.
w2v = Word2Vec(train, min_count=1, size=5)
print(w2v)
X = w2v.wv[list(w2v.wv.vocab)]

# Fit eigpca and draw the "eig" and "pov" plots for the training split.
pca = EigPCA()
pca.fit(X)
pca.plot(y="eig")
pca.plot(y="pov")
# Import eigpca's PCA under an alias so it does not rebind the module-level
# `PCA`, which the following cells call with the scikit-learn signature
# `PCA(n_components=...)`.
from eigpca import PCA as EigPCA

# Word2Vec embeddings of the test split, one row per vocabulary entry.
w2v = Word2Vec(test, min_count=1, size=5)
print(w2v)
X = w2v.wv[list(w2v.wv.vocab)]

# Fit eigpca and draw the "eig" and "pov" plots for the test split.
pca = EigPCA()
pca.fit(X)
pca.plot(y="eig")
pca.plot(y="pov")
Output:
# Bind PCA to scikit-learn's class explicitly. Earlier cells run
# `from eigpca import PCA`, and this cell needs the scikit-learn API
# (`n_components=` in the constructor, `fit_transform`).
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec embeddings of the test split, one row per vocabulary entry.
w2v = Word2Vec(test, min_count=1, size=5)
print(w2v)
words = list(w2v.wv.vocab)
X = w2v.wv[words]
len(X)

# Keep the first four principal components of the 5-dimensional embeddings.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Pairs plot: every component plotted against every other component.
sns.pairplot(pd.DataFrame(result))
# NOTE(review): this cell is identical to the test-split pair plot directly
# before it; possibly it was meant to use `val`. Confirm with the author.

# Bind PCA to scikit-learn's class explicitly. Earlier cells run
# `from eigpca import PCA`, and this cell needs the scikit-learn API
# (`n_components=` in the constructor, `fit_transform`).
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec embeddings of the test split, one row per vocabulary entry.
w2v = Word2Vec(test, min_count=1, size=5)
print(w2v)
words = list(w2v.wv.vocab)
X = w2v.wv[words]
len(X)

# Keep the first four principal components of the 5-dimensional embeddings.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Pairs plot: every component plotted against every other component.
sns.pairplot(pd.DataFrame(result))
Visualize the first four dimensions of this subspace using a pairs plot. Discuss the PCA embedding applied to the Word2Vec embeddings.
from sklearn.decomposition import PCA
# Seaborn visualization library
import seaborn as sns

# Word2Vec embeddings of the training split, one row per vocabulary entry.
w2v = Word2Vec(train, min_count=1, size=5)
print(w2v)
words = list(w2v.wv.vocab)
X = w2v.wv[words]
len(X)

# Reduce the 5-dimensional embeddings to their first four principal components.
pca = PCA(n_components=4)
result = pca.fit_transform(X)

# Default pairplot over the four components.
projection = pd.DataFrame(result)
sns.pairplot(projection)
Output:
Compare the PCA embeddings with Word2Vec embeddings. Use cosine similarity/dissimilarity for comparisons and discussions.
from sklearn.metrics.pairwise import cosine_similarity

# Re-express the current embedding matrix X with a 5-component PCA.
p = PCA(n_components=5)
pcaEM = p.fit_transform(X)

# Pairwise cosine similarity: entry (i, j) compares PCA row i with Word2Vec row j.
cosine_similarity(pcaEM, X)
Output:
array([[ 0.47246924, 0.5533322 , 0.69968647, ..., 0.3520948 , 0.2265005 , 0.39633623], [-0.18453637, -0.37230664, -0.21567692, ..., -0.07461087, -0.43031302, 0.04274582], [ 0.3912312 , 0.12304249, 0.19217059, ..., 0.49092332, 0.21944408, 0.5939499 ], ..., [-0.6996586 , -0.7926788 , -0.89047426, ..., -0.3704218 , -0.3495899 , -0.6576486 ], [-0.7633835 , -0.85069066, -0.927511 , ..., -0.39605126, -0.42877147, -0.72114724], [-0.6238885 , -0.75254095, -0.8371487 , ..., -0.2576178 , -0.2643652 , -0.57529074]], dtype=float32)
Contact Us:
Hire us for help with any machine learning problem. We provide complete project assistance at affordable prices.
Comments