import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from google.colab import drive
drive.mount('/content/drive')
stats_data = pd.read_csv('/content/drive/My Drive/krishna/my/Sheet_1_Full_Data_data.csv')
stats_data.head()
data = pd.read_csv('/content/drive/My Drive/krishna/my/memegenerator.csv')
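# Two inputs: Sheet_1_Full_Data_data.csv holds country-level cyberbullying
# survey percentages (plotted first), and memegenerator.csv holds the meme
# metadata whose 'Alternate Text' captions are analysed below.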
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plt.figure(figsize =(15,5))
stats_plot = sb.barplot(x=stats_data['Country'].head(10),
                        y=stats_data['% Yes (of any degree)'].head(10),
                        color='blue')
plt.xlabel('Countries')
plt.ylabel('Percentage')
plt.title('Top ten countries with the highest reported percentage of cyberbullying')
for p in stats_plot.patches:
    stats_plot.annotate('{:.0f}%'.format(p.get_height()),
                        (p.get_x() + 0.3, p.get_height()),
                        ha='center', va='bottom', color='black')
plt.show()
data.head()
data.info()
data.isnull().sum()
kk = data.drop(['Archived URL','Meme Page URL','MD5 Hash','File Size (In Bytes)'],axis=1)
kk.head()
kk.info()
sb.heatmap(kk.isnull(),cbar=True)
plt.show()
kk.isnull().sum()
kk.dropna(inplace=True)
kk.isnull().sum()
# Rows whose alt text is a single alphanumeric string (no spaces or
# punctuation) carry no usable caption, so find and drop them
drop_data1 = kk[kk['Alternate Text'].str.isalnum()]
drop_data1.count()
kk = kk.drop(drop_data1.index)
# Drop the remaining unwanted numeric alt-text rows (indices found by inspection)
kk = kk.drop([427, 644, 646, 659, 730, 820, 1270, 1271, 1553, 2814, 3765, 3787, 4342, 5401,
              5970, 6360, 6429, 6741, 7571, 8997, 9932, 10222, 10708, 10916, 11076, 11271,
              11721, 12020, 12224, 12544, 12924, 13175, 13458, 13490, 13644, 13645, 14164,
              14708, 14834, 14895, 14959, 15125, 15179, 15187])
newdf = kk.copy()  # copy so later column additions do not alias kk
newdf.count()
newdf.head(10)
import nltk
from wordcloud import WordCloud
ll = " ".join(newdf['Alternate Text'].values)
word_cloud = WordCloud().generate(ll)
word_cloud = WordCloud(width=800,height=800,
background_color='white',
max_words=150).\
generate(ll)
plt.figure(figsize=[8,8])
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
from tqdm import tqdm
!pip install googletrans
from googletrans import Translator
translator = Translator(service_urls=['translate.google.com'])
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
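# punkt backs word_tokenize, stopwords provides the multilingual stop-word
# lists, and wordnet backs the WordNetLemmatizer used later.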
all_terms = word_tokenize(ll.lower())
stop_nltk = stopwords.words(['hungarian','swedish','kazakh','norwegian', \
'finnish','arabic','indonesian','portuguese','turkish', \
'azerbaijani','slovene','spanish','danish','nepali','romanian','greek',\
'dutch','tajik','german','english','russian','french','italian'])
stop_updated = stop_nltk + ["...","..","n't","got","memegenerator net","do not","get"]
kll = [term for term in all_terms \
if term not in stop_updated and term
not in list(punctuation) and len(term)>2]
u = []  # tokens matching the watermark string
k = []  # all other tokens
for i in kll:
    # note: word_tokenize yields single words, so the two-word string
    # "memegenerator net" never matches and u stays empty
    if i == "memegenerator net":
        u.append(i)
    else:
        k.append(i)
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","n't":"not",
"can't": "cannot","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not",
"couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
"don't": "do not","dont": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would", "got" : "get",
"he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
"how'd": "how did","how'd'y": "how do you","how'll": "how will",
"I'd": "I would", "I'd've": "I would have","I'll": "I will",
"I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
"it'd": "it would","it'd've": "it would have","it'll": "it will",
"it'll've": "it will have", "let's": "let us","ma'am": "madam",
"mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not",
"mustn't've": "must not have", "needn't": "need not",
"needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
"she'll": "she will", "she'll've": "she will have","should've": "should have",
"shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
"that'd": "that would","that'd've": "that would have", "there'd": "there would",
"there'd've": "there would have", "they'd": "they would",
"they'd've": "they would have","they'll": "they will",
"they'll've": "they will have", "they're": "they are","they've": "they have",
"to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have",
"we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
"what'll've": "what will have","what're": "what are", "what've": "what have",
"when've": "when have","where'd": "where did", "where've": "where have",
"who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not",
"won't've": "will not have", "would've": "would have","wouldn't": "would not",
"wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
"y'all'd've": "you all would have","y'all're": "you all are",
"y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
"you'll": "you will","you'll've": "you will have", "you're": "you are",
"you've": "you have"}
# Regular expression for finding contractions
import re
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))
# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)
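# Quick sanity check (a hedged example; the regex is built from the lowercase
# dictionary keys, so it only matches lowercase input):
# expand_contractions("you're sure it can't work") -> "you are sure it cannot work"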
# Expand contractions in the alt text, then strip tokens containing digits.
# (clean_text must be created from 'Alternate Text' here, before it is used.)
newdf['clean_text'] = newdf['Alternate Text'].apply(expand_contractions)
newdf['clean_text'] = newdf['clean_text'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
def clean_txt(sent):
    sk = []
    toks = []
    tokens = word_tokenize(sent.lower())
    for tks in tokens:
        if tks == "memegenerator net":  # see note above: never matches single tokens
            sk.append(tks)
        else:
            toks.append(tks)
    lemmed = [lemm.lemmatize(term) for term in toks
              if term not in stop_updated
              and term not in list(punctuation) and len(term) > 2]
    return " ".join(lemmed)
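# Hedged example of what clean_txt produces:
# clean_txt("The cats are running!!!") -> "cat running"
# ("the"/"are" are stop words, "!" is punctuation, and "cats" lemmatises to "cat")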
newdf['clean_text'] = newdf['clean_text'].apply(clean_txt)  # keep the contraction expansion done above
newdf.head(10)
bb = " ".join(newdf['clean_text'].values)
word_cloud = WordCloud(width=800, height=800,
                       background_color='black',
                       max_words=200).generate(bb)
plt.figure(figsize=[8,8])
plt.imshow(word_cloud, interpolation='bilinear')
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vectorizer = CountVectorizer(max_features=2000)
Input_unigram = unigram_count_vectorizer.fit_transform(newdf['clean_text'])
DTM_unigram = pd.DataFrame(Input_unigram.toarray(),
                           columns=unigram_count_vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
DTM_sum_uni = DTM_unigram.sum().sort_values(ascending=False)
temp2 = DTM_sum_uni.reset_index()
temp2.columns = ['clean_text','count']
temp2.head(5)
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plt.figure(figsize =(15,5))
plot = sb.barplot(x=DTM_sum_uni.head(25).index, y=DTM_sum_uni.head(25).values)
plt.xticks(rotation=90)
plt.xlabel('Top 25 unigram words')
plt.ylabel('Frequency')
plt.title('Top 25 frequently used unigram words in memes')
for p in plot.patches:
    plot.annotate('{:.0f}'.format(p.get_height()),
                  (p.get_x() + 0.3, p.get_height()),
                  ha='center', va='bottom', color='black')
plt.show()
bigram_count_vectorizer = CountVectorizer(ngram_range=(2,2),max_features=200)
Input_bigram = bigram_count_vectorizer.fit_transform(newdf['clean_text'])
DTM_bigram = pd.DataFrame(Input_bigram.toarray(),
                          columns=bigram_count_vectorizer.get_feature_names_out())
DTM_bigram.head()
DTM_sum = DTM_bigram.sum().sort_values(ascending=False)
temp1 = DTM_sum.head(5).reset_index()
temp1.columns = ['clean_text','count']
temp1.head(5)
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plt.figure(figsize =(15,5))
plot = sb.barplot(x=DTM_sum.head(25).index, y=DTM_sum.head(25).values)
plt.xticks(rotation=90)
plt.xlabel('Top 25 bigram words')
plt.ylabel('Frequency')
plt.title('Top 25 frequently used bigram words in memes')
for p in plot.patches:
    plot.annotate('{:.0f}'.format(p.get_height()),
                  (p.get_x() + 0.3, p.get_height()),
                  ha='center', va='bottom', color='black')
plt.show()
trigram_count_vectorizer = CountVectorizer(ngram_range=(3,3),max_features=200)
Input_trigram= trigram_count_vectorizer.fit_transform(newdf['clean_text'])
DTM_trigram = pd.DataFrame(Input_trigram.toarray(),
                           columns=trigram_count_vectorizer.get_feature_names_out())
DTM_trigram.head()
DTM_sum_tri = DTM_trigram.sum().sort_values(ascending=False)
temp2 = DTM_sum_tri.reset_index()
temp2.columns = ['clean_text', 'count']
temp2.head()
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plt.figure(figsize =(15,5))
plot = sb.barplot(x=DTM_sum_tri.head(25).index, y=DTM_sum_tri.head(25).values)
plt.xticks(rotation=90)
plt.xlabel('Top 25 trigram words')
plt.ylabel('Frequency')
plt.title('Top 25 frequently used trigram words in memes')
for p in plot.patches:
    plot.annotate('{:.0f}'.format(p.get_height()),
                  (p.get_x() + 0.3, p.get_height()),
                  ha='center', va='bottom', color='black')
plt.show()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score,confusion_matrix
nltk.download('vader_lexicon')
analyser = SentimentIntensityAnalyzer()
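# VADER's compound score is normalised to [-1, 1]; captions scoring >= 0.01
# are labelled Positive, <= -0.01 Negative, and anything in between Neutral.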
def vader_class_prediction(text):
    sentiment_dict = analyser.polarity_scores(text)
    if sentiment_dict['compound'] >= 0.01:
        vader_class = "Positive"
    elif sentiment_dict['compound'] <= -0.01:
        vader_class = "Negative"
    else:
        vader_class = "Neutral"
    return vader_class
def get_vader_sentiment_score(text):
    return analyser.polarity_scores(text)['compound']
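# polarity_scores returns the keys 'neg', 'neu', 'pos' and 'compound';
# only the aggregated 'compound' value is used for classification here.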
newdf['vader_score'] = newdf['Alternate Text'].apply(get_vader_sentiment_score)
newdf['prediction'] = newdf['Alternate Text'].apply(vader_class_prediction)
newdf.head(10)
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plt.figure(figsize =(15,5))
plt.hist(newdf['vader_score'])
plt.xlabel('VADER compound score')
plt.ylabel('Frequency')
plt.show()
c_data = newdf.prediction.value_counts()
c_data1 = c_data.reset_index()
c_data1.columns = ['predicted classes', 'count']
c_data1
explode = (0.1, 0.0, 0.2)
colors = ("cyan", "red", "green")
wp = {'linewidth': 1, 'edgecolor': "black"}

def func(pct, allvalues):
    # convert the wedge percentage back into an absolute count
    absolute = int(pct / 100. * np.sum(allvalues))
    return "{:.1f}%\n({:d})".format(pct, absolute)
# Creating plot
fig, ax = plt.subplots(figsize=(10, 7))
wedges, texts, autotexts = ax.pie(c_data,
                                  autopct=lambda pct: func(pct, c_data),
                                  explode=explode,
                                  labels=c_data.index,
                                  shadow=True,
                                  colors=colors,
                                  startangle=90,
                                  wedgeprops=wp,
                                  textprops=dict(color="black"))
ax.legend(wedges, c_data.index,  # legend should show the class names, not the counts
          title="Predicted classes",
          loc="center left",
          bbox_to_anchor=(1.1, 0, 0.5, 1.5))
plt.setp(autotexts, size = 12, weight ="bold")
ax.set_title("Multi-class classification using Sentiment analysis")
plt.show()
# Naive Bayes classification
from sklearn.preprocessing import LabelEncoder
x1 = LabelEncoder()
newdf['encoded_classes'] = x1.fit_transform(newdf['prediction'])
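# LabelEncoder assigns codes in sorted label order, so the three VADER classes
# should map as Negative -> 0, Neutral -> 1, Positive -> 2 (check x1.classes_).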
newdf.head(10)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer_bi = TfidfVectorizer(ngram_range=(2,2), max_features=200)
Input_bigram= tfidf_vectorizer_bi.fit_transform(newdf['clean_text'])
DTM_bigram = pd.DataFrame(Input_bigram.toarray(),
                          columns=tfidf_vectorizer_bi.get_feature_names_out())
DTM_bigram.head()
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import metrics
X = newdf.clean_text.values
target = newdf.encoded_classes.values
X_train, X_test, y_train, y_test = train_test_split(X, target, train_size = 0.75,random_state=42)
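# 75/25 train/test split; the split is not stratified, so class proportions
# in train and test may drift slightly from the full data.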
tfidf_vectorizer_uni = TfidfVectorizer( max_features=5000)
X_train_tfidf_uni = tfidf_vectorizer_uni.fit_transform(X_train)
X_test_tfidf_uni = tfidf_vectorizer_uni.transform(X_test)
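# MultinomialNB models per-class feature weights and is a standard baseline
# for TF-IDF text features, even though TF-IDF values are not true counts.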
classifier = MultinomialNB()
classifier.fit(X_train_tfidf_uni, y_train)
target_prediction_uni = classifier.predict(X_test_tfidf_uni)
print("Confusion matrix:", confusion_matrix(y_test, target_prediction_uni), sep="\n")
print("Accuracy of unigram TF-IDF vector:", accuracy_score(y_test, target_prediction_uni) * 100)
pd.Series(target_prediction_uni).head(10)
print(classification_report(y_test, target_prediction_uni))
tfidf_vectorizer_bi = TfidfVectorizer(ngram_range=(2,2), max_features=5000)
X_train_tfidf = tfidf_vectorizer_bi.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer_bi.transform(X_test)
print(X_train_tfidf.shape,X_test_tfidf.shape)
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)
target_prediction = classifier.predict(X_test_tfidf)
print("Confusion matrix:", confusion_matrix(y_test, target_prediction), sep="\n")
print("Accuracy of bigram TF-IDF vector:", accuracy_score(y_test, target_prediction) * 100)
print(classification_report(y_test, target_prediction))
tfidf_vectorizer_tri = TfidfVectorizer(ngram_range=(3,3), max_features=5000)
X_train_tfidf_tri = tfidf_vectorizer_tri.fit_transform(X_train)
X_test_tfidf_tri = tfidf_vectorizer_tri.transform(X_test)
print(X_train_tfidf_tri.shape, X_test_tfidf_tri.shape)
classifier = MultinomialNB()
classifier.fit(X_train_tfidf_tri, y_train)
target_prediction_tri = classifier.predict(X_test_tfidf_tri)
print("Confusion matrix:", confusion_matrix(y_test, target_prediction_tri), sep="\n")
print("Accuracy of trigram TF-IDF vector:", accuracy_score(y_test, target_prediction_tri) * 100)
print(classification_report(y_test, target_prediction_tri))
p = pd.DataFrame([target_prediction_uni, target_prediction, target_prediction_tri]).T
p.columns = ['1-gram', '2-gram', '3-gram']
p.groupby(['1-gram']).count()
X1 = newdf.clean_text.values
target1 = newdf.encoded_classes.values
X_train1, X_test1, y_train1, y_test1 = \
train_test_split(X1,target1, test_size = 0.20,random_state=42)
tfidf_vectorizer_uni = TfidfVectorizer(max_features=3000)
X_train_tfidf_uni = tfidf_vectorizer_uni.fit_transform(X_train1)
X_test_tfidf_uni = tfidf_vectorizer_uni.transform(X_test1)
svclassifier = SVC(C=1.0, kernel='linear')  # degree/gamma only affect non-linear kernels, so they are dropped
svclassifier.fit(X_train_tfidf_uni, y_train1)
predictions_SVM_uni = svclassifier.predict(X_test_tfidf_uni)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_uni, y_test1)*100)
print("confusion matrix :", confusion_matrix(y_test1,predictions_SVM_uni),sep="\n")
print(classification_report(y_test1,predictions_SVM_uni))
tfidf_vectorizer_bi1 = TfidfVectorizer(ngram_range=(2,2), max_features=10000)
X_train_tfidf_bi1 = tfidf_vectorizer_bi1.fit_transform(X_train1)
X_test_tfidf_bi1 = tfidf_vectorizer_bi1.transform(X_test1)
svclassifier = SVC(C=1.0, kernel='linear')
svclassifier.fit(X_train_tfidf_bi1, y_train1)
predictions_SVM = svclassifier.predict(X_test_tfidf_bi1)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test1)*100)
print("confusion matrix :", confusion_matrix(y_test1,predictions_SVM),sep="\n")
print(classification_report(y_test1,predictions_SVM))
tfidf_vectorizer_3 = TfidfVectorizer(ngram_range=(3,3), max_features=10000)
X_train_tfidf_3 = tfidf_vectorizer_3.fit_transform(X_train1)
X_test_tfidf_3 = tfidf_vectorizer_3.transform(X_test1)
svclassifier = SVC(kernel='linear')  # SVC already imported above
svclassifier.fit(X_train_tfidf_3, y_train1)
predictions_SVM1 = svclassifier.predict(X_test_tfidf_3)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM1, y_test1)*100)
print("confusion matrix :", confusion_matrix(y_test1,predictions_SVM1),sep="\n")
print(classification_report(y_test1,predictions_SVM1))
Uni_NB = accuracy_score(y_test, target_prediction_uni) * 100
BI_NB = accuracy_score(y_test, target_prediction) * 100
Tri_NB = accuracy_score(y_test, target_prediction_tri) * 100
SVM_uni = accuracy_score(y_test1, predictions_SVM_uni) * 100
SVM_BI = accuracy_score(y_test1, predictions_SVM) * 100
SVM_TRI = accuracy_score(y_test1, predictions_SVM1) * 100
scores = [Uni_NB,BI_NB,Tri_NB,SVM_uni,SVM_BI,SVM_TRI]
names = ['unigram','bigram','trigram','unigram','bigram','trigram']
model_names = ['Naive Bayes', 'Naive Bayes', 'Naive Bayes', 'SVM', 'SVM', 'SVM']
accuracy_table = pd.DataFrame([model_names, names, scores]).T
accuracy_table.columns = ['model_name', 'N-gram', 'accuracy_score (%)']
accuracy_table['accuracy_score (%)'] = accuracy_table['accuracy_score (%)'].astype(float)
table = accuracy_table.pivot_table(index='model_name',
                                   columns='N-gram', values='accuracy_score (%)')
table
accuracy_table.groupby(['model_name'])['accuracy_score (%)'].mean()
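# Hedged usage note: table is indexed by model and columned by n-gram,
# e.g. table.loc['Naive Bayes', 'unigram'] gives that model's accuracy in %.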
from itertools import groupby
def add_line(ax, xpos, ypos):
    line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
                      transform=ax.transAxes, color='gray')
    line.set_clip_on(False)
    ax.add_line(line)

def label_len(my_index, level):
    labels = my_index.get_level_values(level)
    return [(k, sum(1 for i in g)) for k, g in groupby(labels)]

def label_group_bar_table(ax, df):
    ypos = -.1
    scale = 1. / df.index.size
    for level in range(df.index.nlevels)[::-1]:
        pos = 0
        for label, rpos in label_len(df.index, level):
            lxpos = (pos + .5 * rpos) * scale
            ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
            add_line(ax, pos * scale, ypos)
            pos += rpos
        add_line(ax, pos * scale, ypos)
        ypos -= .1
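# These helpers draw grouped category labels beneath a bar chart built from a
# MultiIndex. A hypothetical call (not part of the original flow) might be:
# grouped = accuracy_table.set_index(['model_name', 'N-gram'])['accuracy_score (%)']
# fig, ax = plt.subplots(figsize=(15, 10))
# grouped.plot(kind='bar', ax=ax)
# ax.set_xticklabels([])
# label_group_bar_table(ax, grouped.to_frame())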
COLOR = 'black'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR
plot = table.plot(kind='bar', figsize=(15, 10))  # pandas .plot creates its own figure, so pass figsize here
plt.xticks(rotation=0)
plt.xlabel('Models')
plt.ylabel('Accuracy(in %)')
plt.title('Comparative graph of accuracy scores of the different n-gram models')
plt.ylim(0,100)
plt.show()