TF-IDF: Term Frequency-Inverse Document Frequency
Using Python and Scikit-Learn to analyze blogs in a MySQL database
Downloaded from a Jupyter notebook as a *.py file
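In a nutshell: a term's tf-idf weight rises with how often it appears in a document and falls with how many documents contain it, so words common to the whole corpus score low. A minimal sketch of the idea, using a toy corpus and names of my own rather than anything from the notebook below:

# --- illustrative tf-idf sketch, separate from the pipeline below ---
from sklearn.feature_extraction.text import TfidfVectorizer
toy_docs = ["the cat sat", "the dog sat", "the cat ran"]
vec = TfidfVectorizer()
scores = vec.fit_transform(toy_docs)
# 'the' occurs in every document, so it earns the lowest weight;
# rare words like 'dog' and 'ran' dominate the documents containing them
for term, col in sorted(vec.vocabulary_.items()):
    print(term, round(scores[0, col], 3))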
import mysql.connector
import re
import pandas as pd
import numpy as np
# dict to group blogs by keywords
from collections import defaultdict
d = defaultdict(list)
# select every blog, excluding terse verse, znotes, and front matter
plot_title = "Non-Terse-Verse Philadelphia Reflections"
query = '''SELECT title, blog_contents, table_key
FROM individual_reflections
WHERE LOWER(title) NOT LIKE '%terse%' AND
LOWER(title) NOT LIKE '%znote%' AND
LOWER(title) NOT LIKE '%front stuff%' '''
def remove_html_tags(text):
    """Strip HTML tags and normalize whitespace, case, and punctuation."""
    clean = re.compile('<.*?>')         # non-greedy: matches individual tags
    text = re.sub(clean, ' ', text)     # drop the tags themselves
    text = text.replace('\n', ' ').replace('\r', '')
    text = ' '.join(text.split())       # collapse runs of whitespace
    text = text.lower()
    text = re.sub(r"(\d|\W)+", " ", text)  # strip digits and punctuation
    return text.strip()
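# Quick check of the cleaner on a made-up snippet (illustrative only):
# remove_html_tags('<p>Hello, <b>World</b>!\n42</p>')  -->  'hello world'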
mydb = mysql.connector.connect(
    host="",      # credentials intentionally left blank
    user="",
    passwd="",
    database=''
)
cursor = mydb.cursor()
cursor.execute(query)
rows = []
for (title, blog_contents, table_key) in cursor:
    blog_contents = remove_html_tags(blog_contents)
    title = remove_html_tags(title)
    rows.append({'table_key': table_key, 'title': title, 'body': blog_contents})
df_idf = pd.DataFrame(rows)
df_idf['text'] = df_idf['title'] + " " + df_idf['body']
cursor.close()
mydb.close()
df_idf.head()
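# Aside: the cursor loop above could likely be collapsed into a single
# df_idf = pd.read_sql(query, mydb) followed by .apply(remove_html_tags)
# on the text columns. pandas officially supports SQLAlchemy connections
# and warns on raw DBAPI ones like mysql.connector's, so treat that as a
# sketch, not a drop-in replacement.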
from sklearn.feature_extraction.text import CountVectorizer
# get the text column
docs = df_idf['text'].tolist()
# create a vocabulary of words,
# ignore words that appear in 85% of documents,
# eliminate stop words
cv = CountVectorizer(max_df=0.85, stop_words='english', max_features=10000)
word_count_vector = cv.fit_transform(docs)
# peek at ten of the learned vocabulary terms (dict order, not importance)
list(cv.vocabulary_.keys())[:10]
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
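# With smooth_idf=True, scikit-learn computes
#     idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# so no term gets a zero or infinite weight. A quick, purely illustrative
# way to see which terms the corpus treats as near-filler:
idf_weights = pd.Series(tfidf_transformer.idf_, index=cv.get_feature_names_out())
print(idf_weights.sort_values().head(10))  # lowest idf = most widespread terms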
def sort_coo(coo_matrix):
    """Sort a sparse COO row by score (descending), then by column index."""
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items."""
    # use only the top n items from the vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # keep track of the feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    # build a dict of feature -> score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results
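# A quick check of the two helpers on a single document before the main
# loop (document 0 is an arbitrary, illustrative choice):
# vec = tfidf_transformer.transform(cv.transform([docs[0]]))
# extract_topn_from_vector(cv.get_feature_names_out(), sort_coo(vec.tocoo()), 5)
# --> dict of that document's five highest-scoring terms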
# mapping of column index to term
feature_names = cv.get_feature_names_out()  # cv.get_feature_names() on scikit-learn < 1.0
# human-readable list of keywords for each blog
myfile = open('.../tf-idf.txt', 'w')
# explode-able/split-able list of keywords for each blog
kwfile = open('.../kwds.txt', 'w')
header = '''
==========================================
10 most-significant keywords for each blog
=========================================='''
print(header)
myfile.write(header+"\n")
for i in range(len(docs)):
    # get the document that we want to extract keywords from
    doc = docs[i]
    # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    # sort the tf-idf vector by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    # extract only the top n; n here is 10
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)
    # now print the results
    print("\n=====Title=====")
    myfile.write("\n=====Title=====\n")
    print(df_idf.iloc[i]['table_key'], df_idf.iloc[i]['title'])
    myfile.write("{} {}\n".format(df_idf.iloc[i]['table_key'], df_idf.iloc[i]['title']))
    kwds = str(df_idf.iloc[i]['table_key']) + '|'
    print("\n===Keywords===")
    myfile.write("\n===Keywords===\n")
    for k in keywords:
        print(k, keywords[k])
        # remember which blogs use this keyword, and how strongly
        d[k].append((df_idf.iloc[i]['table_key'],
                     keywords[k],
                     df_idf.iloc[i]['title']))
        myfile.write("{} {}\n".format(k, keywords[k]))
        kwds += k + ','
    kwfile.write(kwds[:-1] + "\n")
myfile.close()
kwfile.close()
import operator
word_list = cv.get_feature_names_out()  # cv.get_feature_names() on scikit-learn < 1.0
# sum the raw counts for each vocabulary word across all documents
# (summing the sparse matrix directly avoids densifying the whole thing)
count_list = np.asarray(word_count_vector.sum(axis=0)).ravel()
word_freq = sorted(dict(zip(word_list, count_list)).items(), key=operator.itemgetter(1), reverse=True)
x = []
y = []
for word, freq in word_freq[:25]:
    x.append(word)
    y.append(freq)
from matplotlib import pyplot as plt
plt.figure(figsize=(15, 15))
plt.plot(x, y)
plt.title("Word Frequency in " + plot_title, fontsize=18)
plt.xticks(rotation=45, fontsize=18)
plt.yticks(fontsize=18)
plt.grid()
plt.savefig('.../Word_Frequency.png')  # save before show(), which clears the figure
plt.show()
# Group blogs by keywords
myfile = open('.../Keyword_Popularity.txt', 'w')
header = '''
==================================================
Most-Frequent Keywords with the blogs they specify
==================================================
(blog key, keyword importance, blog title)
..................................................\n'''
print(header)
myfile.write(header+"\n")
x = []
y = []
for k in sorted(d, key=lambda k: len(d[k]), reverse=True):
    if len(d[k]) == 1:
        break  # the list is sorted, so everything after this is a one-blog keyword
    x.append(k)
    y.append(len(d[k]))
    print(len(d[k]), "blogs contain the keyword", "'" + k + "'")
    myfile.write("{} blogs contain the keyword '{}'\n".format(len(d[k]), k))
    print('---------------------------------------------')
    myfile.write("---------------------------------------------\n")
    # list the blogs for this keyword, strongest tf-idf score first
    for t in sorted(d[k], key=lambda tup: (-tup[1], tup[2])):
        print(t)
        myfile.write("{}\n".format(t))
    print('*****\n')
    myfile.write('*****\n\n')
myfile.close()
num = 50
plt.figure(figsize=(15, 15))
plt.plot(x[:num], y[:num])
plt.title(str(num) + " Most Popular Keywords in " + plot_title, fontsize=18)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=18)
plt.ylabel("Number of blogs with this keyword", fontsize=18)
plt.grid()
plt.savefig('.../Keyword_Popularity.png')
plt.show()
Originally published: Monday, October 07, 2019; most-recently modified: Monday, October 07, 2019