from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import datetime as pydt
#set up the database connection
#this is a stats table for a wordpress plugin I use to post my podcast episodes
#This account only has SELECT privileges.
#note that a random number is being substituted in for each IP
engine = create_engine('mysql+pymysql://XXMYCONNECTIONSTRINGXX', echo=False)
conn = engine.connect().connection
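#(pd.read_sql also accepts the SQLAlchemy engine directly; the raw DBAPI connection grabbed here works the same for these read-only SELECTs)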
#create the pandas dataframe and populate with data from ssp stats table
dfListens = pd.read_sql("SELECT s1.id, s1.post_id, r.random_ip_sub, s1.referrer, s1.date FROM clz_ssp_stats s1 " \
"left join (select ip_Address, max(FLOOR(100000000 + (RAND() * 199999999))) " \
"random_ip_sub from clz_ssp_stats group by ip_Address) r ON s1.ip_address = r.ip_address "
, conn, index_col=['id'])
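#how the randomization works: RAND() runs per row, but MAX() collapses each group to a single value,
#so every distinct ip_address gets one random 9-digit substitute that is consistent across the join
#(note the substitutes are regenerated each time the query runs)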
#rename the columns
dfListens.columns = ['postnum','IP','method','timestamp']
#display new randomized IP as string without scientific notation
dfListens['IP'] = dfListens['IP'].astype(int).astype(str)
#display the row count and first 5 rows
print("rows in dataframe:",len(dfListens),"\n")
print(dfListens.head())
#add a column with the date from timestamp
dfListens['day'] = pd.to_datetime(dfListens['timestamp'], unit='s').dt.date
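#(the ssp stats date column appears to store Unix epoch seconds, hence unit='s')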
#create a column of episode names based on dictionary of post numbers
eps = {776:'Renee',807:'Will',841:'Safia',871:'Shlomo',896:'Sherman',921:'Clare',964:'Erin',
980:'Enda',1014:'Sebastian',1043:'Justin',1056:'Trey',1081:'Stephanie',1105:'DSLC',1118:'Debbie',
1159:'Anncmt',1204:'Election',1249:'Ed',1316:'Jasmine',1346:'David',1421:'Randy'}
dfListens['episode'] = dfListens['postnum'].replace(eps)
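#.replace keeps unmapped post numbers as-is; a .map-based alternative (sketch only, would flag unmapped posts explicitly):
#dfListens['episode'] = dfListens['postnum'].map(eps).fillna('unmapped')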
#set option to print all dataframe columns without wrapping
pd.set_option('expand_frame_repr', False)
#check that it's connecting to live data
print("Max date in dataframe: ",dfListens['day'].max(),"\n")
#show first 5 and last 20 rows after new columns added
print(dfListens.head(n=5),"\n")
print(dfListens.tail(n=20))
#group listens by method and display count
grpMethod = dfListens.groupby('method')
print(grpMethod['postnum'].count())
#some weird methods in there!
#truncate anything that starts with "download" to be just "download"
dfListens.loc[dfListens['method'].str.startswith('download'),'method'] = "download"
#and replace the blank methods with "unknown"
dfListens.loc[dfListens['method'] == '','method'] = "unknown"
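#the same clean-up in one pass with regex (a sketch; equivalent to the two .loc assignments above):
#dfListens['method'] = dfListens['method'].str.replace(r'^download.*', 'download', regex=True).replace('', 'unknown')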
#group by methods again and show counts, sorted
grpMethod = dfListens.groupby('method').count()
print(grpMethod['postnum'].sort_values())
#set plots to display inline in jupyter notebook
%matplotlib inline
#bar chart of listen methods
grpMethod['postnum'].sort_values().plot(kind='bar')
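#optional chart polish (a sketch; the title and label text are my own choices):
ax = grpMethod['postnum'].sort_values().plot(kind='bar')
ax.set_title('Listens by method')
ax.set_ylabel('listen count')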
#Which episodes are most popular?
#group by episode and show counts, sorted
grpEpisode = dfListens.groupby('episode').count()
print(grpEpisode['postnum'].sort_values())
#bar chart of episode listen counts
grpEpisode['postnum'].sort_values().plot(kind='bar', color='r')
#now let's group by IP and display counts
grpIP = dfListens.groupby('IP').count()
print(grpIP['postnum'].sort_values(ascending=False).head())
#let's take a look at the data for that top IP address. how many accesses per day?
print(dfListens.loc[dfListens['IP'] == '299900616',:].groupby(['day']).count().tail(n=10))
#I don't know how to do "get first row from each grouping" quickly in pandas yet, so for now I'll adjust the SQL query
#(a pandas-only version is sketched after this cell)
#...ugh, apparently it's an old version of MySQL without window functions like ROW_NUMBER(), so we'll self-join
dfListens2 = pd.read_sql("SELECT distinct s1.id, s1.post_id, r.random_ip_sub, s1.referrer, s1.date " \
"FROM clz_ssp_stats s1 " \
"left join (select ip_address, post_id, min(id) first_id from clz_ssp_stats group by ip_address, post_id) s2 " \
"ON s1.ip_address = s2.ip_address and s1.id = s2.first_id " \
"left join (select ip_Address, max(FLOOR(100000000 + (RAND() * 199999999))) random_ip_sub from clz_ssp_stats " \
"group by ip_Address) r ON s1.ip_address = r.ip_address " \
"WHERE s2.post_id is not null", conn, index_col=['id'])
#close the database connection
conn.close()
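#for reference, a pandas-only sketch of the same "first row per IP per episode" idea
#(dfFirstPandas is my own name; keeps the lowest id per IP/post pair, like the SQL's min(id)):
dfFirstPandas = (dfListens.reset_index()
                 .sort_values('id')
                 .drop_duplicates(subset=['IP','postnum'], keep='first'))
print("rows via pandas dedupe:", len(dfFirstPandas))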
#apply all of the same data transformations I made to df1 to this dataframe
dfListens2.columns = ['postnum','IP','method','timestamp']
dfListens2['day'] = pd.to_datetime(dfListens2['timestamp'], unit='s').dt.date
eps = {776:'Renee',807:'Will',841:'Safia',871:'Shlomo',896:'Sherman',921:'Clare',964:'Erin',
980:'Enda',1014:'Sebastian',1043:'Justin',1056:'Trey',1081:'Stephanie',1105:'DSLC',1118:'Debbie',
1159:'Anncmt',1204:'Election',1249:'Ed',1316:'Jasmine',1346:'David',1421:'Randy'}
dfListens2['episode'] = dfListens2['postnum'].replace(eps)
dfListens2.loc[dfListens2['method'].str.startswith('download'),'method'] = "download"
dfListens2.loc[dfListens2['method'] == '','method'] = "unknown"
dfListens2['IP'] = dfListens2['IP'].astype(int).astype(str)
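#since the same transformations now run on two dataframes, a small helper (my own refactor, not part of
#the original flow) would keep them in sync and avoid mixing up dfListens/dfListens2:
def clean_listens(df):
    df = df.copy()
    df.columns = ['postnum','IP','method','timestamp']
    df['IP'] = df['IP'].astype(int).astype(str)
    df['day'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
    df['episode'] = df['postnum'].replace(eps)
    df.loc[df['method'].str.startswith('download'),'method'] = "download"
    df.loc[df['method'] == '','method'] = "unknown"
    return df
#e.g. dfListens2 = clean_listens(dfListens2_raw), where dfListens2_raw is the untouched query result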
#check count of rows in new dataframe
print("rows in dataframe:",len(dfListens2),"\n")
#print header of new dataframe
print(dfListens2.head())
#group by IP and display counts in new dataframe
grpIP = dfListens2.groupby('IP').count()
print(grpIP['postnum'].sort_values(ascending=False).head())
#OK now each IP is only counted once per episode, let's redo the method count
#group by methods again and show counts, sorted
grpMethod = dfListens2.groupby('method').count()
print(grpMethod['postnum'].sort_values())
#bar chart of listen methods
grpMethod['postnum'].sort_values().plot(kind='bar')
#Which episodes are most popular?
#group by episodes again and show counts, sorted
grpEpisode = dfListens2.groupby('episode').count()
print(grpEpisode['postnum'].sort_values())
#bar chart of episodes
grpEpisode['postnum'].sort_values().plot(kind='bar', color='r')
print(dfListens2.head())
#get count of IPs accessing episode per day
dfEpisodeDays = dfListens2[['postnum','episode','day']].groupby(['episode','day']).count()
dfEpisodeDays.columns=['listencount']
dfEpisodeDays = dfEpisodeDays.reset_index().sort_values(by=['listencount'], ascending=False)
print('\nListens Grouped by Episode by Day')
print(dfEpisodeDays.head())
#get the max listen count day per episode
#note: if an episode was published late at night, or published but not made public immediately, its 1st day count will be low
dfEpDayMax = pd.DataFrame(dfEpisodeDays.groupby('episode').nth(0)).reset_index()
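#(.nth(0) takes the first row of each group in the current sort order; since rows are sorted by
#listencount descending, that's each episode's biggest day)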
#add weekday to see if most are Monday/Tuesday when episode published
dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).apply(lambda x: pydt.datetime.strftime(x, '%A'))
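#(equivalent and a bit shorter: dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).dt.day_name())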
print('Max Listens In a Day by Episode (ordered by listen count):')
print(dfEpDayMax.sort_values(by=['listencount']))
It makes sense that the earliest episodes have the lowest "biggest day" counts: they were released when there were few regular subscribers, so they built up listeners over time rather than getting as big a "pop" on release day as later episodes did.
#get the listen count on the first day each episode was released
#note: it would be "fairer" to take the 1st 24 hrs per episode, since I don't post at the same time of day each time (sketched after this cell)
dfEpisodeDays = dfEpisodeDays.reset_index(drop=True).sort_values(by=['day'])
print(dfEpisodeDays.head())
dfEpDayMax = pd.DataFrame(dfEpisodeDays.groupby('episode').nth(0)).reset_index()
#add weekday to see if most are Monday/Tuesday when episode published
dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).apply(lambda x: pydt.datetime.strftime(x, '%A'))
print('Listens on Release Day by Episode (ordered by date):')
print(dfEpDayMax.sort_values(by=['day']))
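#as noted above, a "fairer" cut is each episode's first 24 hours; a rough sketch (assumes timestamp is epoch seconds):
firstSeen = dfListens2.groupby('episode')['timestamp'].transform('min')
dfFirst24 = dfListens2[dfListens2['timestamp'] < firstSeen + 86400]
print('Listens in first 24 hrs by episode:')
print(dfFirst24.groupby('episode')['postnum'].count().sort_values())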
[Embedded Tableau Public viz: "Podcast listen monitoring" | interactive listens by episode by week]
#Need to build a Web Data Connector to the database for Tableau Public (a MySQL connection isn't available in the free version),
#so in the meantime, download a CSV of the podcast data to use
dfListens2.to_csv('BADSPodcast1stListenPerEpPerIP.csv', sep=',')
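#(the id index is written as the first CSV column by default; sep=',' is also the default, spelled out here for clarity)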