from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import datetime as pydt
#set up the database connection
engine = create_engine('mysql+pymysql://XXXXXXMYCONNECTIONSTRINGXXXXXX', echo=False)
conn = engine.connect().connection
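#aside: pandas can also take the SQLAlchemy engine (or a connection from
#engine.connect()) directly in read_sql, skipping the raw DBAPI handle;
#a minimal sketch, with the same query assumed:
#  dfListens = pd.read_sql(query, engine, index_col=['id'])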
#query the database to get row representing the 1st time each IP address accessed each episode
#note that a random number is being substituted in for each IP
dfListens = pd.read_sql("SELECT distinct s1.id, s1.post_id, r.random_ip_sub, s1.referrer, s1.date " \
"FROM clz_ssp_stats s1 " \
"left join (select ip_address, post_id, min(id) first_id from clz_ssp_stats group by ip_address, post_id) s2 " \
"ON s1.ip_address = s2.ip_address and s1.id = s2.first_id " \
"left join (select ip_Address, max(FLOOR(100000000 + (RAND() * 199999999))) random_ip_sub from clz_ssp_stats " \
"group by ip_Address) r ON s1.ip_address = r.ip_address " \
"WHERE s2.post_id is not null", conn, index_col=['id'])
#close the database connection
conn.close()
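#aside: on MySQL 8+ the same first-listen-per-IP-per-episode dedup can be
#written with a window function instead of the self-join; a sketch, not
#tested against this schema:
#  SELECT id, post_id, referrer, date
#  FROM (SELECT s.*, ROW_NUMBER() OVER
#            (PARTITION BY ip_address, post_id ORDER BY id) AS rn
#        FROM clz_ssp_stats s) ranked
#  WHERE rn = 1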
#rename the columns
dfListens.columns = ['postnum','IP','method','timestamp']
#display new randomized IP as string without scientific notation
dfListens['IP'] = dfListens['IP'].astype(int).astype(str)
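#caveat: .astype(int) raises on NaN, so this assumes every row got a random
#IP substitute; pandas' nullable integer dtype (0.24+) would tolerate gaps:
#  dfListens['IP'] = dfListens['IP'].astype('Int64').astype(str)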
#display the row count and first 5 rows
print("Count of rows in dataframe:",len(dfListens),"\n")
print("Example data rows:",dfListens.head())
#add a column with the date from timestamp
dfListens['day'] = pd.to_datetime(dfListens['timestamp'], unit='s').dt.date
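#e.g. a raw epoch-seconds value converts like so:
#  pd.to_datetime(1487350000, unit='s')  # Timestamp('2017-02-17 16:46:40')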
#create a column of episode names based on dictionary of post numbers
eps = {776:'Renee',807:'Will',841:'Safia',871:'Shlomo',896:'Sherman',921:'Clare',964:'Erin',
980:'Enda',1014:'Sebastian',1043:'Justin',1056:'Trey',1081:'Stephanie',1105:'DSLC',1118:'Debbie',
1159:'Anncmt',1204:'Election',1249:'Ed',1316:'Jasmine',1346:'David',1421:'Randy'}
dfListens['episode'] = dfListens['postnum'].replace(eps)
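#note: Series.replace leaves any post number missing from the dictionary
#as-is; Series.map would turn those into NaN unless given a fallback, e.g.:
#  dfListens['episode'] = dfListens['postnum'].map(eps).fillna(dfListens['postnum'])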
#set option to print all dataframe columns without wrapping
pd.set_option('expand_frame_repr', False)
#show last 5 rows after new columns added
print("Example transformed data rows:",dfListens.tail(n=5))
print("\nMax date in dataframe: ",dfListens['day'].max())
#normalize the method column; NULL referrers come back from the database as NaN
dfListens['method'] = dfListens['method'].fillna('')
#truncate anything that starts with "download" to be just "download"
dfListens.loc[dfListens['method'].str.startswith('download'),'method'] = "download"
#and replace the blank methods with "unknown"
dfListens.loc[dfListens['method'] == '','method'] = "unknown"
#group by method and show counts, sorted
grpMethod = dfListens.groupby('method').count()
print(grpMethod['postnum'].sort_values())
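#equivalent one-liner, for reference:
#  dfListens['method'].value_counts(ascending=True)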
#set plots to display inline in jupyter notebook
%matplotlib inline
#bar chart of listen methods
grpMethod['postnum'].sort_values().plot(kind='bar')
#group by episode and show counts, sorted
grpEpisode = dfListens.groupby('episode').count()
print(grpEpisode['postnum'].sort_values())
#bar chart of episode listen counts
grpEpisode['postnum'].sort_values().plot(kind='bar', color='r')
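#the plots take the usual matplotlib customizations; a sketch for axis labels:
#  import matplotlib.pyplot as plt
#  ax = grpEpisode['postnum'].sort_values().plot(kind='bar', color='r')
#  ax.set_ylabel('unique listens')
#  plt.show()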
#get count of IPs accessing episode per day
dfEpisodeDays = dfListens[['postnum','episode','day']].groupby(['episode','day']).count()
dfEpisodeDays.columns=['listencount']
dfEpisodeDays = dfEpisodeDays.reset_index().sort_values(by=['listencount'], ascending=False)
#get the listen count on the first day each episode was released
#note: would be "fairer" to get 1st 24 hrs per episode since i don't post at same time of day each time
dfEpisodeDays = dfEpisodeDays.sort_values(by=['day'])
#take the first row per episode, which is its earliest (release) day given the sort above
dfEpDayMax = pd.DataFrame(dfEpisodeDays.groupby('episode').nth(0)).reset_index()
#add weekday to see if most are Monday/Tuesday when episode published
dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).apply(lambda x: pydt.datetime.strftime(x, '%A'))
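#equivalent with newer pandas (0.23+), skipping the lambda:
#  dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).dt.day_name()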
print('Listens on Release Day by Episode (ordered by date):')
print(dfEpDayMax.sort_values(by=['day']))
#re-sort by listen count, descending, so the first row per episode is its biggest day
dfEpisodeDays = dfEpisodeDays.sort_values(by=['listencount'], ascending=False)
#get the max listen count day per episode
dfEpDayMax = pd.DataFrame(dfEpisodeDays.groupby('episode').nth(0)).reset_index()
dfEpDayMax['DayOfWeek'] = pd.to_datetime(dfEpDayMax['day']).apply(lambda x: pydt.datetime.strftime(x, '%A'))
print('Max Listens In a Day by Episode (ordered by listen count):')
print(dfEpDayMax.sort_values(by=['listencount']))
It makes sense that the earliest episodes have the lowest "biggest day" counts: they were released when there were few regular subscribers, so they built up their audiences over time and didn't get as big a "pop" on release day as later episodes did.
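One way to sanity-check this is to compare each episode's biggest single day against its lifetime total; a rough sketch using the frames built above (hedged: it assumes grpEpisode and dfEpDayMax still hold the per-episode totals and max-day rows shown earlier):

#merge each episode's max-day count with its lifetime listen total
dfShare = dfEpDayMax.merge(grpEpisode['postnum'].rename('totallistens').reset_index(), on='episode')
dfShare['maxdayshare'] = dfShare['listencount'] / dfShare['totallistens']
#early episodes should show a small share of their listens on their biggest day
print(dfShare[['episode','day','maxdayshare']].sort_values(by='maxdayshare'))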