22import os
33import json
44import time
5+ import calendar
56from datetime import datetime , timedelta , timezone
67import requests
8+ import pandas as pd
79import numpy as np
810from sqlalchemy import desc
9- from sqlalchemy .sql .functions import func
1011from celery .utils .log import get_task_logger
1112from app import celery , db
1213from app .emails import send_email
@@ -119,89 +120,216 @@ def import_analyze_store_list(list_data, org_id, user_email=None):
119120
120121 return list_stats
121122
122- def send_report (stats , list_id , list_name , user_email_or_emails ):
def _mean_stats(frame):
    """Averages the benchmark columns of a DataFrame of list_stats rows.

    Only the columns that feed the report are averaged, so non-numeric
    columns (ids, timestamps, histogram data) never reach the mean
    calculation. An average that cannot be computed (no rows, or only
    NULLs) is reported as 0 rather than NaN.

    Args:
        frame: a pandas DataFrame containing the benchmark columns.

    Returns:
        A dictionary mapping each benchmark column name to its average.
    """
    columns = ('subscribers', 'subscribed_pct', 'unsubscribed_pct',
               'cleaned_pct', 'pending_pct', 'open_rate',
               'high_open_rt_pct', 'cur_yr_inactive_pct')
    means = {}
    for column in columns:
        mean = pd.to_numeric(frame[column], errors='coerce').mean()
        means[column] = 0 if pd.isna(mean) else mean
    return means


def generate_summary_stats(list_stats_objects):
    """Generates summary statistics dictionaries for a list and the database.

    First extracts the stats from SQLAlchemy objects for the most recent and
    next to most recent analyses for the given list and organizes them in a
    dictionary. Then creates a similar dictionary for the database averages. If
    list_stats_objects only contains a single analysis, takes the average of the
    most recent analysis across all lists. Otherwise, takes the average of the
    most recent analysis and next to most recent analysis across all lists with
    two or more total analyses.

    Args:
        list_stats_objects: a list of SQLAlchemy query results, ordered from
            most recent to least recent. Must contain at least one element.

    Returns:
        A tuple consisting of two dictionaries. The first dictionary pertains to
        the list under analysis, the second pertains to the database as a whole.
        The dictionaries consist of keys, such as 'subscribers', and values,
        a list of analysis results. These lists contain either one element
        (the value for the most recent analysis), or two elements
        (the value for the next to most recent analysis followed by the value
        for the most recent analysis). Database averages which cannot be
        computed (e.g. no list has opted in to aggregate storage) are
        reported as 0.
    """

    # Convert the ListStats objects to easier-to-use dictionaries
    # Then merge the two dictionaries
    most_recent_stats = extract_stats(list_stats_objects[0])
    previous_stats = (extract_stats(list_stats_objects[1])
                      if len(list_stats_objects) > 1
                      else None)
    if previous_stats:
        list_stats = {k: [previous_stats[k], v]
                      for k, v in most_recent_stats.items()}
    else:
        list_stats = {k: [v] for k, v in most_recent_stats.items()}

    # Now generate averages
    # Works a bit differently depending on whether we're comparing
    # single or multiple analyses (see docstring)
    if previous_stats:

        # This query returns all list_stats rows whose list_id appears at
        # least twice in the table (for lists storing aggregates), as well
        # as a row_number column, e.g. row_number == 1 corresponds to the
        # most recent analysis for each list_id
        df = pd.read_sql(  # pylint: disable=invalid-name
            '''WITH has_prev_stats as (
                SELECT list_stats.list_id, COUNT(*) from list_stats
                LEFT JOIN email_list ON list_stats.list_id = email_list.list_id
                WHERE email_list.store_aggregates = 'True'
                GROUP BY list_stats.list_id HAVING COUNT(*) >= 2)
            SELECT list_stats.*,
                ROW_NUMBER() OVER(PARTITION BY list_stats.list_id
                    ORDER BY analysis_timestamp DESC)
            FROM list_stats
            JOIN has_prev_stats
            ON has_prev_stats.list_id = list_stats.list_id;''',
            db.session.bind)

        # Previous analysis first, most recent analysis second, to match
        # the ordering used in list_stats
        mean_dicts = [_mean_stats(df[df['row_number'] == 2]),
                      _mean_stats(df[df['row_number'] == 1])]

    else:

        # Most recent analysis for each list which stores aggregates
        df = pd.read_sql(  # pylint: disable=invalid-name
            ListStats.query.filter(ListStats.list.has(
                store_aggregates=True)).order_by('list_id', desc(
                    'analysis_timestamp')).distinct(
                        ListStats.list_id).statement,
            db.session.bind)

        mean_dicts = [_mean_stats(df)]

    agg_stats = {
        # The subscribers average is presented as a whole number
        'subscribers': [int(means['subscribers']) for means in mean_dicts],
        'subscribed_pct': [means['subscribed_pct'] for means in mean_dicts],
        'unsubscribed_pct': [
            means['unsubscribed_pct'] for means in mean_dicts],
        'cleaned_pct': [means['cleaned_pct'] for means in mean_dicts],
        'pending_pct': [means['pending_pct'] for means in mean_dicts],
        'open_rate': [means['open_rate'] for means in mean_dicts],
        'high_open_rt_pct': [
            means['high_open_rt_pct'] for means in mean_dicts],
        'cur_yr_inactive_pct': [
            means['cur_yr_inactive_pct'] for means in mean_dicts]
    }

    return list_stats, agg_stats
207+
def _format_pct_change(values):
    """Formats the relative change between the first two entries of values.

    Returns a signed percentage string with one decimal place, e.g. '+5.0%'
    or '-12.3%'. When there is no previous value to compare against, or the
    previous value is zero (falsy), the change is reported as '+0.0%'.
    """
    if len(values) < 2 or not values[0]:
        change = 0
    else:
        previous, current = values[0], values[1]
        change = (current - previous) / previous
    template = '+{:.1%}' if change >= 0 else '{:.1%}'
    return template.format(change)


def generate_diffs(list_stats, agg_stats):
    """Generates diffs between last month and this month's stats and returns
    them in a dictionary.

    Args:
        list_stats: a dictionary mapping stat names to
            [previous value, most recent value] for the list under analysis.
        agg_stats: a dictionary with the same layout holding the database
            averages. Only the keys present in agg_stats are diffed.

    Returns:
        A dictionary mapping each key of agg_stats to a two-element list of
        formatted percentage strings: the change for the list, followed by
        the change for the database average.
    """
    return {
        k: [_format_pct_change(list_stats[k]),
            _format_pct_change(agg_stats[k])]
        for k in agg_stats
    }
222+
223+ def send_report ( # pylint: disable=too-many-locals
224+ list_stats , agg_stats , list_id , list_name , user_email_or_emails ):
123225 """Generates charts using Plotly and emails them to the user.
124226
125227 Args:
126- stats: a dictionary containing analysis results for a list.
228+ list_stats: a dictionary containing analysis results for a list.
229+ agg_stats: a dictionary containing aggregate analysis results from the
230+ database.
127231 list_id: the list's unique MailChimp id.
128232 list_name: the list's name.
129233 user_email_or_emails: a list of emails to send the report to.
130234 """
131235
132- # This subquery generates the most recent stats
133- # For each unique list_id in the database
134- # Where store_aggregates is True
135- subquery = ListStats .query .filter (
136- ListStats .list .has (store_aggregates = True )).order_by ('list_id' , desc (
137- 'analysis_timestamp' )).distinct (ListStats .list_id ).subquery ()
138-
139- # Generate aggregates within the subquery
140- agg_stats = db .session .query (
141- func .avg (subquery .columns .subscribers ),
142- func .avg (subquery .columns .subscribed_pct ),
143- func .avg (subquery .columns .unsubscribed_pct ),
144- func .avg (subquery .columns .cleaned_pct ),
145- func .avg (subquery .columns .pending_pct ),
146- func .avg (subquery .columns .open_rate ),
147- func .avg (subquery .columns .high_open_rt_pct ),
148- func .avg (subquery .columns .cur_yr_inactive_pct )).first ()
149-
150- # Make sure we have no 'None' values
151- agg_stats = [agg if agg else 0 for agg in agg_stats ]
152-
153- # Convert subscribers average to an integer
154- agg_stats [0 ] = int (agg_stats [0 ])
155-
156- # Generate epoch time (to get around image caching in webmail)
236+ # Generate epoch time to append to filenames
237+ # This is a hacky workaround for webmail image caching
157238 epoch_time = str (int (time .time ()))
158239
159- # Generate charts
240+ # Figure out whether there are two sets of stats per graph
241+ contains_prev_month = len (list_stats ['subscribers' ]) == 2
242+
243+ if contains_prev_month :
244+
245+ # Calculate the diffs (for month-over-month change labels)
246+ diff_vals = generate_diffs (list_stats , agg_stats )
247+
248+ # Get the current month and previous month in words (for labels)
249+ cur_month = datetime .now ().month
250+ last_month = cur_month - 1 or 12
251+ cur_month_formatted = calendar .month_abbr [cur_month ]
252+ last_month_formatted = calendar .month_abbr [last_month ]
253+
254+ bar_titles = [
255+ 'Your List<br>as of ' + last_month_formatted ,
256+ 'Your List<br>as of ' + cur_month_formatted ,
257+ 'Average<br>as of ' + last_month_formatted ,
258+ 'Average<br>as of ' + cur_month_formatted ]
259+ stacked_bar_titles = [
260+ 'Average <br>as of ' + last_month_formatted + ' ' ,
261+ 'Average <br>as of ' + cur_month_formatted + ' ' ,
262+ 'Your List <br>as of ' + last_month_formatted + ' ' ,
263+ 'Your List <br>as of ' + cur_month_formatted + ' ' ]
264+
265+ else :
266+
267+ diff_vals = None
268+ bar_titles = ['Your List' , 'Average' ]
269+ stacked_bar_titles = ['Average ' , 'Your List ' ]
270+
160271 draw_bar (
161- ['Your List' , 'Dataset Average' ],
162- [stats ['subscribers' ], agg_stats [0 ]],
272+ bar_titles ,
273+ [* list_stats ['subscribers' ], * agg_stats ['subscribers' ]],
274+ diff_vals ['subscribers' ] if diff_vals else None ,
163275 'Chart A: List Size' ,
164276 list_id + '_size_' + epoch_time )
165277
166- draw_stacked_horizontal_bar (
167- ['Dataset Average' , 'Your List' ],
168- [('Subscribed %' , [agg_stats [1 ], stats ['subscribed_pct' ]]),
169- ('Unsubscribed %' , [agg_stats [2 ], stats ['unsubscribed_pct' ]]),
170- ('Cleaned %' , [agg_stats [3 ], stats ['cleaned_pct' ]]),
171- ('Pending %' , [agg_stats [4 ], stats ['pending_pct' ]])],
172- 'Chart B: List Composition' ,
173- list_id + '_breakdown_' + epoch_time )
174-
175278 draw_bar (
176- ['Your List' , 'Dataset Average' ],
177- [stats ['open_rate' ], agg_stats [5 ]],
279+ bar_titles ,
280+ [* list_stats ['open_rate' ], * agg_stats ['open_rate' ]],
281+ diff_vals ['open_rate' ] if diff_vals else None ,
178282 'Chart C: List Open Rate' ,
179283 list_id + '_open_rate_' + epoch_time ,
180284 percentage_values = True )
181285
286+ draw_stacked_horizontal_bar (
287+ stacked_bar_titles ,
288+ [('Subscribed %' ,
289+ [* agg_stats ['subscribed_pct' ], * list_stats ['subscribed_pct' ]]),
290+ ('Unsubscribed %' ,
291+ [* agg_stats ['unsubscribed_pct' ], * list_stats ['unsubscribed_pct' ]]),
292+ ('Cleaned %' ,
293+ [* agg_stats ['cleaned_pct' ], * list_stats ['cleaned_pct' ]]),
294+ ('Pending %' ,
295+ [* agg_stats ['pending_pct' ], * list_stats ['pending_pct' ]])],
296+ diff_vals ['subscribed_pct' ][::- 1 ] if diff_vals else None ,
297+ 'Chart B: List Composition' ,
298+ list_id + '_breakdown_' + epoch_time )
299+
300+
182301 histogram_legend_uri = ('https://s3-us-west-2.amazonaws.com/email-'
183302 'benchmarking-imgs/open_rate_histogram_legend.png' )
184303
185304 draw_histogram (
186305 {'title' : 'Open Rate by Decile' , 'vals' : np .linspace (.05 , .95 , num = 10 )},
187- {'title' : 'Subscribers' , 'vals' : stats ['hist_bin_counts' ]},
306+ {'title' : 'Subscribers' , 'vals' : list_stats ['hist_bin_counts' ][ 0 ]},
188307 'Chart D: Distribution of Subscribers by Open Rate' ,
189308 histogram_legend_uri ,
190309 list_id + '_open_rate_histogram_' + epoch_time )
191310
311+ high_open_rt_vals = [
312+ * list_stats ['high_open_rt_pct' ],
313+ * agg_stats ['high_open_rt_pct' ]]
314+
192315 draw_donuts (
193316 ['Open Rate >80%' , 'Open Rate <=80%' ],
194- [('Your List' ,
195- [ stats [ 'high_open_rt_pct' ], 1 - stats [ 'high_open_rt_pct' ]]) ,
196- ( 'Dataset Average' , [ agg_stats [ 6 ], 1 - agg_stats [ 6 ]])] ,
317+ [(title , [ high_open_rt_vals [ title_num ], 1 - high_open_rt_vals [ title_num ]])
318+ for title_num , title in enumerate ( bar_titles )] ,
319+ diff_vals [ 'high_open_rt_pct' ] if diff_vals else None ,
197320 'Chart E: Percentage of Subscribers with User Unique Open Rate >80%' ,
198321 list_id + '_high_open_rt_pct_' + epoch_time )
199322
323+ cur_yr_inactive_vals = [
324+ * list_stats ['cur_yr_inactive_pct' ],
325+ * agg_stats ['cur_yr_inactive_pct' ]]
326+
200327 draw_donuts (
201328 ['Inactive in Past 365 Days' , 'Active in Past 365 Days' ],
202- [('Your List' ,
203- [stats ['cur_yr_inactive_pct' ], 1 - stats ['cur_yr_inactive_pct' ]]),
204- ('Dataset Average' , [agg_stats [7 ], 1 - agg_stats [7 ]])],
329+ [(title ,
330+ [cur_yr_inactive_vals [title_num ], 1 - cur_yr_inactive_vals [title_num ]])
331+ for title_num , title in enumerate (bar_titles )],
332+ diff_vals ['cur_yr_inactive_pct' ] if diff_vals else None ,
205333 'Chart F: Percentage of Subscribers who did not Open '
206334 'in last 365 Days' ,
207335 list_id + '_cur_yr_inactive_pct_' + epoch_time )
@@ -233,10 +361,11 @@ def extract_stats(list_object):
233361def init_list_analysis (user_data , list_data , org_id ):
234362 """Celery task wrapper for each stage of analyzing a list.
235363
236- First checks if there is a recently cached analysis, i.e. already in the
237- database. If not, calls import_analyze_store_list() to generate
238- the ListStats and an associated EmailList. Next updates the user's
239- privacy options (e.g. store_aggregates, monthly_updates) if the list was
364+ First checks if there is a recently cached analysis/analyses,
365+ i.e. already in the database. If not, calls import_analyze_store_list()
366+ to generate the ListStats (and an associated EmailList, if the user
367+ gave permission to store their data). Next updates the user's privacy options
368+ (e.g. store_aggregates, monthly_updates) if the list was
240369 cached. Then checks if the user selected monthly updates, if so,
241370 create the relationship. Finally, generates a benchmarking
242371 report with the stats.
@@ -247,12 +376,12 @@ def init_list_analysis(user_data, list_data, org_id):
247376 org_id: the id of the organization associated with the list.
248377 """
249378
250- # Try to pull the most recent ListStats from the database
251- # Otherwise generate them
252- most_recent_analysis = (ListStats .query .filter_by (
379+ # Try to pull the two most recent ListStats records from the database
380+ # Otherwise generate one
381+ analyses = (ListStats .query .filter_by (
253382 list_id = list_data ['list_id' ]).order_by (desc (
254- 'analysis_timestamp' )).first () or import_analyze_store_list (
255- list_data , org_id , user_data ['email' ]))
383+ 'analysis_timestamp' )).limit ( 2 ). all () or [
384+ import_analyze_store_list ( list_data , org_id , user_data ['email' ])] )
256385
257386 # If the user chose to store their data, there will be an associated
258387 # EmailList object
@@ -278,9 +407,9 @@ def init_list_analysis(user_data, list_data, org_id):
278407 if list_data ['monthly_updates' ]:
279408 associate_user_with_list (user_data ['user_id' ], list_object )
280409
281- # Convert the ListStats object to an easier-to-use dictionary
282- stats = extract_stats ( most_recent_analysis )
283- send_report (stats , list_data ['list_id' ],
410+ list_stats , agg_stats = generate_summary_stats ( analyses )
411+
412+ send_report (list_stats , agg_stats , list_data ['list_id' ],
284413 list_data ['list_name' ], [user_data ['email' ]])
285414
286415@celery .task
@@ -398,12 +527,13 @@ def send_monthly_reports():
398527 monthly_report_list .list_id )
399528
400529 # Get the most recent analysis for the list
401- stats_object = ListStats .query .filter_by (
402- list_id = monthly_report_list .list_id ).order_by (
403- desc ('analysis_timestamp' )).first ()
530+ analyses = ListStats .query .filter_by (
531+ list_id = monthly_report_list .list_id ).order_by (desc (
532+ 'analysis_timestamp' )).limit (2 ).all ()
533+
534+ # Generate summary statistics
535+ list_stats , agg_stats = generate_summary_stats (analyses )
404536
405- # Extract stats from the list object
406- stats = extract_stats (stats_object )
407- send_report (stats , monthly_report_list .list_id ,
537+ send_report (list_stats , agg_stats , monthly_report_list .list_id ,
408538 monthly_report_list .list_name ,
409539 users_to_email )
0 commit comments