Skip to content
This repository was archived by the owner on Feb 6, 2026. It is now read-only.

Commit abbb22b

Browse files
Merge pull request #41 from ShorensteinCenter/devel
[Feature, N/A] Add month-over-month changes to graphs and integrate playbook
2 parents fcbc439 + 7e11042 commit abbb22b

9 files changed

Lines changed: 498 additions & 158 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
node_modules/
22
*.exe
3-
*.png
43
__pycache__/
54
*.pyc
65
*.db
@@ -14,3 +13,4 @@ benchmarks-env/
1413
.coverage
1514
.DS_Store
1615
scratch/
16+
app/static/charts/*.png

app/static/img/light-bulb.png

5.01 KB
Loading
192 KB
Loading

app/tasks.py

Lines changed: 197 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
import os
33
import json
44
import time
5+
import calendar
56
from datetime import datetime, timedelta, timezone
67
import requests
8+
import pandas as pd
79
import numpy as np
810
from sqlalchemy import desc
9-
from sqlalchemy.sql.functions import func
1011
from celery.utils.log import get_task_logger
1112
from app import celery, db
1213
from app.emails import send_email
@@ -119,89 +120,216 @@ def import_analyze_store_list(list_data, org_id, user_email=None):
119120

120121
return list_stats
121122

122-
def send_report(stats, list_id, list_name, user_email_or_emails):
123+
def generate_summary_stats(list_stats_objects):
    """Generates summary statistics dictionaries for a list and the database.

    First extracts the stats from SQLAlchemy objects for the most recent and
    next to most recent analyses for the given list and organizes them in a
    dictionary. Then creates a similar dictionary for the database averages. If
    list_stats_objects only contains a single analysis, takes the average of the
    most recent analysis across all lists. Otherwise, takes the average of the
    most recent analysis and next to most recent analysis across all lists with
    two or more total analyses.

    Args:
        list_stats_objects: a list of SQLAlchemy query results, ordered from
            most recent to least recent. Must contain at least one element.

    Returns:
        A tuple consisting of two dictionaries. The first dictionary pertains to
        the list under analysis, the second pertains to the database as a whole.
        The dictionaries consist of keys, such as 'subscribers', and values,
        a list of analysis results. These lists contain either one element
        (the value for the most recent analysis), or two elements
        (the value for the next to most recent analysis followed by the value
        for the most recent analysis).

    Raises:
        IndexError: if list_stats_objects is empty.
    """

    # The only columns that are averaged across the database. Restricting the
    # frame to these before calling .mean() keeps non-numeric columns pulled
    # in by `list_stats.*` (ids, timestamps, etc.) out of the reduction, which
    # newer pandas versions refuse to average instead of silently dropping.
    agg_columns = [
        'subscribers',
        'subscribed_pct',
        'unsubscribed_pct',
        'cleaned_pct',
        'pending_pct',
        'open_rate',
        'high_open_rt_pct',
        'cur_yr_inactive_pct',
    ]

    # Convert the ListStats objects to easier-to-use dictionaries
    # Then merge the two dictionaries
    most_recent_stats = extract_stats(list_stats_objects[0])
    previous_stats = (extract_stats(list_stats_objects[1])
                      if len(list_stats_objects) > 1
                      else None)
    if previous_stats:
        list_stats = {k: [previous_stats[k], v]
                      for k, v in most_recent_stats.items()}
    else:
        list_stats = {k: [v] for k, v in most_recent_stats.items()}

    # Now generate averages
    # Works a bit differently depending on whether we're comparing
    # single or multiple analyses (see docstring)
    if previous_stats:

        # This query returns all list_stats objects where the list_id column
        # is duplicated elsewhere in the table as well as a row_number column,
        # e.g. row_number == 1 corresponds to the most recent analysis for each
        # grouping
        stats_frame = pd.read_sql(
            '''WITH has_prev_stats as (
                SELECT list_stats.list_id, COUNT(*) from list_stats
                LEFT JOIN email_list ON list_stats.list_id = email_list.list_id
                WHERE email_list.store_aggregates = 'True'
                GROUP BY list_stats.list_id HAVING COUNT(*) >= 2)
            SELECT list_stats.*,
                ROW_NUMBER() OVER(PARTITION BY list_stats.list_id
                                  ORDER BY analysis_timestamp DESC)
            FROM list_stats
            JOIN has_prev_stats
            ON has_prev_stats.list_id = list_stats.list_id;''',
            db.session.bind)

        # Previous analysis first, most recent analysis second, to match the
        # ordering used in list_stats
        frames = [stats_frame[stats_frame['row_number'] == 2],
                  stats_frame[stats_frame['row_number'] == 1]]

    else:

        stats_frame = pd.read_sql(
            ListStats.query.filter(ListStats.list.has(
                store_aggregates=True)).order_by('list_id', desc(
                    'analysis_timestamp')).distinct(
                        ListStats.list_id).statement,
            db.session.bind)

        frames = [stats_frame]

    # An empty selection averages to NaN; fall back to 0 so that the int()
    # conversion below cannot raise and the charts still receive numbers
    mean_dfs = [frame[agg_columns].mean().fillna(0) for frame in frames]

    agg_stats = {column: [mean_df[column] for mean_df in mean_dfs]
                 for column in agg_columns}

    # Subscriber counts are reported as whole numbers
    agg_stats['subscribers'] = [
        int(subscribers) for subscribers in agg_stats['subscribers']]

    return list_stats, agg_stats
207+
208+
def generate_diffs(list_stats, agg_stats):
    """Builds month-over-month change labels for the list and the database.

    Args:
        list_stats: a dictionary mapping a stat name to a two-element list,
            the previous value followed by the most recent value.
        agg_stats: a dictionary of the same shape holding database averages.
            Only the keys present in this dictionary are diffed.

    Returns:
        A dictionary mapping each key of agg_stats to a two-element list of
        strings: the relative change for the list followed by the relative
        change for the database average, formatted as a signed percentage
        with one decimal place (e.g. '+10.0%' or '-25.0%').
    """

    def _relative_change(values):
        # A falsy baseline (e.g. 0) cannot be divided by; report no change
        baseline = values[0]
        if not baseline:
            return 0
        return (values[1] - baseline) / baseline

    def _as_label(change):
        # Negative numbers already carry their sign once formatted
        template = '{:.1%}' if change < 0 else '+{:.1%}'
        return template.format(change)

    return {
        stat_name: [_as_label(_relative_change(source[stat_name]))
                    for source in (list_stats, agg_stats)]
        for stat_name in agg_stats
    }
222+
223+
def send_report( # pylint: disable=too-many-locals
224+
list_stats, agg_stats, list_id, list_name, user_email_or_emails):
123225
"""Generates charts using Plotly and emails them to the user.
124226
125227
Args:
126-
stats: a dictionary containing analysis results for a list.
228+
list_stats: a dictionary containing analysis results for a list.
229+
agg_stats: a dictionary containing aggregate analysis results from the
230+
database.
127231
list_id: the list's unique MailChimp id.
128232
list_name: the list's name.
129233
user_email_or_emails: a list of emails to send the report to.
130234
"""
131235

132-
# This subquery generates the most recent stats
133-
# For each unique list_id in the database
134-
# Where store_aggregates is True
135-
subquery = ListStats.query.filter(
136-
ListStats.list.has(store_aggregates=True)).order_by('list_id', desc(
137-
'analysis_timestamp')).distinct(ListStats.list_id).subquery()
138-
139-
# Generate aggregates within the subquery
140-
agg_stats = db.session.query(
141-
func.avg(subquery.columns.subscribers),
142-
func.avg(subquery.columns.subscribed_pct),
143-
func.avg(subquery.columns.unsubscribed_pct),
144-
func.avg(subquery.columns.cleaned_pct),
145-
func.avg(subquery.columns.pending_pct),
146-
func.avg(subquery.columns.open_rate),
147-
func.avg(subquery.columns.high_open_rt_pct),
148-
func.avg(subquery.columns.cur_yr_inactive_pct)).first()
149-
150-
# Make sure we have no 'None' values
151-
agg_stats = [agg if agg else 0 for agg in agg_stats]
152-
153-
# Convert subscribers average to an integer
154-
agg_stats[0] = int(agg_stats[0])
155-
156-
# Generate epoch time (to get around image caching in webmail)
236+
# Generate epoch time to append to filenames
237+
# This is a hacky workaround for webmail image caching
157238
epoch_time = str(int(time.time()))
158239

159-
# Generate charts
240+
# Figure out whether there's two sets of stats per graph
241+
contains_prev_month = len(list_stats['subscribers']) == 2
242+
243+
if contains_prev_month:
244+
245+
# Calculate the diffs (for month-over-month change labels)
246+
diff_vals = generate_diffs(list_stats, agg_stats)
247+
248+
# Get the current month and previous month in words (for labels)
249+
cur_month = datetime.now().month
250+
last_month = cur_month - 1 or 12
251+
cur_month_formatted = calendar.month_abbr[cur_month]
252+
last_month_formatted = calendar.month_abbr[last_month]
253+
254+
bar_titles = [
255+
'Your List<br>as of ' + last_month_formatted,
256+
'Your List<br>as of ' + cur_month_formatted,
257+
'Average<br>as of ' + last_month_formatted,
258+
'Average<br>as of ' + cur_month_formatted]
259+
stacked_bar_titles = [
260+
'Average <br>as of ' + last_month_formatted + ' ',
261+
'Average <br>as of ' + cur_month_formatted + ' ',
262+
'Your List <br>as of ' + last_month_formatted + ' ',
263+
'Your List <br>as of ' + cur_month_formatted + ' ']
264+
265+
else:
266+
267+
diff_vals = None
268+
bar_titles = ['Your List', 'Average']
269+
stacked_bar_titles = ['Average ', 'Your List ']
270+
160271
draw_bar(
161-
['Your List', 'Dataset Average'],
162-
[stats['subscribers'], agg_stats[0]],
272+
bar_titles,
273+
[*list_stats['subscribers'], *agg_stats['subscribers']],
274+
diff_vals['subscribers'] if diff_vals else None,
163275
'Chart A: List Size',
164276
list_id + '_size_' + epoch_time)
165277

166-
draw_stacked_horizontal_bar(
167-
['Dataset Average', 'Your List'],
168-
[('Subscribed %', [agg_stats[1], stats['subscribed_pct']]),
169-
('Unsubscribed %', [agg_stats[2], stats['unsubscribed_pct']]),
170-
('Cleaned %', [agg_stats[3], stats['cleaned_pct']]),
171-
('Pending %', [agg_stats[4], stats['pending_pct']])],
172-
'Chart B: List Composition',
173-
list_id + '_breakdown_' + epoch_time)
174-
175278
draw_bar(
176-
['Your List', 'Dataset Average'],
177-
[stats['open_rate'], agg_stats[5]],
279+
bar_titles,
280+
[*list_stats['open_rate'], *agg_stats['open_rate']],
281+
diff_vals['open_rate'] if diff_vals else None,
178282
'Chart C: List Open Rate',
179283
list_id + '_open_rate_' + epoch_time,
180284
percentage_values=True)
181285

286+
draw_stacked_horizontal_bar(
287+
stacked_bar_titles,
288+
[('Subscribed %',
289+
[*agg_stats['subscribed_pct'], *list_stats['subscribed_pct']]),
290+
('Unsubscribed %',
291+
[*agg_stats['unsubscribed_pct'], *list_stats['unsubscribed_pct']]),
292+
('Cleaned %',
293+
[*agg_stats['cleaned_pct'], *list_stats['cleaned_pct']]),
294+
('Pending %',
295+
[*agg_stats['pending_pct'], *list_stats['pending_pct']])],
296+
diff_vals['subscribed_pct'][::-1] if diff_vals else None,
297+
'Chart B: List Composition',
298+
list_id + '_breakdown_' + epoch_time)
299+
300+
182301
histogram_legend_uri = ('https://s3-us-west-2.amazonaws.com/email-'
183302
'benchmarking-imgs/open_rate_histogram_legend.png')
184303

185304
draw_histogram(
186305
{'title': 'Open Rate by Decile', 'vals': np.linspace(.05, .95, num=10)},
187-
{'title': 'Subscribers', 'vals': stats['hist_bin_counts']},
306+
{'title': 'Subscribers', 'vals': list_stats['hist_bin_counts'][0]},
188307
'Chart D: Distribution of Subscribers by Open Rate',
189308
histogram_legend_uri,
190309
list_id + '_open_rate_histogram_' + epoch_time)
191310

311+
high_open_rt_vals = [
312+
*list_stats['high_open_rt_pct'],
313+
*agg_stats['high_open_rt_pct']]
314+
192315
draw_donuts(
193316
['Open Rate >80%', 'Open Rate <=80%'],
194-
[('Your List',
195-
[stats['high_open_rt_pct'], 1 - stats['high_open_rt_pct']]),
196-
('Dataset Average', [agg_stats[6], 1 - agg_stats[6]])],
317+
[(title, [high_open_rt_vals[title_num], 1 - high_open_rt_vals[title_num]])
318+
for title_num, title in enumerate(bar_titles)],
319+
diff_vals['high_open_rt_pct'] if diff_vals else None,
197320
'Chart E: Percentage of Subscribers with User Unique Open Rate >80%',
198321
list_id + '_high_open_rt_pct_' + epoch_time)
199322

323+
cur_yr_inactive_vals = [
324+
*list_stats['cur_yr_inactive_pct'],
325+
*agg_stats['cur_yr_inactive_pct']]
326+
200327
draw_donuts(
201328
['Inactive in Past 365 Days', 'Active in Past 365 Days'],
202-
[('Your List',
203-
[stats['cur_yr_inactive_pct'], 1 - stats['cur_yr_inactive_pct']]),
204-
('Dataset Average', [agg_stats[7], 1 - agg_stats[7]])],
329+
[(title,
330+
[cur_yr_inactive_vals[title_num], 1 - cur_yr_inactive_vals[title_num]])
331+
for title_num, title in enumerate(bar_titles)],
332+
diff_vals['cur_yr_inactive_pct'] if diff_vals else None,
205333
'Chart F: Percentage of Subscribers who did not Open '
206334
'in last 365 Days',
207335
list_id + '_cur_yr_inactive_pct_' + epoch_time)
@@ -233,10 +361,11 @@ def extract_stats(list_object):
233361
def init_list_analysis(user_data, list_data, org_id):
234362
"""Celery task wrapper for each stage of analyzing a list.
235363
236-
First checks if there is a recently cached analysis, i.e. already in the
237-
database. If not, calls import_analyze_store_list() to generate
238-
the ListStats and an associated EmailList. Next updates the user's
239-
privacy options (e.g. store_aggregates, monthly_updates) if the list was
364+
First checks if there is a recently cached analysis/analyses,
365+
i.e. already in the database. If not, calls import_analyze_store_list()
366+
to generate the ListStats (and an associated EmailList, if the user
367+
gave permission to store their data). Next updates the user's privacy options
368+
(e.g. store_aggregates, monthly_updates) if the list was
240369
cached. Then checks if the user selected monthly updates, if so,
241370
create the relationship. Finally, generates a benchmarking
242371
report with the stats.
@@ -247,12 +376,12 @@ def init_list_analysis(user_data, list_data, org_id):
247376
org_id: the id of the organization associated with the list.
248377
"""
249378

250-
# Try to pull the most recent ListStats from the database
251-
# Otherwise generate them
252-
most_recent_analysis = (ListStats.query.filter_by(
379+
# Try to pull the two most recent ListStats records from the database
380+
# Otherwise generate one
381+
analyses = (ListStats.query.filter_by(
253382
list_id=list_data['list_id']).order_by(desc(
254-
'analysis_timestamp')).first() or import_analyze_store_list(
255-
list_data, org_id, user_data['email']))
383+
'analysis_timestamp')).limit(2).all() or [
384+
import_analyze_store_list(list_data, org_id, user_data['email'])])
256385

257386
# If the user chose to store their data, there will be an associated
258387
# EmailList object
@@ -278,9 +407,9 @@ def init_list_analysis(user_data, list_data, org_id):
278407
if list_data['monthly_updates']:
279408
associate_user_with_list(user_data['user_id'], list_object)
280409

281-
# Convert the ListStats object to an easier-to-use dictionary
282-
stats = extract_stats(most_recent_analysis)
283-
send_report(stats, list_data['list_id'],
410+
list_stats, agg_stats = generate_summary_stats(analyses)
411+
412+
send_report(list_stats, agg_stats, list_data['list_id'],
284413
list_data['list_name'], [user_data['email']])
285414

286415
@celery.task
@@ -398,12 +527,13 @@ def send_monthly_reports():
398527
monthly_report_list.list_id)
399528

400529
# Get the most recent analysis for the list
401-
stats_object = ListStats.query.filter_by(
402-
list_id=monthly_report_list.list_id).order_by(
403-
desc('analysis_timestamp')).first()
530+
analyses = ListStats.query.filter_by(
531+
list_id=monthly_report_list.list_id).order_by(desc(
532+
'analysis_timestamp')).limit(2).all()
533+
534+
# Generate summary statistics
535+
list_stats, agg_stats = generate_summary_stats(analyses)
404536

405-
# Extract stats from the list object
406-
stats = extract_stats(stats_object)
407-
send_report(stats, monthly_report_list.list_id,
537+
send_report(list_stats, agg_stats, monthly_report_list.list_id,
408538
monthly_report_list.list_name,
409539
users_to_email)

app/templates/email-base.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<p style="text-align:left;margin-bottom:2em">
99
<img style="width:175px;" alt="logo" src="{{ url_for('static', filename='img/logo.png', _external=True) }}">
1010
</p>
11-
<p style="color:#a71930;text-align:center;font-weight:400;font-size:2rem;font-family:Montserrat,Arial,sans-serif;margin-bottom:1.7rem">{{ title }}</p>
11+
<p style="color:#a71930;text-align:center;font-weight:400;font-size:2.5rem;line-height:3.5rem;font-family:Montserrat,Verdana,sans-serif;margin-bottom:1.7rem">{{ title }}</p>
1212
{% block content %}{% endblock %}
1313
</body>
1414
</html>

0 commit comments

Comments
 (0)