Skip to content

Commit 14675ab

Browse files
authored
Merge pull request #59 from datakind/pdp-update-data-assessment-nb-template
[pdp] Refine data assessment template nb
2 parents bcf8ccb + 9770382 commit 14675ab

1 file changed

Lines changed: 81 additions & 58 deletions

File tree

notebooks/pdp/01-data-assessment-eda-TEMPLATE.py

Lines changed: 81 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -15,15 +15,18 @@
1515
# COMMAND ----------
1616

1717
# MAGIC %md
18-
# MAGIC # Setup
18+
# MAGIC # setup
1919

2020
# COMMAND ----------
2121

2222
# MAGIC %sh python --version
2323

2424
# COMMAND ----------
2525

26-
# install dependencies, most of which should come through our 1st-party SST package
26+
# install dependencies, most/all of which should come through our 1st-party SST package
27+
# NOTE: it's okay to use 'develop' or a feature branch while developing this nb
28+
# but when it's finished, it's best to pin to a specific version of the package
29+
# %pip install "student-success-tool == 0.1.0"
2730
# %pip install git+https://github.com/datakind/student-success-tool.git@develop
2831

2932
# COMMAND ----------
@@ -33,7 +36,6 @@
3336
# COMMAND ----------
3437

3538
import logging
36-
import os
3739
import sys
3840

3941
import matplotlib.pyplot as plt
@@ -44,23 +46,24 @@
4446
from databricks.connect import DatabricksSession
4547
from databricks.sdk.runtime import dbutils
4648

49+
from student_success_tool import configs
4750
from student_success_tool.analysis import pdp
4851

4952
# COMMAND ----------
5053

51-
logging.basicConfig(level=logging.INFO)
54+
logging.basicConfig(level=logging.INFO, force=True)
5255
logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger
5356

5457
try:
55-
spark_session = DatabricksSession.builder.getOrCreate()
58+
spark = DatabricksSession.builder.getOrCreate()
5659
except Exception:
5760
logging.warning("unable to create spark session; are you in a Databricks runtime?")
5861
pass
5962

6063
# COMMAND ----------
6164

6265
# MAGIC %md
63-
# MAGIC ## `student-success-intervention` hacks
66+
# MAGIC ## import school-specific code
6467

6568
# COMMAND ----------
6669

@@ -69,38 +72,26 @@
6972

7073
# COMMAND ----------
7174

72-
# HACK: insert our 1st-party (school-specific) code into PATH
75+
# insert our 1st-party (school-specific) code into PATH
7376
if "../" not in sys.path:
7477
sys.path.insert(1, "../")
7578

76-
# TODO: specify school's subpackage
79+
# TODO: specify school's subpackage here
7780
from analysis import * # noqa: F403
7881

7982
# COMMAND ----------
8083

81-
# MAGIC %md
82-
# MAGIC ## unity catalog config
83-
84-
# COMMAND ----------
85-
86-
catalog = "sst_dev"
87-
88-
# configure where data is to be read from / written to
89-
inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog
90-
read_schema = f"{inst_name}_bronze"
91-
write_schema = f"{inst_name}_silver"
92-
93-
path_volume = os.path.join(
94-
"/Volumes", catalog, read_schema, f"{inst_name}_bronze_file_volume"
95-
)
96-
path_table = f"{catalog}.{read_schema}"
97-
print(f"{path_table=}")
98-
print(f"{path_volume=}")
84+
# project configuration should be stored in a config file in TOML format
85+
# it'll start out with just basic info: institution_id, institution_name
86+
# but as each step of the pipeline gets built, more parameters will be moved
87+
# from hard-coded notebook variables to shareable, persistent config fields
88+
cfg = configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2)
89+
cfg
9990

10091
# COMMAND ----------
10192

10293
# MAGIC %md
103-
# MAGIC # Read and Validate Raw Data
94+
# MAGIC # read and validate raw data
10495

10596
# COMMAND ----------
10697

@@ -109,14 +100,16 @@
109100

110101
# COMMAND ----------
111102

112-
# TODO: fill in school's name; may not be same as in the schemas above
113-
fpath_course = os.path.join(path_volume, "SCHOOL_COURSE_AR_DEID_DTTM.csv")
103+
# TODO: fill in the actual path to school's raw course file
104+
# okay to add it to project config now or later, whatever you prefer
105+
raw_course_file_path = cfg.datasets["labeled"].raw_course.file_path
106+
# raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv"
114107

115108
# COMMAND ----------
116109

117110
# read without any schema validation, so we can look at the data "raw"
118111
df_course_raw = pdp.dataio.read_raw_pdp_course_data_from_file(
119-
fpath_course, schema=None, dttm_format="%Y%m%d.0"
112+
raw_course_file_path, schema=None, dttm_format="%Y%m%d.0"
120113
)
121114
print(f"rows x cols = {df_course_raw.shape}")
122115
df_course_raw.head()
@@ -127,6 +120,10 @@
127120

128121
# COMMAND ----------
129122

123+
df_course_raw["course_begin_date"].describe()
124+
125+
# COMMAND ----------
126+
130127
# MAGIC %md
131128
# MAGIC Quick checks:
132129
# MAGIC - [ ] data exists where it should
@@ -137,14 +134,16 @@
137134

138135
# try to read data while validating with the "base" PDP schema
139136
df_course = pdp.dataio.read_raw_pdp_course_data_from_file(
140-
fpath_course, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0"
137+
raw_course_file_path,
138+
schema=pdp.schemas.RawPDPCourseDataSchema,
139+
dttm_format="%Y%m%d.0",
141140
)
142141
df_course
143142

144143
# COMMAND ----------
145144

146145
# MAGIC %md
147-
# MAGIC If the above command works, and `df_course` is indeed a `pd.DataFrame` containing the validated + parsed PDP cohort dataset, then you're all set, and can skip ahead to the next section. If not, and this is instead a json blob of schema errors, then you'll need to iteratively develop school-specific overrides. There are existing examples you can refer to in the `student-success-intervention` repo.
146+
# MAGIC If the above command works, and `df_course` is indeed a `pd.DataFrame` containing the validated + parsed PDP course dataset, then you're all set, and can skip ahead to the next section. If not, and this is instead a json blob of schema errors, then you'll need to inspect those errors and iteratively develop school-specific overrides to handle them. There are existing examples you can refer to in the `student-success-intervention` repo if you're unsure.
148147
# MAGIC
149148
# MAGIC This will involve some ad-hoc exploratory work, depending on the schema errors. For example:
150149
# MAGIC
@@ -199,7 +198,7 @@
199198
# MAGIC ```
200199
# MAGIC
201200
# MAGIC At this point, `df_course` should be a properly validated and parsed data frame, ready for exploratory data analysis.
202-
201+
# MAGIC
203202

204203
# COMMAND ----------
205204

@@ -208,22 +207,25 @@
208207

209208
# COMMAND ----------
210209

211-
212-
# TODO: fill in school's name; may not be same as in the schemas above
213-
fpath_cohort = os.path.join(path_volume, "SCHOOL_COHORT_AR_DEID_DTTM.csv")
210+
# TODO: fill in the actual path to school's raw cohort file
211+
# okay to add it to project config now or later, whatever you prefer
212+
raw_cohort_file_path = cfg.datasets["labeled"].raw_cohort.file_path
213+
# raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv"
214214

215215
# COMMAND ----------
216216

217217
# read without any schema validation, so we can look at the data "raw"
218-
df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(fpath_cohort, schema=None)
218+
df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(
219+
raw_cohort_file_path, schema=None
220+
)
219221
print(f"rows x cols = {df_cohort_raw.shape}")
220222
df_cohort_raw.head()
221223

222224
# COMMAND ----------
223225

224226
# try to read data while validating with the "base" PDP schema
225227
df_cohort = pdp.dataio.read_raw_pdp_cohort_data_from_file(
226-
fpath_cohort, schema=pdp.schemas.base.RawPDPCohortDataSchema
228+
raw_cohort_file_path, schema=pdp.schemas.base.RawPDPCohortDataSchema
227229
)
228230
df_cohort
229231

@@ -242,44 +244,52 @@
242244
# COMMAND ----------
243245

244246
# MAGIC %md
245-
# MAGIC ## save validated data
247+
# MAGIC ## STOP HERE!
248+
249+
# COMMAND ----------
250+
251+
# MAGIC %md
252+
# MAGIC Before continuing on to EDA, now's a great time to do a couple things:
253+
# MAGIC
254+
# MAGIC - Copy any school-specific raw dataset schemas into a `schemas.py` file in the current working directory
255+
# MAGIC - Copy any school-specific preprocessing functions (i.e. those needed to coerce the raw data into a standardized form) into a `dataio.py` file in the current working directory
256+
# MAGIC - **Optional:** If you want easy access to outputs from every (sub-)step of the data transformation pipeline, save the validated datasets into this school's "silver" schema in Unity Catalog.
246257

247258
# COMMAND ----------
248259

249260
pdp.dataio.write_data_to_delta_table(
250261
df_course,
251-
f"{catalog}.{write_schema}.course_dataset_validated",
252-
spark_session=spark_session,
262+
"CATALOG.INST_NAME_silver.course_dataset_validated",
263+
spark_session=spark,
253264
)
254265

255266
# COMMAND ----------
256267

257268
pdp.dataio.write_data_to_delta_table(
258269
df_cohort,
259-
f"{catalog}.{write_schema}.cohort_dataset_validated",
260-
spark_session=spark_session,
270+
"CATALOG.INST_NAME_silver.cohort_dataset_validated",
271+
spark_session=spark,
261272
)
262273

263274
# COMMAND ----------
264275

265276
# MAGIC %md
266-
# MAGIC # Exploratory Data Analysis
277+
# MAGIC # exploratory data analysis
267278

268279
# COMMAND ----------
269280

270-
# MAGIC %md
271281
# MAGIC %md
272282
# MAGIC ## read validated data
273283
# MAGIC
274-
# MAGIC (so you don't have to execute the validation process more than once)
284+
# MAGIC (optional, so you don't have to execute the validation process more than once)
275285

276286
# COMMAND ----------
277287

278288
# use base or school-specific schema, as needed
279289
df_course = pdp.schemas.RawPDPCourseDataSchema(
280290
pdp.dataio.read_data_from_delta_table(
281-
f"{catalog}.{write_schema}.course_dataset_validated",
282-
spark_session=spark_session,
291+
"CATALOG.INST_NAME_silver.course_dataset_validated",
292+
spark_session=spark,
283293
)
284294
)
285295
df_course.shape
@@ -288,8 +298,8 @@
288298

289299
df_cohort = pdp.schemas.RawCohortDataSchema(
290300
pdp.dataio.read_data_from_delta_table(
291-
f"{catalog}.{write_schema}.cohort_dataset_validated",
292-
spark_session=spark_session,
301+
"CATALOG.INST_NAME_silver.cohort_dataset_validated",
302+
spark_session=spark,
293303
)
294304
)
295305
df_cohort.shape
@@ -307,8 +317,11 @@
307317
# COMMAND ----------
308318

309319
# specific follow-ups, for example
320+
# df_course["academic_year"].value_counts(normalize=True, dropna=False)
321+
# df_course["academic_term"].value_counts(normalize=True, dropna=False)
310322
# df_course["grade"].value_counts(normalize=True, dropna=False)
311323
# df_course["delivery_method"].value_counts(normalize=True, dropna=False)
324+
# df_course["course_name"].value_counts(normalize=True, dropna=False).head(10)
312325

313326
# COMMAND ----------
314327

@@ -317,8 +330,8 @@
317330
# COMMAND ----------
318331

319332
# specific follow-ups, for example
320-
# df_course["cohort"].value_counts(normalize=True, dropna=False)
321-
# df_course["enrollment_type"].value_counts(normalize=True, dropna=False)
333+
# df_cohort["cohort"].value_counts(normalize=True, dropna=False)
334+
# df_cohort["enrollment_type"].value_counts(normalize=True, dropna=False)
322335

323336
# COMMAND ----------
324337

@@ -509,15 +522,19 @@
509522

510523
# COMMAND ----------
511524

525+
df_pre_cohort["enrollment_type"].value_counts()
526+
527+
# COMMAND ----------
528+
512529
# MAGIC %md
513530
# MAGIC ### filter invalid rows(?)
514531

515532
# COMMAND ----------
516533

517534
# this is probably a filter you'll want to apply
518535
# these courses are known to be an issue w/ PDP data
519-
df_course_valid = df_course.loc[df_course["course_number"].notna(), :]
520-
df_course_valid
536+
df_course_filtered = df_course.loc[df_course["course_number"].notna(), :]
537+
df_course_filtered.shape
521538

522539
# COMMAND ----------
523540

@@ -527,7 +544,7 @@
527544
# COMMAND ----------
528545

529546
# MAGIC %md
530-
# MAGIC **Note:** You'll probably want to use the "valid" dataframes for most of these plots, but not necessarily for all. For simplicity, all these example plots will just use the base data w/o extra data validation filtering applied. It's your call!
547+
# MAGIC **Note:** You'll probably want to use the filtered dataframes for most of these plots, but not necessarily for all. Sometimes comparing the two can be instructive. For simplicity, all these example plots will just use the base data w/o extra data validation filtering applied. It's your call!
531548

532549
# COMMAND ----------
533550

@@ -574,6 +591,7 @@
574591

575592
ax = sb.histplot(
576593
df_course.sort_values(by="academic_year"),
594+
# df_course_filtered.sort_values(by="academic_year"),
577595
y="academic_year",
578596
hue="academic_term",
579597
multiple="stack",
@@ -645,6 +663,7 @@
645663
ax = sb.histplot(
646664
pd.merge(
647665
df_course.groupby("student_guid")
666+
# df_course_filtered.groupby("student_guid")
648667
.size()
649668
.rename("num_courses_enrolled")
650669
.reset_index(drop=False),
@@ -667,6 +686,9 @@
667686
df_course.groupby("student_guid").agg(
668687
{"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"}
669688
),
689+
# df_course_filtered.groupby("student_guid").agg(
690+
# {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"}
691+
# ),
670692
x="number_of_credits_attempted",
671693
y="number_of_credits_earned",
672694
kind="hex",
@@ -764,12 +786,13 @@
764786
# COMMAND ----------
765787

766788
# MAGIC %md
767-
# MAGIC # Wrap-up
789+
# MAGIC # wrap-up
768790

769791
# COMMAND ----------
770792

771793
# MAGIC %md
772-
# MAGIC - [ ] Add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention)
773-
# MAGIC - ...
794+
# MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention)
795+
# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `datasets["labeled"].raw_course` and `datasets["labeled"].raw_cohort` blocks
796+
# MAGIC - [ ] Submit a PR including this notebook and any school-specific files added in order to run it
774797

775798
# COMMAND ----------

0 commit comments

Comments
 (0)