Skip to content

Commit ca68c32

Browse files
authored
Merge pull request #112 from datakind/Validation-Errors
Map legacy schema names to new equivalent during validation config check
2 parents 86f925e + 3c7e9d4 commit ca68c32

9 files changed

Lines changed: 63 additions & 474 deletions

File tree

src/webapp/routers/data.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -855,16 +855,12 @@ def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]
855855
inferred.add("COURSE")
856856
if "student" in name:
857857
inferred.add("STUDENT")
858-
if institution_id == "pdp":
859-
inferred.add("SEMESTER")
860858
if "semester" in name:
861859
inferred.add("SEMESTER")
862860
if "cohort" in name:
863861
inferred.add("STUDENT")
864-
inferred.add("SEMESTER")
865862
if "course" not in name and ("ar" in name or "deidentified" in name):
866863
inferred.add("STUDENT")
867-
inferred.add("SEMESTER")
868864

869865
if not inferred:
870866
logging.error(
@@ -938,7 +934,7 @@ def validation_helper(
938934
uploader=str_to_uuid(current_user.user_id),
939935
source=source_str,
940936
sst_generated=False,
941-
schemas=list(inferred_schemas),
937+
schemas=list(allowed_schemas),
942938
valid=True,
943939
)
944940
local_session.get().add(new_file_record)
@@ -960,7 +956,7 @@ def validation_helper(
960956
return {
961957
"name": file_name,
962958
"inst_id": inst_id,
963-
"file_types": list(inferred_schemas),
959+
"file_types": list(allowed_schemas),
964960
"source": source_str,
965961
"status": db_status,
966962
}

src/webapp/routers/data_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def session_fixture():
135135
updated_at=DATETIME_TESTING,
136136
sst_generated=True,
137137
valid=True,
138-
schemas=[SchemaType.PDP_COHORT],
138+
schemas=[SchemaType.STUDENT],
139139
)
140140
file_4 = FileTable(
141141
id=SAMPLE_UUID,
@@ -145,7 +145,7 @@ def session_fixture():
145145
updated_at=DATETIME_TESTING,
146146
sst_generated=True,
147147
valid=True,
148-
schemas=[SchemaType.PDP_COHORT],
148+
schemas=[SchemaType.STUDENT],
149149
)
150150
try:
151151
with sqlalchemy.orm.Session(engine) as session:
@@ -168,7 +168,7 @@ def session_fixture():
168168
updated_at=DATETIME_TESTING,
169169
sst_generated=False,
170170
valid=False,
171-
schemas=[SchemaType.PDP_COURSE],
171+
schemas=[SchemaType.COURSE],
172172
),
173173
file_3,
174174
file_4,

src/webapp/routers/models.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
get_external_bucket_name,
2121
SchemaType,
2222
decode_url_piece,
23+
LEGACY_TO_NEW_SCHEMA,
2324
)
2425
from ..database import (
2526
get_session,
@@ -517,13 +518,22 @@ def trigger_inference_run(
517518
detail="Unexpected number of batches found: Expected 1, got "
518519
+ str(len(inst_result)),
519520
)
521+
inst_file_schemas = [x.schemas for x in batch_result[0][0].files]
522+
schema_configs = jsonpickle.decode(query_result[0][0].schema_configs)
523+
524+
for config_group in schema_configs:
525+
for config in config_group:
526+
config.schema_type = LEGACY_TO_NEW_SCHEMA.get(
527+
config.schema_type, config.schema_type
528+
)
529+
520530
if not check_file_types_valid_schema_configs(
521-
[x.schemas for x in batch_result[0][0].files],
522-
jsonpickle.decode(query_result[0][0].schema_configs),
531+
inst_file_schemas,
532+
schema_configs,
523533
):
524534
raise HTTPException(
525535
status_code=status.HTTP_400_BAD_REQUEST,
526-
detail="The files in this batch don't conform to the schema configs allowed by this model.",
536+
detail=f"The files in this batch don't conform to the schema configs allowed by this model. For debugging reference - file_schema={inst_file_schemas} and model_schema={schema_configs}",
527537
)
528538
# Note to Datakind: In the long-term, this is where you would have a case block or something that would call different types of pipelines.
529539
db_req = DatabricksInferenceRunRequest(

src/webapp/routers/models_test.py

Lines changed: 23 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def session_fixture():
118118
updated_at=DATETIME_TESTING,
119119
sst_generated=False,
120120
valid=True,
121-
schemas=[SchemaType.PDP_COURSE],
121+
schemas=[SchemaType.COURSE],
122122
)
123123
file_3 = FileTable(
124124
id=FILE_UUID_3,
@@ -129,7 +129,7 @@ def session_fixture():
129129
updated_at=DATETIME_TESTING,
130130
sst_generated=True,
131131
valid=True,
132-
schemas=[SchemaType.PDP_COHORT],
132+
schemas=[SchemaType.STUDENT],
133133
)
134134
model_1 = ModelTable(
135135
id=SAMPLE_UUID,
@@ -139,20 +139,15 @@ def session_fixture():
139139
[
140140
[
141141
SchemaConfigObj(
142-
schema_type=SchemaType.PDP_COURSE,
142+
schema_type=SchemaType.COURSE,
143143
optional=False,
144144
multiple_allowed=False,
145145
),
146146
SchemaConfigObj(
147-
schema_type=SchemaType.PDP_COHORT,
147+
schema_type=SchemaType.STUDENT,
148148
optional=False,
149149
multiple_allowed=False,
150150
),
151-
SchemaConfigObj(
152-
schema_type=SchemaType.SST_PDP_FINANCE,
153-
optional=True,
154-
multiple_allowed=False,
155-
),
156151
]
157152
]
158153
),
@@ -189,7 +184,7 @@ def session_fixture():
189184
updated_at=DATETIME_TESTING,
190185
sst_generated=False,
191186
valid=False,
192-
schemas=[SchemaType.PDP_COURSE],
187+
schemas=[SchemaType.COURSE],
193188
),
194189
file_3,
195190
model_1,
@@ -329,23 +324,18 @@ def test_read_inst_model_output(client: TestClient):
329324
def test_create_model(client: TestClient):
330325
"""Depending on timeline, fellows may not get to this."""
331326
schema_config_1 = {
332-
"schema_type": SchemaType.PDP_COURSE,
327+
"schema_type": SchemaType.COURSE,
333328
"count": 1,
334329
}
335330
schema_config_2 = {
336-
"schema_type": SchemaType.PDP_COHORT,
331+
"schema_type": SchemaType.STUDENT,
337332
"count": 1,
338333
}
339-
schema_config_3 = {
340-
"schema_type": SchemaType.SST_PDP_FINANCE,
341-
"count": 1,
342-
"optional": True,
343-
}
344334
response = client.post(
345335
"/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/models/",
346336
json={
347337
"name": "my_model",
348-
"schema_configs": [[schema_config_1, schema_config_2, schema_config_3]],
338+
"schema_configs": [[schema_config_1, schema_config_2]],
349339
},
350340
)
351341

@@ -368,9 +358,8 @@ def test_trigger_inference_run(client: TestClient):
368358
)
369359

370360
assert response.status_code == 400
371-
assert (
372-
response.text
373-
== '{"detail":"The files in this batch don\'t conform to the schema configs allowed by this model."}'
361+
assert response.json()["detail"].startswith(
362+
"The files in this batch don't conform to the schema configs allowed by this model."
374363
)
375364

376365
response = client.post(
@@ -395,56 +384,45 @@ def test_trigger_inference_run(client: TestClient):
395384
def test_check_file_types_valid_schema_configs():
396385
"""Test batch schema validation logic."""
397386
file_types1 = [
398-
[SchemaType.PDP_COURSE],
399-
[SchemaType.PDP_COHORT],
387+
[SchemaType.COURSE],
388+
[SchemaType.STUDENT],
400389
[SchemaType.UNKNOWN],
401390
]
402391
file_types2 = [
403-
[SchemaType.SST_PDP_COHORT],
404-
[SchemaType.SST_PDP_COURSE],
405-
[SchemaType.SST_PDP_FINANCE],
392+
[SchemaType.STUDENT],
393+
[SchemaType.COURSE],
406394
]
407395
file_types3 = [
408-
[SchemaType.SST_PDP_COHORT, SchemaType.UNKNOWN],
409-
[SchemaType.SST_PDP_COURSE],
396+
[SchemaType.STUDENT, SchemaType.UNKNOWN],
397+
[SchemaType.COURSE],
410398
]
411399
file_types4 = [
412-
[SchemaType.SST_PDP_COHORT, SchemaType.UNKNOWN],
400+
[SchemaType.STUDENT, SchemaType.UNKNOWN],
413401
[SchemaType.UNKNOWN],
414402
]
415403
pdp_configs = [
416404
SchemaConfigObj(
417-
schema_type=SchemaType.PDP_COURSE,
405+
schema_type=SchemaType.COURSE,
418406
optional=False,
419407
multiple_allowed=False,
420408
),
421409
SchemaConfigObj(
422-
schema_type=SchemaType.PDP_COHORT,
410+
schema_type=SchemaType.STUDENT,
423411
optional=False,
424412
multiple_allowed=False,
425413
),
426-
SchemaConfigObj(
427-
schema_type=SchemaType.SST_PDP_FINANCE,
428-
optional=True,
429-
multiple_allowed=False,
430-
),
431414
]
432415
sst_configs = [
433416
SchemaConfigObj(
434-
schema_type=SchemaType.SST_PDP_COHORT,
417+
schema_type=SchemaType.STUDENT,
435418
optional=False,
436419
multiple_allowed=False,
437420
),
438421
SchemaConfigObj(
439-
schema_type=SchemaType.SST_PDP_COURSE,
422+
schema_type=SchemaType.COURSE,
440423
optional=False,
441424
multiple_allowed=False,
442425
),
443-
SchemaConfigObj(
444-
schema_type=SchemaType.SST_PDP_FINANCE,
445-
optional=True,
446-
multiple_allowed=False,
447-
),
448426
]
449427
custom = [
450428
SchemaConfigObj(
@@ -463,10 +441,10 @@ def test_check_file_types_valid_schema_configs():
463441
assert not check_file_types_valid_schema_configs(file_types1, [custom])
464442
assert not check_file_types_valid_schema_configs(file_types1, schema_configs1)
465443
assert check_file_types_valid_schema_configs(file_types2, [sst_configs])
466-
assert not check_file_types_valid_schema_configs(file_types2, [pdp_configs])
444+
assert check_file_types_valid_schema_configs(file_types2, [pdp_configs])
467445
assert not check_file_types_valid_schema_configs(file_types2, [custom])
468446
assert check_file_types_valid_schema_configs(file_types3, [sst_configs])
469-
assert not check_file_types_valid_schema_configs(file_types3, [pdp_configs])
447+
assert check_file_types_valid_schema_configs(file_types3, [pdp_configs])
470448
assert not check_file_types_valid_schema_configs(file_types3, [custom])
471449
assert not check_file_types_valid_schema_configs(file_types4, [sst_configs])
472450
assert not check_file_types_valid_schema_configs(file_types4, [pdp_configs])

src/webapp/utilities.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -125,25 +125,25 @@ class SchemaType(StrEnum):
125125

126126
# If an institution uses UNKNOWN as an allowed schema, it means bypass the validation check.
127127
UNKNOWN = "UNKNOWN"
128-
# The standard PDP ARF file schemas
129-
PDP_COHORT = "PDP_COHORT"
130-
PDP_COURSE = "PDP_COURSE"
131-
# The PDP aligned SST schemas
132-
SST_PDP_COHORT = "SST_PDP_COHORT"
133-
SST_PDP_COURSE = "SST_PDP_COURSE"
134-
SST_PDP_FINANCE = "SST_PDP_FINANCE"
128+
STUDENT = "STUDENT"
129+
SEMESTER = "SEMESTER"
130+
COURSE = "COURSE"
135131

136132
# Schema Types of output files
137133
SST_OUTPUT = "SST_OUTPUT"
138134
PNG = "PNG"
139135

140136

137+
LEGACY_TO_NEW_SCHEMA = {
138+
"PDP_COHORT": "STUDENT",
139+
"SST_PDP_COHORT": "STUDENT",
140+
"PDP_COURSE": "COURSE",
141+
"SST_PDP_COURSE": "COURSE",
142+
}
143+
141144
PDP_SCHEMA_GROUP: Final = {
142-
SchemaType.PDP_COHORT,
143-
SchemaType.PDP_COURSE,
144-
SchemaType.SST_PDP_FINANCE,
145-
SchemaType.SST_PDP_COHORT,
146-
SchemaType.SST_PDP_COURSE,
145+
SchemaType.STUDENT,
146+
SchemaType.COURSE,
147147
}
148148

149149

src/webapp/validation.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ def validate_dataset(
269269
print("Optional column validation errors on: ", opt_failures)
270270
return {
271271
"validation_status": "passed_with_soft_errors",
272+
"schemas": model_list,
272273
"missing_optional": missing_optional,
273274
"optional_validation_failures": opt_failures,
274275
"failure_cases": err.failure_cases.to_dict(orient="records"),

src/webapp/validation_schemas/base_schema.json

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,7 @@
3737
"nullable": true,
3838
"required": false,
3939
"aliases": ["first_generation_college_student", "first_gen", "first_gen_flag", "firstgenflag"],
40-
"checks": [
41-
{"type": "isin", "args": [["Yes","No","Other","Unknown", "Y", "N", "0", "U"]]}
42-
]
40+
"checks": []
4341
},
4442
"age": {
4543
"dtype": "string",
@@ -55,9 +53,7 @@
5553
"nullable": true,
5654
"required": false,
5755
"aliases": ["gender_identification", "sex"],
58-
"checks": [
59-
{"type": "isin", "args": [["Male","Female", "M", "F", "Other","Unknown", "N", "Not specified"]]}
60-
]
56+
"checks": []
6157
},
6258
"race": {
6359
"dtype": "string",
@@ -254,9 +250,7 @@
254250
"nullable": true,
255251
"required": false,
256252
"aliases": ["pell_status_first_year"],
257-
"checks": [
258-
{"type": "isin", "args": [["true", "false", "yes", "no", "Y", "N", "UK"]]}
259-
]
253+
"checks": []
260254
}
261255
}
262256
},
@@ -463,9 +457,7 @@
463457
"nullable": true,
464458
"required": false,
465459
"aliases": [],
466-
"checks": [
467-
{"type": "isin", "args": [["lecture", "lab", "seminar", "other", "schedules"]]}
468-
]
460+
"checks": []
469461
},
470462
"course_type": {
471463
"dtype": "string",
@@ -481,19 +473,15 @@
481473
"nullable": true,
482474
"required": false,
483475
"aliases": [],
484-
"checks": [
485-
{"type": "isin", "args": [["y", "n", "yes", "no", "true", "false", "unknown", "other"]]}
486-
]
476+
"checks": []
487477
},
488478
"pass_fail_flag": {
489479
"dtype": "string",
490480
"coerce": true,
491481
"nullable": true,
492482
"required": false,
493483
"aliases": ["pass_fail", "pass/fail_flag"],
494-
"checks": [
495-
{"type": "isin", "args": [["y", "n", "Y", "N", "yes", "no", "true", "false", "unknown", "other"]]}
496-
]
484+
"checks": []
497485
},
498486
"grade": {
499487
"dtype": "string",
@@ -509,9 +497,7 @@
509497
"nullable": true,
510498
"required": false,
511499
"aliases": [],
512-
"checks": [
513-
{"type": "isin", "args": [["Y", "N", "yes", "no", "true", "false", "unknown", "other"]]}
514-
]
500+
"checks": []
515501
},
516502
"course_credits": {
517503
"dtype": "float64",

0 commit comments

Comments
 (0)