Skip to content

Commit 5720990

Browse files
committed
Merge branch 'main' into HEA-752/Dagster-GraphQL-API-is-intermittently-failing-with-a-ProtocolError-when-accessed-via-the-revproxy-Django-view
2 parents dffc109 + f819089 commit 5720990

10 files changed

Lines changed: 141 additions & 25 deletions

File tree

pipelines/assets/baseline.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,15 @@ def get_wealth_group_dataframe(
7878
# In the Summary columns in the Data, Data2, Data3 worksheets, the Wealth
7979
# Group Category is in Row 4 (District) rather than Row 3 (Wealth Group Category)
8080
# so do a second lookup to update the blank rows.
81-
# If this doesn't find any new values, then it's because in a WB worksheet
82-
# there are no extra Wealth Group Categories on Row 4
83-
try:
81+
# Note that in a WB worksheet there are no extra Wealth Group Categories on Row 4
82+
if worksheet_name != "WB":
8483
wealth_group_df = wealthgroupcategorylookup.do_lookup(
8584
wealth_group_df, "district", "wealth_group_category", update=True
8685
)
8786
# Remove the duplicate wealth_group_category_original column created by the second do_lookup(),
8887
# which otherwise causes problems when trying to merge dataframes, e.g. when building the wealth_group_df.
8988
wealth_group_df = wealth_group_df.loc[:, ~wealth_group_df.columns.duplicated()]
90-
except ValueError:
91-
pass
89+
9290
# Check if there are unrecognized wealth group categories and report
9391
wealth_group_missing_category_df = wealth_group_df[
9492
wealth_group_df["wealth_group_category"].isnull()
@@ -266,7 +264,7 @@ def baseline_instances(
266264
}
267265

268266
try:
269-
preview = json.dumps(result, indent=4)
267+
preview = json.dumps(result, indent=4, ensure_ascii=False)
270268
except TypeError as e:
271269
raise ValueError("Cannot serialize Community fixture to JSON. Failing dict is\n %s" % result) from e
272270

@@ -359,7 +357,7 @@ def community_instances(context: AssetExecutionContext, config: BSSMetadataConfi
359357
result = {"Community": community_df.to_dict(orient="records")}
360358

361359
try:
362-
preview = json.dumps(result, indent=4)
360+
preview = json.dumps(result, indent=4, ensure_ascii=False)
363361
except TypeError as e:
364362
raise ValueError("Cannot serialize Community fixture to JSON. Failing dict is\n %s" % result) from e
365363

pipelines/assets/fixtures.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ def validate_instances(
220220

221221
metadata = {f"num_{key.lower()}": len(value) for key, value in instances.items()}
222222
metadata["total_instances"] = sum(len(value) for value in instances.values())
223-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(instances, indent=4)}\n```")
223+
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(instances, indent=4, ensure_ascii=False)}\n```")
224224
return instances, metadata
225225

226226

@@ -287,7 +287,7 @@ def get_fixture_from_instances(instance_dict: dict[str, list[dict]]) -> tuple[li
287287
metadata[f'num_{str(model._meta).split(".")[-1]}'] += 1
288288

289289
metadata["total_instances"] = len(fixture)
290-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4)}\n```")
290+
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4, ensure_ascii=False)}\n```")
291291
return fixture, metadata
292292

293293

@@ -300,7 +300,7 @@ def import_fixture(fixture: list[dict]) -> dict:
300300
# We need to use a .verbose_json file extension for Django to use the correct serializer.
301301
with tempfile.NamedTemporaryFile(mode="w+", suffix=".verbose_json") as f:
302302
# Write the fixture to a temporary file so that Django can access it
303-
f.write(json.dumps(fixture))
303+
f.write(json.dumps(fixture, indent=4, ensure_ascii=False))
304304
f.seek(0)
305305
call_command(verbose_load_data.Command(), f.name, verbosity=2, format="verbose_json", stdout=output_buffer)
306306

@@ -309,7 +309,7 @@ def import_fixture(fixture: list[dict]) -> dict:
309309
for instance in fixture:
310310
metadata[f'num_{instance["model"].split(".")[-1]}'] += 1
311311
metadata["total_instances"] = len(fixture)
312-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4)}\n```")
312+
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(fixture, indent=4, ensure_ascii=False)}\n```")
313313
metadata["output"] = MetadataValue.md(f"```\n{output_buffer.getvalue()}\n```")
314314
return metadata
315315

pipelines/assets/livelihood_activity.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,7 +1061,7 @@ def get_instances_from_dataframe(
10611061
)
10621062
* 100
10631063
),
1064-
"preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4)}\n```"),
1064+
"preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```"),
10651065
}
10661066
if not unrecognized_labels.empty:
10671067
metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False))
@@ -1116,7 +1116,9 @@ def livelihood_activity_valid_instances(
11161116
valid_instances, metadata = validate_instances(context, livelihood_activity_instances, partition_key)
11171117
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
11181118
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
1119-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```")
1119+
metadata["preview"] = MetadataValue.md(
1120+
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
1121+
)
11201122
return Output(
11211123
valid_instances,
11221124
metadata=metadata,

pipelines/assets/other_cash_income.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,9 @@ def other_cash_income_valid_instances(
171171
valid_instances, metadata = validate_instances(context, other_cash_income_instances, partition_key)
172172
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
173173
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
174-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```")
174+
metadata["preview"] = MetadataValue.md(
175+
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
176+
)
175177
return Output(
176178
valid_instances,
177179
metadata=metadata,

pipelines/assets/wealth_characteristic.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ def wealth_characteristic_instances(
450450
)
451451
* 100
452452
),
453-
"preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4)}\n```"),
453+
"preview": MetadataValue.md(f"```json\n{json.dumps(result, indent=4, ensure_ascii=False)}\n```"),
454454
}
455455
if not unrecognized_labels.empty:
456456
metadata["unrecognized_labels"] = MetadataValue.md(unrecognized_labels.to_markdown(index=False))
@@ -473,7 +473,9 @@ def wealth_characteristic_valid_instances(
473473
valid_instances, metadata = validate_instances(context, wealth_characteristic_instances, partition_key)
474474
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
475475
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
476-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```")
476+
metadata["preview"] = MetadataValue.md(
477+
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
478+
)
477479
return Output(
478480
valid_instances,
479481
metadata=metadata,

pipelines/assets/wild_foods.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ def wild_foods_valid_instances(
181181
valid_instances, metadata = validate_instances(context, wild_foods_instances, partition_key)
182182
metadata = {f"num_{key.lower()}": len(value) for key, value in valid_instances.items()}
183183
metadata["total_instances"] = sum(len(value) for value in valid_instances.values())
184-
metadata["preview"] = MetadataValue.md(f"```json\n{json.dumps(valid_instances, indent=4)}\n```")
184+
metadata["preview"] = MetadataValue.md(
185+
f"```json\n{json.dumps(valid_instances, indent=4, ensure_ascii=False)}\n```"
186+
)
185187
return Output(
186188
valid_instances,
187189
metadata=metadata,

pipelines/resources.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):
7070
self.unlink(path)
7171

7272
with path.open("w") as file:
73-
file.write(json.dumps(obj, indent=4))
73+
file.write(json.dumps(obj, indent=4, ensure_ascii=False))
7474

7575
def load_from_path(self, context: InputContext, path: "UPath") -> Any:
7676
with path.open("r") as file:

pipelines/utils.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,18 +88,21 @@ def prepare_lookup(data: str | list[str] | pd.Series | pd.DataFrame) -> pd.Serie
8888
"""
8989
Prepare a Series or DataFrame for lookup operations by converting to lowercase strings and stripping whitespace.
9090
"""
91-
if isinstance(data, str):
92-
result = pd.DataFrame([data])
91+
if isinstance(data, pd.DataFrame):
92+
result = data
9393
elif isinstance(data, (list, pd.Series)):
9494
result = pd.DataFrame(data)
9595
else:
96-
result = data
96+
# Handle other types (like str, int, float)
97+
result = pd.DataFrame([data])
98+
9799
result = result.map(str).map(str.strip).map(str.lower).replace(r"\s+", " ", regex=True)
98-
if isinstance(data, str):
99-
result = result.iloc[0, 0]
100+
101+
if isinstance(data, pd.DataFrame):
102+
return result
100103
elif isinstance(data, (list, pd.Series)):
101-
result = result.iloc[:, 0]
102-
return result
104+
return result.iloc[:, 0]
105+
return result.iloc[0, 0]
103106

104107

105108
def verbose_pivot(df: pd.DataFrame, values: str | list[str], index: str | list[str], columns: str | list[str]):

pipelines_tests/test_utils/__init__.py

Whitespace-only changes.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import pandas as pd
2+
from django.test import TestCase
3+
from pipelines.utils import prepare_lookup
4+
5+
6+
class PrepareLookupTestCase(TestCase):
7+
8+
def test_prepare_lookup_with_premitive_type_input(self):
9+
# empty string
10+
result = prepare_lookup("")
11+
self.assertEqual(result, "")
12+
13+
# a simple string
14+
result = prepare_lookup("nbr Mois")
15+
self.assertEqual(result, "nbr mois")
16+
17+
# an int
18+
result = prepare_lookup(0)
19+
self.assertEqual(result, "0")
20+
21+
# a float
22+
result = prepare_lookup(7.55)
23+
self.assertEqual(result, "7.55")
24+
25+
# a simple string with spaces
26+
result = prepare_lookup(" nbr Mois ")
27+
self.assertEqual(result, "nbr mois")
28+
29+
# string with multiple internal spaces
30+
result = prepare_lookup("Autre revenu (ex. crédit)")
31+
self.assertEqual(result, "autre revenu (ex. crédit)")
32+
33+
def test_prepare_lookup_with_list_input(self):
34+
# list with single element
35+
result = prepare_lookup(["water"])
36+
self.assertIsInstance(result, pd.Series)
37+
self.assertEqual(result[0], "water")
38+
39+
# list with multiple elements
40+
result = prepare_lookup(["Water", "inputs", "Social serv."])
41+
self.assertIsInstance(result, pd.Series)
42+
pd.testing.assert_series_equal(result, pd.Series(["water", "inputs", "social serv."], name=0))
43+
44+
def test_prepare_lookup_with_series_input(self):
45+
# simple series
46+
data = pd.Series(["Camel number owned", "Cattle number owned"])
47+
result = prepare_lookup(data)
48+
self.assertIsInstance(result, pd.Series)
49+
pd.testing.assert_series_equal(result, pd.Series(["camel number owned", "cattle number owned"], name=0))
50+
51+
# test with irregular spaces in elements
52+
data = pd.Series(["Camel number owned ", " cattle number Owned"])
53+
result = prepare_lookup(data)
54+
pd.testing.assert_series_equal(result, pd.Series(["camel number owned", "cattle number owned"], name=0))
55+
56+
# test with numeric elements
57+
data = pd.Series([123, 456])
58+
result = prepare_lookup(data)
59+
pd.testing.assert_series_equal(result, pd.Series(["123", "456"], name=0))
60+
61+
def test_prepare_lookup_with_dataframe_input(self):
62+
# single column dataframe
63+
data = pd.DataFrame({"lables": ["Livestock products"]})
64+
result = prepare_lookup(data)
65+
self.assertIsInstance(result, pd.DataFrame)
66+
pd.testing.assert_frame_equal(result, pd.DataFrame({"lables": ["livestock products"]}))
67+
68+
# multiple columns dataframe
69+
data = pd.DataFrame({"lables": ["Livestock products"], "another": ["Payment in kind "]})
70+
result = prepare_lookup(data)
71+
assert isinstance(result, pd.DataFrame)
72+
expected = pd.DataFrame({"lables": ["livestock products"], "another": ["payment in kind"]})
73+
pd.testing.assert_frame_equal(result, expected)
74+
75+
# numeric values
76+
data = pd.DataFrame({"column1": [123, 456], "column2": [78.9, 1011.12]})
77+
result = prepare_lookup(data)
78+
expected = pd.DataFrame({"column1": ["123", "456"], "column2": ["78.9", "1011.12"]})
79+
pd.testing.assert_frame_equal(result, expected)
80+
81+
# empty df
82+
data = pd.DataFrame()
83+
result = prepare_lookup(data)
84+
self.assertIsInstance(result, pd.DataFrame)
85+
self.assertTrue(result.empty)
86+
87+
# test that dataframe preserves structure
88+
data = pd.DataFrame(
89+
{
90+
"label": ["Cowpeas: kg produced", "Sorghum: kg produced"],
91+
"product": ["Cowpeas", "Sorghum"],
92+
"unit": ["kg", "kg"],
93+
}
94+
)
95+
result = prepare_lookup(data)
96+
self.assertEqual(result.shape, data.shape)
97+
self.assertEqual(list(result.columns), list(data.columns))
98+
99+
def test_prepare_lookup_with_special_characters(self):
100+
result = prepare_lookup("Autre nourriture: Poisson 2(sec)!@#$%")
101+
self.assertEqual(result, "autre nourriture: poisson 2(sec)!@#$%")
102+
# with tabs
103+
result = prepare_lookup("Autre nourriture: \tPoisson")
104+
self.assertEqual(result, "autre nourriture: poisson")
105+
# some unicode characters
106+
result = prepare_lookup("Revenu (Espèces)")
107+
self.assertEqual(result, "revenu (espèces)")

0 commit comments

Comments
 (0)