Skip to content

Commit 1a6f9c0

Browse files
authored
Add CSV identifier validation with comprehensive error reporting (#1437)
* Add CSV identifier validation with comprehensive error reporting
  - Validates 50 CSV files with identifier columns
  - Reports missing files and invalid identifiers together
  - Found 16 invalid identifiers across items, locations, and move_meta_categories
1 parent a0cb8d0 commit 1a6f9c0

1 file changed

Lines changed: 147 additions & 0 deletions

File tree

pokemon_v2/test_models.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import csv
2+
import os
3+
import re
4+
from django.conf import settings
15
from django.test import TestCase
26
from pokemon_v2.models import *
37

@@ -9,3 +13,146 @@ def setUp(self):
913
def fields_are_valid(self):
    """Look up the Ability named "Smell" and assert its generation_id is 3."""
    # NOTE(review): the name lacks the ``test`` prefix, so unittest's default
    # loader would presumably not collect this method -- confirm whether that
    # is intentional.
    ability = Ability.objects.get(name="Smell")
    self.assertEqual(ability.generation_id, 3)
16+
17+
18+
class CSVResourceNameValidationTestCase(TestCase):
    """
    Test that all resource identifiers in CSV files follow ASCII slug format.

    Resource identifiers are used in API URLs and should be URL-safe ASCII slugs
    (lowercase letters, numbers, and hyphens only).

    This test validates the data source (CSV files) before it's loaded into the database.
    """

    # Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only.
    # It is always applied with fullmatch(): with match(), "$" would also accept
    # an identifier that ends in a newline.
    VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")

    # Failures that mean "this file could not be read or parsed". Anything else
    # is a bug in the test itself and is allowed to propagate as a test error
    # instead of being reported as a data violation.
    _READ_ERRORS = (OSError, UnicodeError, csv.Error)

    def _is_valid_identifier(self, identifier):
        """Return True if *identifier* consists solely of a-z, 0-9 and hyphens."""
        return self.VALID_IDENTIFIER_PATTERN.fullmatch(identifier) is not None

    def _collect_file_violations(self, csv_path, filename, violations):
        """Append one entry to *violations* per invalid identifier in *csv_path*.

        Files without an ``identifier`` column (including completely empty
        files, for which ``DictReader.fieldnames`` is ``None``) are skipped.
        Rows whose identifier is missing or blank are skipped as well.

        Raises whatever ``open``/``csv`` raise; the caller decides how to
        report unreadable files. Violations found before such an error stay
        in *violations*.
        """
        # newline="" is what the csv module documents for files handed to a
        # reader, so quoted fields containing line breaks are parsed correctly.
        with open(csv_path, "r", encoding="utf-8", newline="") as csvfile:
            reader = csv.DictReader(csvfile)

            fieldnames = reader.fieldnames
            if not fieldnames or "identifier" not in fieldnames:
                return

            # Row 1 is the header, so data rows start at 2.
            for row_num, row in enumerate(reader, start=2):
                # A short row yields None for the missing columns (restval),
                # so fall back to "" before stripping.
                identifier = (row.get("identifier") or "").strip()

                # Skip empty identifiers
                if not identifier:
                    continue

                if not self._is_valid_identifier(identifier):
                    violations.append(
                        {
                            "file": filename,
                            "row": row_num,
                            "id": row.get("id", "N/A"),
                            "identifier": identifier,
                        }
                    )

    def test_all_csv_identifiers_are_ascii_slugs(self):
        """
        Validate that all resource identifiers in CSV files follow the ASCII slug format.

        Identifiers should only contain:
        - Lowercase letters (a-z)
        - Numbers (0-9)
        - Hyphens (-)

        This test will fail if any CSV contains identifiers with:
        - Unicode characters (ñ, ’, é, etc.)
        - Uppercase letters
        - Spaces
        - Special characters (&, (), ', etc.)

        Files that cannot be read or parsed are reported alongside the
        invalid identifiers in the same failure message.
        """
        violations = []
        csv_dir = os.path.join(settings.BASE_DIR, "data", "v2", "csv")

        for filename in sorted(os.listdir(csv_dir)):
            if not filename.endswith(".csv"):
                continue

            csv_path = os.path.join(csv_dir, filename)

            try:
                self._collect_file_violations(csv_path, filename, violations)
            except self._READ_ERRORS as e:
                violations.append(
                    {
                        "file": filename,
                        "row": "N/A",
                        "id": "N/A",
                        "identifier": f"Error reading file: {str(e)}",
                    }
                )

        # Report violations
        if not violations:
            return

        error_lines = [
            "\n\nFound {} resource(s) with invalid identifiers (not ASCII slugs):".format(
                len(violations)
            ),
            "\nIdentifiers must match pattern: ^[a-z0-9-]+$",
            "\nInvalid identifiers found in CSV files:",
        ]
        error_lines.extend(
            " - {file} (row {row}, id={id}): {identifier}".format(**v)
            for v in violations
        )
        error_lines.extend(
            [
                "\nThese identifiers contain invalid characters and must be normalized.",
                "Update the CSV files in data/v2/csv/ to fix these identifiers.",
                "\nSuggested fixes:",
                " - Remove Unicode apostrophes (’) and replace with regular hyphens or remove",
                " - Remove Unicode letters (ñ → n)",
                " - Remove parentheses and other special characters",
                " - Convert to lowercase",
            ]
        )

        self.fail("\n".join(error_lines))

    def test_identifier_pattern_examples(self):
        """Test that the validation pattern works correctly with example identifiers."""
        # Valid identifiers
        valid_identifiers = [
            "pikachu",
            "charizard-mega-x",
            "mr-mime",
            "ho-oh",
            "type-null",
            "item-123",
            "mega-stone",
        ]

        # subTest keeps going after a failure so every bad example is reported.
        for identifier in valid_identifiers:
            with self.subTest(identifier=identifier):
                self.assertTrue(
                    self._is_valid_identifier(identifier),
                    f"{identifier} should be valid but was rejected",
                )

        # Invalid identifiers
        invalid_identifiers = [
            "Pikachu",  # Uppercase
            "Mr. Mime",  # Space and period
            "kofu\u2019s-wallet",  # Unicode apostrophe (U+2019)
            "jalapeño",  # Unicode ñ
            "steel-bottle-(r)",  # Parentheses
            "b&w-grass-tablecloth",  # Ampersand
            "farfetch'd",  # Apostrophe
            "kofu's-wallet",  # Regular apostrophe
            "pikachu\n",  # Trailing newline ("$" alone would let this through)
        ]

        for identifier in invalid_identifiers:
            with self.subTest(identifier=identifier):
                self.assertFalse(
                    self._is_valid_identifier(identifier),
                    f"{identifier!r} should be invalid but was accepted",
                )

0 commit comments

Comments
 (0)