Skip to content

Commit a47d2e3

Browse files
committed
Add CSV identifier validation with comprehensive error reporting
- Validates 50 CSV files with identifier columns
- Reports missing files and invalid identifiers together
- Found 16 invalid identifiers across items, locations, and move_meta_categories
1 parent 5e1c604 commit a47d2e3

1 file changed

Lines changed: 242 additions & 0 deletions

File tree

pokemon_v2/test_models.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import csv
2+
import os
3+
import re
14
from django.test import TestCase
25
from pokemon_v2.models import *
36

@@ -9,3 +12,242 @@ def setUp(self):
912
def fields_are_valid(self):
    """Check that the Ability named "Smell" reports generation_id 3."""
    # NOTE(review): this method name does not start with "test", so default
    # unittest/Django test discovery will presumably not run it on its own;
    # confirm whether it is invoked from elsewhere or should be renamed.
    smell = Ability.objects.get(name="Smell")
    self.assertEqual(smell.generation_id, 3)
15+
16+
17+
class CSVResourceNameValidationTestCase(TestCase):
    """
    Test that all resource identifiers in CSV files follow ASCII slug format.

    Resource identifiers are used in API URLs and should be URL-safe ASCII slugs
    (lowercase letters, numbers, and hyphens only).

    This test validates the data source (CSV files) before it's loaded into the database.
    """

    # Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only.
    # Always apply it with ``fullmatch``: with ``match`` the trailing ``$`` would also
    # accept an identifier that ends in a newline.
    VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")

    # CSV files that contain an 'identifier' column to validate
    # Format: (filename, identifier_column_name)
    CSV_FILES_TO_VALIDATE = [
        ("abilities.csv", "identifier"),
        ("berry_firmness.csv", "identifier"),
        ("conquest_episodes.csv", "identifier"),
        ("conquest_kingdoms.csv", "identifier"),
        ("conquest_move_displacements.csv", "identifier"),
        ("conquest_move_ranges.csv", "identifier"),
        ("conquest_stats.csv", "identifier"),
        ("conquest_warrior_archetypes.csv", "identifier"),
        ("conquest_warrior_skills.csv", "identifier"),
        ("conquest_warrior_stats.csv", "identifier"),
        ("conquest_warriors.csv", "identifier"),
        ("contest_types.csv", "identifier"),
        ("egg_groups.csv", "identifier"),
        ("encounter_conditions.csv", "identifier"),
        ("encounter_condition_values.csv", "identifier"),
        ("encounter_methods.csv", "identifier"),
        ("evolution_triggers.csv", "identifier"),
        ("genders.csv", "identifier"),
        ("generations.csv", "identifier"),
        ("growth_rates.csv", "identifier"),
        ("items.csv", "identifier"),
        ("item_categories.csv", "identifier"),
        ("item_flags.csv", "identifier"),
        ("item_fling_effects.csv", "identifier"),
        ("item_pockets.csv", "identifier"),
        ("languages.csv", "identifier"),
        ("locations.csv", "identifier"),
        ("location_areas.csv", "identifier"),
        ("moves.csv", "identifier"),
        ("move_battle_styles.csv", "identifier"),
        ("move_damage_classes.csv", "identifier"),
        ("move_flags.csv", "identifier"),
        ("move_meta_ailments.csv", "identifier"),
        ("move_meta_categories.csv", "identifier"),
        ("move_targets.csv", "identifier"),
        ("natures.csv", "identifier"),
        ("pal_park_areas.csv", "identifier"),
        ("pokeathlon_stats.csv", "identifier"),
        ("pokedexes.csv", "identifier"),
        ("pokemon.csv", "identifier"),
        ("pokemon_colors.csv", "identifier"),
        ("pokemon_forms.csv", "identifier"),
        ("pokemon_habitats.csv", "identifier"),
        ("pokemon_move_methods.csv", "identifier"),
        ("pokemon_shapes.csv", "identifier"),
        ("pokemon_species.csv", "identifier"),
        ("regions.csv", "identifier"),
        ("stats.csv", "identifier"),
        ("types.csv", "identifier"),
        ("versions.csv", "identifier"),
        ("version_groups.csv", "identifier"),
    ]

    def get_csv_path(self, filename):
        """Get the absolute path to a CSV file in data/v2/csv/"""
        # Imported lazily so that merely importing this module does not
        # require Django settings to be configured.
        from django.conf import settings

        base_dir = settings.BASE_DIR
        return os.path.join(base_dir, "data", "v2", "csv", filename)

    @staticmethod
    def _file_problem(filename, message):
        """Build a violation entry describing a problem with a whole file."""
        return {
            "file": filename,
            "row": "N/A",
            "id": "N/A",
            "identifier": message,
            "identifier_repr": "N/A",
        }

    def _find_violations(self, csv_path, filename, identifier_column):
        """
        Scan one CSV file and return a list of violation dicts.

        Each dict has the keys ``file``, ``row``, ``id``, ``identifier`` and
        ``identifier_repr``. File-level problems (missing column, unreadable
        file) are reported as a single entry with ``row``/``id`` set to "N/A".
        Violations found before a read error are kept.
        """
        found = []
        try:
            # newline="" lets the csv module handle line endings itself,
            # as recommended by the csv documentation.
            with open(csv_path, "r", encoding="utf-8", newline="") as csvfile:
                reader = csv.DictReader(csvfile)

                # fieldnames is None for a completely empty file.
                if identifier_column not in (reader.fieldnames or ()):
                    found.append(
                        self._file_problem(
                            filename, f"Column '{identifier_column}' not found"
                        )
                    )
                    return found

                for row in reader:
                    # DictReader fills missing trailing fields with None, so
                    # a short row must not be passed straight to .strip().
                    identifier = (row.get(identifier_column) or "").strip()

                    # Skip empty identifiers
                    if not identifier:
                        continue

                    if not self.VALID_IDENTIFIER_PATTERN.fullmatch(identifier):
                        found.append(
                            {
                                "file": filename,
                                # line_num is the physical line just read, so it
                                # stays correct after blank lines or multi-line
                                # quoted fields.
                                "row": reader.line_num,
                                "id": row.get("id", "N/A"),
                                "identifier": identifier,
                                # repr() shows unicode chars clearly
                                "identifier_repr": repr(identifier),
                            }
                        )
        except (OSError, UnicodeError, csv.Error) as e:
            # Only genuine read/parse failures are reported as data problems;
            # programming errors propagate and fail the test loudly.
            found.append(self._file_problem(filename, f"Error reading file: {e}"))

        return found

    def _format_failure_message(self, missing_files, violations):
        """Render missing files and violations into one multi-line failure message."""
        error_lines = []

        # Report missing files first
        if missing_files:
            error_lines.append("\n\nMissing CSV files:")
            error_lines.extend(f" - {filename}" for filename in missing_files)
            error_lines.append(
                "\nAll CSV files listed in CSV_FILES_TO_VALIDATE must exist."
            )

        # Report violations
        if violations:
            error_lines.append(
                "\n\nFound {} resource(s) with invalid identifiers (not ASCII slugs):".format(
                    len(violations)
                )
            )
            # Take the pattern text from the compiled regex so the message
            # cannot drift from what is actually enforced.
            error_lines.append(
                f"\nIdentifiers must match pattern: {self.VALID_IDENTIFIER_PATTERN.pattern}"
            )
            error_lines.append("\nInvalid identifiers found in CSV files:")
            error_lines.extend(
                " - {file} (row {row}, id={id}): {identifier} {identifier_repr}".format(
                    **v
                )
                for v in violations
            )
            error_lines.append(
                "\nThese identifiers contain invalid characters and must be normalized."
            )
            error_lines.append(
                "Update the CSV files in data/v2/csv/ to fix these identifiers."
            )
            error_lines.append("\nSuggested fixes:")
            error_lines.append(
                " - Remove Unicode apostrophes (\u2019) and replace with regular hyphens or remove"
            )
            error_lines.append(" - Remove Unicode letters (ñ → n)")
            error_lines.append(" - Remove parentheses and other special characters")
            error_lines.append(" - Convert to lowercase")

        return "\n".join(error_lines)

    def test_all_csv_identifiers_are_ascii_slugs(self):
        """
        Validate that all resource identifiers in CSV files follow the ASCII slug format.

        Identifiers should only contain:
        - Lowercase letters (a-z)
        - Numbers (0-9)
        - Hyphens (-)

        This test will fail if any CSV contains identifiers with:
        - Unicode characters (ñ, ’ (U+2019), é, etc.)
        - Uppercase letters
        - Spaces
        - Special characters (&, (), ', etc.)

        Missing files and invalid identifiers are collected across all files
        and reported together in a single failure.
        """
        violations = []
        missing_files = []

        for filename, identifier_column in self.CSV_FILES_TO_VALIDATE:
            csv_path = self.get_csv_path(filename)

            # Track missing files to report at the end
            if not os.path.exists(csv_path):
                missing_files.append(filename)
                continue

            violations.extend(
                self._find_violations(csv_path, filename, identifier_column)
            )

        if violations or missing_files:
            self.fail(self._format_failure_message(missing_files, violations))

    def test_identifier_pattern_examples(self):
        """Test that the validation pattern works correctly with example identifiers."""
        valid_identifiers = [
            "pikachu",
            "charizard-mega-x",
            "mr-mime",
            "ho-oh",
            "type-null",
            "item-123",
            "mega-stone",
        ]

        for identifier in valid_identifiers:
            with self.subTest(identifier=identifier):
                self.assertIsNotNone(
                    self.VALID_IDENTIFIER_PATTERN.fullmatch(identifier),
                    f"{identifier!r} should be valid but was rejected",
                )

        invalid_identifiers = [
            "Pikachu",  # Uppercase
            "Mr. Mime",  # Space and period
            "kofu\u2019s-wallet",  # Unicode apostrophe (U+2019)
            "jalapeño",  # Unicode ñ
            "steel-bottle-(r)",  # Parentheses
            "b&w-grass-tablecloth",  # Ampersand
            "farfetch'd",  # Apostrophe
            "kofu's-wallet",  # Regular apostrophe
            "pikachu\n",  # Trailing newline
            "",  # Empty string
        ]

        for identifier in invalid_identifiers:
            with self.subTest(identifier=identifier):
                self.assertIsNone(
                    self.VALID_IDENTIFIER_PATTERN.fullmatch(identifier),
                    f"{identifier!r} should be invalid but was accepted",
                )

0 commit comments

Comments
 (0)