Skip to content

Commit e7bdac8

Browse files
committed
feat: Reinstate support for Unicode component_codes and container_codes
While upgrading platform from core 0.39.2 to core 0.43.0, we realized that we've quietly supported non-ascii component and container codes in Content Libraries V2 since their release. We do not advertise our support for non-ascii codes within Libraries (and we don't want to, yet [2]), but in order to avoid breaking sites that stumbled upon non-ascii codes, we need to loosen the validation & db constraints which were added to openedx-core. On a code level, we simply add a `unicode=True/False` parameter to the `code_field()`. It's set to True for components and containers, and left as False for collections for now, since we know of no way in which a user could make a non-ascii collection code (other than by editing a ZIP archive). There are some nuances around the migrations. See the comments for details. [1] openedx/openedx-platform#38402 (comment) [2] openedx/openedx-platform#38413 Bumps version to 0.44.0
1 parent d0dc9cd commit e7bdac8

11 files changed

Lines changed: 160 additions & 21 deletions

File tree

src/openedx_content/applets/collections/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ class Collection(models.Model):
109109
# collection's opaque key:
110110
# e.g. "lib-collection:{org_code}:{library_code}:{collection_code}"
111111
# is the opaque key for a library collection.
112-
collection_code = code_field()
112+
# TODO: Consider supporting unicode https://github.com/openedx/openedx-platform/issues/38413
113+
collection_code = code_field(unicode=False)
113114

114115
title = case_insensitive_char_field(
115116
null=False,
@@ -179,7 +180,7 @@ class Meta:
179180
],
180181
name="oel_coll_uniq_lp_key",
181182
),
182-
code_field_check("collection_code", name="oel_coll_collection_code_regex"),
183+
code_field_check("collection_code", name="oel_coll_collection_code_regex", unicode=False),
183184
]
184185
indexes = [
185186
models.Index(

src/openedx_content/applets/components/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def pk(self):
180180
# component_code is an identifier that is local to the learning_package and
181181
# component_type. The publishable.entity_ref is derived from component_type
182182
# and component_code.
183-
component_code = code_field()
183+
component_code = code_field(unicode=True)
184184

185185
class Meta:
186186
constraints = [
@@ -198,7 +198,7 @@ class Meta:
198198
],
199199
name="oel_component_uniq_lc_ct_lk",
200200
),
201-
code_field_check("component_code", name="oel_component_code_regex"),
201+
code_field_check("component_code", name="oel_component_code_regex", unicode=True),
202202
]
203203
indexes = [
204204
# Global Component-Type/Component-Code Index:

src/openedx_content/applets/containers/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ class Container(PublishableEntityMixin):
189189
# container_code is an identifier that is local to the learning_package.
190190
# Unlike component_code, it is unique across all container types within
191191
# the same LearningPackage.
192-
container_code = code_field()
192+
container_code = code_field(unicode=True)
193193

194194
@property
195195
def id(self) -> ID:
@@ -212,7 +212,7 @@ class Meta:
212212
fields=["learning_package", "container_code"],
213213
name="oel_container_uniq_lp_cc",
214214
),
215-
code_field_check("container_code", name="oel_container_code_regex"),
215+
code_field_check("container_code", name="oel_container_code_regex", unicode=True),
216216
]
217217

218218
@classmethod

src/openedx_content/migrations/0009_rename_component_local_key_to_component_code.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,14 @@ class Migration(migrations.Migration):
5959
migrations.AddConstraint(
6060
model_name='component',
6161
constraint=models.CheckConstraint(
62-
condition=django.db.models.lookups.Regex(models.F('component_code'), '^[a-zA-Z0-9_.-]+\\Z'),
62+
# The original version of this migration had an ascii-only regex constraint,
63+
# matching the django-level RegexValidator defined above. However,
64+
# that constraint caused an IntegrityError on some dev sites with non-ascii component
65+
# codes in libraries, which technically we allow. So, we've loosened this constraint
66+
# just to ensure that the migration applies cleanly. Migration 0013 will re-create
67+
# the constraint and validator to be unicode-friendly, regardless of whether 0009
68+
# was applied with the ascii-only or unicode-friendly constraint.
69+
condition=django.db.models.lookups.Regex(models.F('component_code'), '^[\\w.-]+\\Z'),
6370
name='oel_component_code_regex',
6471
violation_error_message='Enter a valid "code name" consisting of letters, numbers, underscores, hyphens, or periods.',
6572
),

src/openedx_content/migrations/0010_add_container_code.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,14 @@ class Migration(migrations.Migration):
8989
migrations.AddConstraint(
9090
model_name='container',
9191
constraint=models.CheckConstraint(
92-
condition=django.db.models.lookups.Regex(models.F('container_code'), '^[a-zA-Z0-9_.-]+\\Z'),
92+
# The original version of this migration had an ascii-only regex constraint,
93+
# matching the django-level RegexValidator defined above. However,
94+
# that constraint caused an IntegrityError on some dev sites with non-ascii container
95+
# codes in libraries, which technically we allow. So, we've loosened this constraint
96+
# just to ensure that the migration applies cleanly. Migration 0013 will re-create
97+
# the constraint and validator to be unicode-friendly, regardless of whether 0010
98+
# was applied with the ascii-only or unicode-friendly constraint.
99+
condition=django.db.models.lookups.Regex(models.F('container_code'), '^[\\w.-]+\\Z'),
93100
name='oel_container_code_regex',
94101
violation_error_message='Enter a valid "code name" consisting of letters, numbers, underscores, hyphens, or periods.',
95102
),
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Generated by Django 5.2.13 on 2026-04-23 14:15
2+
3+
import re
4+
5+
import django.core.validators
6+
import django.db.models.lookups
7+
from django.conf import settings
8+
from django.db import migrations, models
9+
10+
import openedx_django_lib.fields
11+
12+
13+
class Migration(migrations.Migration):
14+
15+
dependencies = [
16+
('openedx_content', '0012_rename_componentversionmedia_key_to_path'),
17+
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
18+
]
19+
20+
operations = [
21+
# For the vast majority of sites, re-creating these constraints is redundant with
22+
# 0009-0010. But, a handful of developers had applied 0009 and 0010 when they contained
23+
# ascii-only constraints. Re-creating the constraints here ensures that everyone's on
24+
# the same page with identical, unicode-friendly constraints.
25+
migrations.RemoveConstraint(
26+
model_name='component',
27+
name='oel_component_code_regex',
28+
),
29+
migrations.RemoveConstraint(
30+
model_name='container',
31+
name='oel_container_code_regex',
32+
),
33+
migrations.AlterField(
34+
model_name='collection',
35+
name='collection_code',
36+
field=openedx_django_lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=255, validators=[django.core.validators.RegexValidator(re.compile('^[a-zA-Z0-9_.-]+\\Z'), 'Enter a valid "code name" consisting of latin letters (A-Z, a-z), numbers, underscores, hyphens, or periods.', 'invalid')]),
37+
),
38+
migrations.AlterField(
39+
model_name='component',
40+
name='component_code',
41+
field=openedx_django_lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=255, validators=[django.core.validators.RegexValidator(re.compile('^[\\w.-]+\\Z'), 'Enter a valid "code name" consisting of any letters, numbers, underscores, hyphens, or periods.', 'invalid')]),
42+
),
43+
migrations.AlterField(
44+
model_name='container',
45+
name='container_code',
46+
field=openedx_django_lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=255, validators=[django.core.validators.RegexValidator(re.compile('^[\\w.-]+\\Z'), 'Enter a valid "code name" consisting of any letters, numbers, underscores, hyphens, or periods.', 'invalid')]),
47+
),
48+
migrations.AddConstraint(
49+
model_name='component',
50+
constraint=models.CheckConstraint(condition=django.db.models.lookups.Regex(models.F('component_code'), '^[\\w.-]+\\Z'), name='oel_component_code_regex', violation_error_message='Enter a valid "code name" consisting of any letters, numbers, underscores, hyphens, or periods.'),
51+
),
52+
migrations.AddConstraint(
53+
model_name='container',
54+
constraint=models.CheckConstraint(condition=django.db.models.lookups.Regex(models.F('container_code'), '^[\\w.-]+\\Z'), name='oel_container_code_regex', violation_error_message='Enter a valid "code name" consisting of any letters, numbers, underscores, hyphens, or periods.'),
55+
),
56+
migrations.AlterConstraint(
57+
model_name='collection',
58+
name='oel_coll_collection_code_regex',
59+
constraint=models.CheckConstraint(condition=django.db.models.lookups.Regex(models.F('collection_code'), '^[a-zA-Z0-9_.-]+\\Z'), name='oel_coll_collection_code_regex', violation_error_message='Enter a valid "code name" consisting of latin letters (A-Z, a-z), numbers, underscores, hyphens, or periods.'),
60+
),
61+
]

src/openedx_core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
"""
77

88
# The version for the entire repository
9-
__version__ = "0.43.0"
9+
__version__ = "0.44.0"

src/openedx_django_lib/fields.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -119,15 +119,21 @@ def immutable_uuid_field() -> models.UUIDField:
119119

120120

121121
# ASCII alphanumerics (a-z, A-Z, 0-9), hyphens, underscores, periods
122-
CODE_REGEX = re.compile(r"^[a-zA-Z0-9_.-]+\Z")
122+
CODE_REGEX_ASCII = re.compile(r"^[a-zA-Z0-9_.-]+\Z")
123123

124+
# Anything which passes isalnum(), plus underscores, hyphens, and periods
125+
CODE_REGEX_UNICODE = re.compile(r"^[\w.-]+\Z", flags=re.UNICODE)
124126

125-
_CODE_VIOLATION_MSG = _(
126-
'Enter a valid "code name" consisting of letters, numbers, underscores, hyphens, or periods.'
127+
_CODE_VIOLATION_MSG_ASCII = _(
128+
'Enter a valid "code name" consisting of latin letters (A-Z, a-z), numbers, underscores, hyphens, or periods.'
129+
)
130+
131+
_CODE_VIOLATION_MSG_UNICODE = _(
132+
'Enter a valid "code name" consisting of any letters, numbers, underscores, hyphens, or periods.'
127133
)
128134

129135

130-
def code_field(**kwargs) -> MultiCollationCharField:
136+
def code_field(unicode: bool, **kwargs) -> MultiCollationCharField:
131137
"""
132138
Field to hold a 'code', i.e. a slug-like local identifier.
133139
@@ -139,19 +145,18 @@ def code_field(**kwargs) -> MultiCollationCharField:
139145
blank=False,
140146
validators=[
141147
RegexValidator(
142-
CODE_REGEX,
143-
# Translators: "letters" means latin letters: a-z and A-Z.
144-
_CODE_VIOLATION_MSG,
148+
CODE_REGEX_UNICODE if unicode else CODE_REGEX_ASCII,
149+
_CODE_VIOLATION_MSG_UNICODE if unicode else _CODE_VIOLATION_MSG_ASCII,
145150
"invalid",
146151
),
147152
],
148153
**kwargs,
149154
)
150155

151156

152-
def code_field_check(field_name: str, *, name: str) -> models.CheckConstraint:
157+
def code_field_check(field_name: str, *, name: str, unicode: bool) -> models.CheckConstraint:
153158
"""
154-
Return a ``CheckConstraint`` that enforces :data:`CODE_REGEX` at the DB level.
159+
Return a ``CheckConstraint`` that enforces :data:`CODE_REGEX_UNICODE` or :data:`CODE_REGEX_ASCII` at the DB level.
155160
156161
Django validators (used by :func:`code_field`) are not called on ``.save()``
157162
or ``.update()``. Adding this constraint ensures the regex is also enforced
@@ -162,13 +167,22 @@ def code_field_check(field_name: str, *, name: str) -> models.CheckConstraint:
162167
163168
class Meta:
164169
constraints = [
165-
code_field_check("my_code_field", name="myapp_mymodel_my_code_field_regex"),
170+
code_field_check(
171+
"my_code_field",
172+
name="myapp_mymodel_my_code_field_regex",
173+
unicode=True/False, # Make sure this matches the code_field!
174+
),
166175
]
167176
"""
168177
return models.CheckConstraint(
169-
condition=Regex(models.F(field_name), CODE_REGEX.pattern),
178+
condition=Regex(
179+
models.F(field_name),
180+
(CODE_REGEX_UNICODE if unicode else CODE_REGEX_ASCII).pattern,
181+
),
170182
name=name,
171-
violation_error_message=_CODE_VIOLATION_MSG,
183+
violation_error_message=(
184+
_CODE_VIOLATION_MSG_UNICODE if unicode else _CODE_VIOLATION_MSG_ASCII
185+
),
172186
)
173187

174188

tests/openedx_content/applets/collections/test_api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ def test_create_collection_invalid_code(self):
229229
"has@symbol",
230230
"has/slash",
231231
"has#hash",
232+
"café", # non-ascii letters not allowed for collection_code
233+
"柏倉隆史",
232234
]
233235
for code in invalid_codes:
234236
with self.subTest(code=code):

tests/openedx_content/applets/components/test_api.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from django.contrib.auth import get_user_model
77
from django.contrib.auth.models import User as UserType # pylint: disable=imported-auth-user
88
from django.core.exceptions import ObjectDoesNotExist
9+
from django.db import IntegrityError
910
from django.test import TestCase
1011

1112
from openedx_content.applets.collections import api as collection_api
@@ -385,6 +386,37 @@ def test_exists_by_code(self):
385386
component_code='not_my_component',
386387
)
387388

389+
def test_unicode_code(self):
390+
"""component_code supports non-ascii letters."""
391+
unicode_code = "柏倉隆史"
392+
component = components_api.create_component(
393+
self.learning_package.id,
394+
component_type=self.problem_type,
395+
component_code=unicode_code,
396+
created=self.now,
397+
created_by=None,
398+
)
399+
assert component.component_code == unicode_code
400+
assert components_api.get_component_by_code(
401+
self.learning_package.id,
402+
namespace='xblock.v1',
403+
type_name='problem',
404+
component_code=unicode_code,
405+
).id == component.id
406+
407+
def test_create_container_fails_with_invalid_chars(self):
408+
"""component_code does NOT support whitespace, most symbols, emoji"""
409+
for invalid_code in ["a b", "a,b", "a:b", "a☃b"]:
410+
with self.subTest(invalid_code=invalid_code):
411+
with self.assertRaisesRegex(IntegrityError, r'.*oel_component_code_regex.*'):
412+
components_api.create_component(
413+
self.learning_package.id,
414+
component_type=self.problem_type,
415+
component_code=invalid_code,
416+
created=self.now,
417+
created_by=None,
418+
)
419+
388420

389421
class CreateNewVersionsTestCase(ComponentTestCase):
390422
"""

0 commit comments

Comments
 (0)