Skip to content

Commit ba57a13

Browse files
authored
use new data.json (#3202)
1 parent d92c0ed commit ba57a13

20 files changed

Lines changed: 280 additions & 67 deletions

File tree

frontends/api/src/generated/v0/api.ts

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

frontends/api/src/generated/v1/api.ts

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

frontends/main/src/page-components/LearningResourceExpanded/CallToActionSection.tsx

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,13 +210,20 @@ const ImageSection: React.FC<{
210210
config: ImageConfig
211211
}> = ({ resource, config }) => {
212212
const aspect = config.width / config.height
213-
if (
214-
resource?.resource_type === "video" &&
215-
resource?.url &&
216-
resource?.platform?.code === PlatformEnum.Youtube
217-
) {
213+
const youtubeId =
214+
resource?.resource_type === "video"
215+
? resource.content_files?.[0]?.youtube_id
216+
: null
217+
const youtubeUrl = youtubeId
218+
? `https://www.youtube.com/watch?v=${youtubeId}`
219+
: resource?.resource_type === "video" &&
220+
resource?.platform?.code === PlatformEnum.Youtube
221+
? resource.url
222+
: null
223+
224+
if (resource && youtubeUrl) {
218225
return (
219-
<VideoFrame src={resource.url} title={resource.title} aspect={aspect} />
226+
<VideoFrame src={youtubeUrl} title={resource.title} aspect={aspect} />
220227
)
221228
} else if (resource) {
222229
const imageUrl =

learning_resources/etl/ocw.py

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,15 @@ def transform_content_files(
139139
if obj.key.endswith("data.json"):
140140
try:
141141
resource_json = safe_load_json(get_s3_object_and_read(obj), obj.key)
142-
transformed_resource = transform_contentfile(
143-
obj.key, resource_json, s3_resource, force_overwrite
144-
)
142+
143+
if resource_json.get("resourcetype"):
144+
transformed_resource = transform_contentfile(
145+
obj.key, resource_json, s3_resource, force_overwrite
146+
)
147+
else:
148+
transformed_resource = transform_contentfile_legacy(
149+
obj.key, resource_json, s3_resource, force_overwrite
150+
)
145151
if transformed_resource:
146152
yield transformed_resource
147153

@@ -171,7 +177,9 @@ def transform_page(s3_key: str, page_data: dict) -> dict:
171177
"title": page_data.get("title"),
172178
"content_title": page_data.get("title"),
173179
"content": page_data.get("content"),
180+
"content_tags": page_data.get("learning_resource_types"),
174181
"key": s3_path,
182+
"description": page_data.get("description"),
175183
"published": True,
176184
}
177185

def transform_contentfile(
    s3_key: str,
    contentfile_data: dict,
    s3_resource: boto3.resource,
    force_overwrite: bool,  # noqa: FBT001
) -> dict:
    """
    Transform the data from data.json (new format) for a content file

    Args:
        s3_key (str): S3 path for the data.json file
        contentfile_data (dict): JSON data from data.json
        s3_resource (boto3.resource): The S3 resource
        force_overwrite (bool): Overwrite document text if true

    Returns:
        dict: transformed content file data, or None when the entry is a
            3Play caption/pdf artifact or has no source file to read from
    """
    s3_path = s3_key.split("data.json", maxsplit=1)[0]
    s3_path = urlparse(s3_path).path.lstrip("/")

    file_type = contentfile_data.get("file_type")
    # `or {}` (rather than a .get default) also covers an explicit JSON null,
    # which would otherwise blow up on the chained .get() calls below.
    video_files = contentfile_data.get("video_files") or {}

    if contentfile_data.get("resourcetype") == "Video":
        content_type = CONTENT_TYPE_VIDEO
        video_metadata = contentfile_data.get("video_metadata") or {}
        youtube_id = video_metadata.get("youtube_id")
        image_src = (
            f"https://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg"
            if youtube_id
            else None
        )
        # For videos the text content comes from the transcript file, while
        # the extension is taken from the video file (or its archive url).
        file_s3_path = video_files.get("video_transcript_file")
        file_extension = Path(
            contentfile_data.get("file") or video_files.get("archive_url") or ""
        ).suffix
    else:
        content_type = get_content_type(file_type)
        file_s3_path = contentfile_data.get("file") or ""
        image_src = None
        file_extension = Path(file_s3_path).suffix
        youtube_id = None

    title = contentfile_data.get("title")

    if title in ("3play caption file", "3play pdf file") or not file_s3_path:
        return None

    # Build the result under its own name instead of rebinding the
    # `contentfile_data` parameter, so input and output stay distinguishable.
    transformed = {
        "description": clean_data(contentfile_data.get("content")),
        "file_type": file_type,
        "content_type": content_type,
        "url": urljoin(settings.OCW_BASE_URL, urlparse(s3_path).path.lstrip("/")),
        "title": title,
        "content_title": title,
        "key": s3_path,
        "content_tags": contentfile_data.get("learning_resource_types"),
        "published": True,
        "file_extension": file_extension,
    }

    if not file_s3_path.startswith("courses"):
        # Drop whatever precedes the first "courses" segment. maxsplit=1 keeps
        # the remainder intact even if "courses" occurs again later in the
        # path (e.g. ".../short-courses.pdf"); an unbounded split would
        # silently truncate it. A path without "courses" still raises
        # IndexError, as before.
        file_s3_path = "courses" + file_s3_path.split("courses", maxsplit=1)[1]

    content_json = get_file_content(s3_path, file_s3_path, s3_resource, force_overwrite)
    if content_json:
        transformed["content"] = content_json.get("content")

    # Optional keys are only emitted when a value is available.
    if image_src:
        transformed["image_src"] = image_src

    if youtube_id:
        transformed["youtube_id"] = youtube_id

    return transformed
311+
312+
313+
def transform_contentfile_legacy(
314+
s3_key: str,
315+
contentfile_data: dict,
316+
s3_resource: boto3.resource,
317+
force_overwrite: bool, # noqa: FBT001
234318
) -> dict:
235319
"""
236320
Transform the data from data.json for a content file

learning_resources/etl/ocw_test.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,13 @@ def test_transform_content_files(settings, mocker, base_ocw_url):
5858
transform_content_files(s3_resource, OCW_TEST_PREFIX, False) # noqa: FBT003
5959
)
6060

61-
assert len(content_data) == 5
61+
assert len(content_data) == 4
6262

6363
assert content_data[0] == {
6464
"content": "Pages Section",
65+
"content_tags": [],
6566
"content_type": "page",
67+
"description": "Description of Pages",
6668
"key": (
6769
"courses/16-01-unified-engineering-i-ii-iii-iv-fall-2005-spring-2006/pages/"
6870
),
@@ -74,7 +76,9 @@ def test_transform_content_files(settings, mocker, base_ocw_url):
7476

7577
assert content_data[1] == {
7678
"content": "Course Meeting Times Lecture",
79+
"content_tags": [],
7780
"content_type": "page",
81+
"description": "Description of Syllabus",
7882
"key": "courses/16-01-unified-engineering-i-ii-iii-iv-fall-2005-spring-2006/pages/syllabus/",
7983
"published": True,
8084
"title": "Syllabus",
@@ -110,23 +114,9 @@ def test_transform_content_files(settings, mocker, base_ocw_url):
110114
"title": None,
111115
"content_title": None,
112116
"url": f"{ocw_url}/courses/16-01-unified-engineering-i-ii-iii-iv-fall-2005-spring-2006/resources/video/",
113-
"image_src": "https://img.youtube.com/vi/vKer2U5W5-s/default.jpg",
114-
"file_extension": ".mp4",
115-
}
116-
117-
assert content_data[4] == {
118-
"content": "TEXT",
119-
"content_type": "video",
120-
"description": "Video Description, no file",
121-
"file_type": "video/mp4",
122-
"key": "courses/16-01-unified-engineering-i-ii-iii-iv-fall-2005-spring-2006/resources/video_no_file/",
123-
"content_tags": ["Old Videos"],
124-
"published": True,
125-
"title": None,
126-
"content_title": None,
127-
"url": f"{ocw_url}/courses/16-01-unified-engineering-i-ii-iii-iv-fall-2005-spring-2006/resources/video_no_file/",
128-
"image_src": "https://img.youtube.com/vi/vKer2U5W5-s/default.jpg",
117+
"image_src": "https://i.ytimg.com/vi/vKer2U5W5-s/hqdefault.jpg",
129118
"file_extension": ".mp4",
119+
"youtube_id": "vKer2U5W5-s",
130120
}
131121

132122

@@ -146,7 +136,7 @@ def test_transform_content_files_exceptions(settings, mocker):
146136
transform_content_files(s3_resource, OCW_TEST_PREFIX, False) # noqa: FBT003
147137
)
148138
assert len(content_data) == 0
149-
assert mock_log.call_count == 7
139+
assert mock_log.call_count == 5
150140

151141

152142
@mock_aws

learning_resources/etl/pipelines_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ def test_ocw_courses_etl(
270270
run = resource.runs.first()
271271
assert run.instructors.count() == 10
272272
assert run.run_id == "97db384ef34009a64df7cb86cf701979"
273-
assert run.content_files.count() == (0 if skip_content_files else 5)
273+
assert run.content_files.count() == (0 if skip_content_files else 4)
274274
assert mock_cf_actions.call_count == (0 if skip_content_files else 1)
275275
assert mock_calc_score.call_count == (0 if skip_content_files else 1)
276276

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Generated by Django 4.2.29 on 2026-04-14 19:47
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the nullable ``youtube_id`` column to the ``contentfile`` model."""

    # Must be applied after the previous learning_resources migration.
    dependencies = [("learning_resources", "0111_add_edx_module_id_idx")]

    operations = [
        # AddField(model_name, name, field)
        migrations.AddField(
            "contentfile",
            "youtube_id",
            models.CharField(max_length=32, null=True, blank=True),
        ),
    ]

learning_resources/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,6 +1199,7 @@ class ContentFile(TimestampedModel):
11991199
summary = models.TextField(blank=True, default="")
12001200
flashcards = models.JSONField(blank=True, default=list)
12011201
duration = models.CharField(max_length=11, null=True, blank=True) # noqa: DJ001
1202+
youtube_id = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
12021203

12031204
def save(self, **kwargs):
12041205
self.checksum = checksum_for_content(self.content)

learning_resources/serializers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,6 +1449,7 @@ class Meta:
14491449
"edx_module_id",
14501450
"summary",
14511451
"flashcards",
1452+
"youtube_id",
14521453
]
14531454

14541455

learning_resources/serializers_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,7 @@ def test_content_file_serializer(settings, expected_types, has_channels):
613613
"content_language": "en",
614614
"content_title": "test title",
615615
"edx_module_id": "edx_module_id",
616+
"youtube_id": "youtube_id",
616617
}
617618
platform = PlatformType.ocw.name
618619
course = factories.CourseFactory.create(platform=platform)
@@ -724,6 +725,7 @@ def test_content_file_serializer(settings, expected_types, has_channels):
724725
"summary": content_file.summary,
725726
"flashcards": content_file.flashcards,
726727
"direct_learning_resource_id": direct_learning_resource.id,
728+
"youtube_id": content_file.youtube_id,
727729
},
728730
)
729731

0 commit comments

Comments
 (0)