@@ -139,9 +139,15 @@ def transform_content_files(
139139 if obj .key .endswith ("data.json" ):
140140 try :
141141 resource_json = safe_load_json (get_s3_object_and_read (obj ), obj .key )
142- transformed_resource = transform_contentfile (
143- obj .key , resource_json , s3_resource , force_overwrite
144- )
142+
143+ if resource_json .get ("resourcetype" ):
144+ transformed_resource = transform_contentfile (
145+ obj .key , resource_json , s3_resource , force_overwrite
146+ )
147+ else :
148+ transformed_resource = transform_contentfile_legacy (
149+ obj .key , resource_json , s3_resource , force_overwrite
150+ )
145151 if transformed_resource :
146152 yield transformed_resource
147153
@@ -171,7 +177,9 @@ def transform_page(s3_key: str, page_data: dict) -> dict:
171177 "title" : page_data .get ("title" ),
172178 "content_title" : page_data .get ("title" ),
173179 "content" : page_data .get ("content" ),
180+ "content_tags" : page_data .get ("learning_resource_types" ),
174181 "key" : s3_path ,
182+ "description" : page_data .get ("description" ),
175183 "published" : True ,
176184 }
177185
@@ -231,6 +239,82 @@ def transform_contentfile(
231239 contentfile_data : dict ,
232240 s3_resource : boto3 .resource ,
233241 force_overwrite : bool , # noqa: FBT001
242+ ) -> dict :
243+ """
244+ Transform the data from data.json (new format) for a content file
245+
246+ Args:
247+ s3_key (str): S3 path for the data.json file
248+ contentfile_data (dict): JSON data from data.json
249+ s3_resource (boto3.resource): The S3 resource
250+ force_overwrite (bool): Overwrite document text if true
251+
252+ Returns:
253+ dict: transformed content file data
254+ """
255+ s3_path = s3_key .split ("data.json" , maxsplit = 1 )[0 ]
256+ s3_path = urlparse (s3_path ).path .lstrip ("/" )
257+
258+ file_type = contentfile_data .get ("file_type" )
259+ video_files = contentfile_data .get ("video_files" , {})
260+
261+ if contentfile_data .get ("resourcetype" ) == "Video" :
262+ content_type = CONTENT_TYPE_VIDEO
263+ youtube_id = contentfile_data .get ("video_metadata" , {}).get ("youtube_id" )
264+ if youtube_id :
265+ image_src = f"https://i.ytimg.com/vi/{ youtube_id } /hqdefault.jpg"
266+ else :
267+ image_src = None
268+ file_s3_path = video_files .get ("video_transcript_file" )
269+ file_extension = Path (
270+ contentfile_data .get ("file" ) or video_files .get ("archive_url" ) or ""
271+ ).suffix
272+ else :
273+ content_type = get_content_type (file_type )
274+ file_s3_path = contentfile_data .get ("file" ) or ""
275+ image_src = None
276+ file_extension = Path (file_s3_path ).suffix
277+ youtube_id = None
278+
279+ title = contentfile_data .get ("title" )
280+
281+ if title in ("3play caption file" , "3play pdf file" ) or not file_s3_path :
282+ return None
283+
284+ contentfile_data = {
285+ "description" : clean_data (contentfile_data .get ("content" )),
286+ "file_type" : file_type ,
287+ "content_type" : content_type ,
288+ "url" : urljoin (settings .OCW_BASE_URL , urlparse (s3_path ).path .lstrip ("/" )),
289+ "title" : title ,
290+ "content_title" : title ,
291+ "key" : s3_path ,
292+ "content_tags" : contentfile_data .get ("learning_resource_types" ),
293+ "published" : True ,
294+ "file_extension" : file_extension ,
295+ }
296+
297+ if not file_s3_path .startswith ("courses" ):
298+ file_s3_path = "courses" + file_s3_path .split ("courses" )[1 ]
299+
300+ content_json = get_file_content (s3_path , file_s3_path , s3_resource , force_overwrite )
301+ if content_json :
302+ contentfile_data ["content" ] = content_json .get ("content" )
303+
304+ if image_src :
305+ contentfile_data ["image_src" ] = image_src
306+
307+ if youtube_id :
308+ contentfile_data ["youtube_id" ] = youtube_id
309+
310+ return contentfile_data
311+
312+
313+ def transform_contentfile_legacy (
314+ s3_key : str ,
315+ contentfile_data : dict ,
316+ s3_resource : boto3 .resource ,
317+ force_overwrite : bool , # noqa: FBT001
234318) -> dict :
235319 """
236320 Transform the data from data.json for a content file
0 commit comments