@@ -15,6 +15,8 @@ class Ident:
1515
1616 @staticmethod
1717 async def remote_file_type (url : str ) -> str :
18+ #identifies the file type of a remote file by reading a chunk of it
19+
1820 session = await SessionFactory ().grab_session ()
1921 async with session .get (url ) as response :
2022 mime_header = response .headers .get ('Content-Type' , '' ).lower ()
@@ -29,6 +31,8 @@ async def remote_file_type(url: str) -> str:
2931
3032 @staticmethod
3133 async def file_source (url : str ) -> str :
34+ #determines if the url is from github or not
35+
3236 extracted = tldextract .extract (url )
3337
3438 domain = extracted .registered_domain .lower ()
@@ -59,47 +63,147 @@ async def file_type_allowed(detected_type: str) -> bool:
5963class DSDownload :
6064
6165 @staticmethod
62- async def pipeline (url : str ) -> str :
63- detected_type , mime_header = await Ident .remote_file_type (url )
64- source = await Ident .file_source (url )
65-
66- if not await Verify .file_type_allowed (detected_type ):
67- networking_logger .error (f"Dataset Download Pipeline: File type { detected_type } not allowed. Aborting download for safety. URL: { url } " )
68- return
async def pipeline(url: str) -> dict:
    """
    Entry point for fetching a dataset from a URL.

    Identifies the remote file type and the source of the URL, refuses
    file types that are not allowed, and otherwise hands the URL to the
    GitHub downloader or the generic downloader. Any exception raised
    along the way is logged and reported in the returned status instead
    of propagating to the caller.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and 'message'
    """
    try:
        # mime_header is unpacked to match the helper's two-value return; it is not used here
        detected_type, mime_header = await Ident.remote_file_type(url)
        source = await Ident.file_source(url)

        type_permitted = await Verify.file_type_allowed(detected_type)
        if not type_permitted:
            networking_logger.error(f"Dataset Download Pipeline: File type {detected_type} not allowed. Aborting download for safety. URL: {url} ")
            return {
                "success": False,
                "source": source,
                "file_count": 0,
                "message": f"File type {detected_type} not allowed",
            }

        # anything that is not identified as github goes through the generic path
        if source != "github":
            return await DSDownload.generic_download(url)

        return await DSDownload.github_download(url)

    except Exception as exc:
        failure_text = str(exc)
        networking_logger.error(f"Dataset Download Pipeline: Error occurred - {failure_text} ")
        return {
            "success": False,
            "source": "unknown",
            "file_count": 0,
            "message": failure_text,
        }
91+
92+ @staticmethod
async def github_download(url: str) -> dict:
    """
    Download a Cockatoo Core format dataset from a GitHub repository.

    Resolves the raw-content base URL for ``url``, fetches ``metadata.json``
    from it, then downloads, JSON-parses and processes every file the
    metadata lists, except the entries in ``SetDownload.removed_files``.
    A failure on one file is logged and skipped so the remaining files are
    still attempted.

    Args:
        url: Root GitHub repository URL.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and
        'message'. 'success' is True only when at least one file was
        downloaded and processed.
    """
    try:
        # parse out the raw github content url
        raw_url = await URLParser.parse_github_url(url)
        # the URL is requested verbatim, so it must not contain any whitespace
        metadata_url = f"{raw_url}/metadata.json"

        networking_logger.info(f"GitHub Download: Fetching metadata from {metadata_url} ")

        # download and parse metadata file
        metadata_content_raw = await DownloadManager.download_file(metadata_url)
        metadata_content = json.loads(metadata_content_raw)

        # get list of relevant files from metadata
        listed_files = metadata_content.get(CoreDatasetMetadata.relevent_files, [])

        if not listed_files:
            networking_logger.warning("GitHub Download: No relevant files found in metadata")
            return {"success": False, "source": "github", "file_count": 0, "message": "No relevant files in metadata"}

        networking_logger.info(f"GitHub Download: Found {len(listed_files)} files in metadata")

        # Build a filtered copy instead of calling list.remove(): this drops
        # every occurrence of an excluded file (remove() only drops the first)
        # and leaves the parsed metadata untouched.
        files_to_fetch = []
        for listed_name in listed_files:
            if listed_name in SetDownload.removed_files:
                networking_logger.debug(f"GitHub Download: Removed {listed_name} from download list")
            else:
                files_to_fetch.append(listed_name)

        # download each relevant file referenced in metadata
        downloaded_count = 0

        for file_to_download in files_to_fetch:

            try:
                file_url = f"{raw_url}/{file_to_download}"
                networking_logger.debug(f"GitHub Download: Downloading {file_to_download} from {file_url} ")

                content = await DownloadManager.download_file(file_url)
                json_content = json.loads(content)

                # Process the downloaded content (store in database, etc.)
                outcome = await DSDownload._process_dataset_content(json_content, file_to_download, "github")

            except json.JSONDecodeError:
                networking_logger.error(f"GitHub Download: Failed to parse JSON from {file_to_download} ")
                continue

            except Exception as e:
                networking_logger.error(f"GitHub Download: Error downloading {file_to_download} : {str(e)} ")
                continue

            # The processing step may report a failure through a returned
            # status dict rather than an exception; such a file must not be
            # counted as downloaded.
            if isinstance(outcome, dict) and outcome.get("success") is False:
                failure_reason = outcome.get("message")
                networking_logger.error(f"GitHub Download: Processing failed for {file_to_download}: {failure_reason}")
                continue

            downloaded_count += 1
            networking_logger.info(f"GitHub Download: Successfully processed {file_to_download} ")

        networking_logger.info(f"GitHub Download: Completed. Downloaded and processed {downloaded_count} /{len(files_to_fetch)} files")

        if downloaded_count == 0:
            # do not pair success=False with a "Successfully downloaded" message
            return {
                "success": False,
                "source": "github",
                "file_count": 0,
                "message": "No files were downloaded successfully",
            }

        return {
            "success": True,
            "source": "github",
            "file_count": downloaded_count,
            "message": f"Successfully downloaded {downloaded_count} files",
        }

    except Exception as e:
        networking_logger.error(f"GitHub Download: Pipeline error - {str(e)} ")
        return {"success": False, "source": "github", "file_count": 0, "message": str(e)}
94156
95157 @staticmethod
96- async def github_download (url : str ) -> str :
97- content = await DownloadManager .download_file (url )
98-
async def generic_download(url: str) -> dict:
    """
    Download a generic dataset from any URL (typically plain text format).

    Fetches the content at ``url`` and passes it to the processing step.
    Empty content, a processing failure, or any raised exception is logged
    and reported as an unsuccessful result instead of propagating.

    Args:
        url: Location of the dataset to download.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and 'message'
    """
    try:
        networking_logger.info(f"Generic Download: Starting download from {url} ")

        content = await DownloadManager.download_file(url)

        if not content:
            networking_logger.warning(f"Generic Download: No content received from {url} ")
            return {"success": False, "source": "unknown", "file_count": 0, "message": "No content received"}

        # process the downloaded content (store in database, etc.)
        outcome = await DSDownload._process_dataset_content(content, url, "generic")

        # The processing step may report a failure through a returned status
        # dict rather than an exception; do not report success in that case.
        if isinstance(outcome, dict) and outcome.get("success") is False:
            failure_reason = outcome.get("message") or "Processing failed"
            networking_logger.error(f"Generic Download: Processing failed for {url}: {failure_reason}")
            return {"success": False, "source": "unknown", "file_count": 0, "message": failure_reason}

        networking_logger.info(f"Generic Download: Successfully downloaded and processed content from {url} ")
        return {
            "success": True,
            "source": "unknown",
            "file_count": 1,
            "message": "Successfully downloaded generic dataset",
        }

    except Exception as e:
        networking_logger.error(f"Generic Download: Error - {str(e)} ")
        return {"success": False, "source": "unknown", "file_count": 0, "message": str(e)}
99188
100-
101189 @staticmethod
102- async def generic_download (url : str ) -> str :
103- content = await DownloadManager .download_file (url )
104-
105- return content
async def _process_dataset_content(content, source_identifier: str, source_type: str) -> None:
    """
    Internal method to process downloaded dataset content.

    Storage is not implemented yet (see the TODO below); for now the method
    only logs that the content was seen.

    Args:
        content: The downloaded content (parsed JSON for GitHub files, the
            raw download for generic URLs, as passed by the callers in
            this class)
        source_identifier: Filename or URL identifier
        source_type: 'github' or 'generic'

    Raises:
        Exception: Any error raised while processing is logged and then
            re-raised unchanged, so the caller can mark the item as failed.
    """
    try:
        networking_logger.debug(f"Processing dataset from {source_type} source: {source_identifier} ")

        # TODO: Implement actual storage logic to database

        networking_logger.debug(f"Dataset content processed: {source_identifier} ")

    except Exception as e:
        networking_logger.error(f"Error processing dataset content from {source_identifier} : {str(e)} ")
        # This method is declared to return None and its callers discard the
        # return value, so a returned error dict would be silently ignored.
        # Re-raise so the callers' own exception handlers see the failure.
        raise
0 commit comments