Skip to content

Commit 8a09fcb

Browse files
authored
Merge pull request #17 from DominicTWHV/dev
Update error handling and dataset management frameworks
2 parents 137764d + 138ddff commit 8a09fcb

1 file changed

Lines changed: 133 additions & 29 deletions

File tree

edge/helper/setOrchestrator.py

Lines changed: 133 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class Ident:
1515

1616
@staticmethod
1717
async def remote_file_type(url: str) -> str:
18+
#identifies the file type of a remote file by reading a chunk of it
19+
1820
session = await SessionFactory().grab_session()
1921
async with session.get(url) as response:
2022
mime_header = response.headers.get('Content-Type', '').lower()
@@ -29,6 +31,8 @@ async def remote_file_type(url: str) -> str:
2931

3032
@staticmethod
3133
async def file_source(url: str) -> str:
34+
#determines if the url is from github or not
35+
3236
extracted = tldextract.extract(url)
3337

3438
domain = extracted.registered_domain.lower()
@@ -59,47 +63,147 @@ async def file_type_allowed(detected_type: str) -> bool:
5963
class DSDownload:
6064

6165
@staticmethod
62-
async def pipeline(url: str) -> str:
63-
detected_type, mime_header = await Ident.remote_file_type(url)
64-
source = await Ident.file_source(url)
65-
66-
if not await Verify.file_type_allowed(detected_type):
67-
networking_logger.error(f"Dataset Download Pipeline: File type {detected_type} not allowed. Aborting download for safety. URL: {url}")
68-
return
66+
async def pipeline(url: str) -> dict:
    """
    Entry point for dataset downloads.

    Identifies the remote file type and the source of the url, refuses any file
    type that Verify.file_type_allowed rejects, then hands the url to
    github_download for GitHub sources (Cockatoo Core format) and to
    generic_download for everything else.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and 'message'
    """
    try:
        #the mime header is returned alongside the detected type but is not needed here
        detected_type, _mime_header = await Ident.remote_file_type(url)
        source = await Ident.file_source(url)
        type_ok = await Verify.file_type_allowed(detected_type)

        if type_ok:
            #pick the downloader that matches the source and delegate to it
            handler = DSDownload.github_download if source == "github" else DSDownload.generic_download
            return await handler(url)

        #disallowed file type: log it and report the rejection to the caller
        networking_logger.error(f"Dataset Download Pipeline: File type {detected_type} not allowed. Aborting download for safety. URL: {url}")
        return {
            "success": False,
            "source": source,
            "file_count": 0,
            "message": f"File type {detected_type} not allowed",
        }

    except Exception as e:
        reason = str(e)
        networking_logger.error(f"Dataset Download Pipeline: Error occurred - {reason}")
        return {"success": False, "source": "unknown", "file_count": 0, "message": reason}
91+
92+
@staticmethod
93+
async def github_download(url: str) -> dict:
    """
    Download a Cockatoo Core format dataset from a GitHub repository.

    Expects a root github repo url. Fetches metadata.json from the raw content
    location, reads the file list stored under CoreDatasetMetadata.relevent_files,
    skips every entry named in SetDownload.removed_files, then downloads,
    JSON-parses and processes each remaining file. A failure on a single file is
    logged and does not stop the remaining files.

    Args:
        url: Root GitHub repository url.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and 'message'.
        'success' is True when at least one file was downloaded and processed.
    """
    try:
        #parse out the raw github content url
        raw_url = await URLParser.parse_github_url(url)
        metadata_url = f"{raw_url}/metadata.json"

        networking_logger.info(f"GitHub Download: Fetching metadata from {metadata_url}")

        #download and parse metadata file
        metadata_content_raw = await DownloadManager.download_file(metadata_url)
        metadata_content = json.loads(metadata_content_raw)

        #valid JSON is not necessarily an object; fail with a clear message instead of an AttributeError
        if not isinstance(metadata_content, dict):
            networking_logger.error("GitHub Download: metadata.json is not a JSON object")
            return {"success": False, "source": "github", "file_count": 0, "message": "Invalid metadata format"}

        #get list of relevant files from metadata
        listed_files = metadata_content.get(CoreDatasetMetadata.relevent_files, [])

        if not listed_files:
            networking_logger.warning("GitHub Download: No relevant files found in metadata")
            return {"success": False, "source": "github", "file_count": 0, "message": "No relevant files in metadata"}

        networking_logger.info(f"GitHub Download: Found {len(listed_files)} files in metadata")

        #remove unnecessary files from the list (ie, metadata itself, and domains)
        #a new list is built so every occurrence is dropped and the parsed metadata stays untouched
        files_to_download = []
        for listed_file in listed_files:
            if listed_file in SetDownload.removed_files:
                networking_logger.debug(f"GitHub Download: Removed {listed_file} from download list")
            else:
                files_to_download.append(listed_file)

        #download each relevant file referenced in metadata
        downloaded_count = 0

        for file_to_download in files_to_download:
            try:
                file_url = f"{raw_url}/{file_to_download}"
                networking_logger.debug(f"GitHub Download: Downloading {file_to_download} from {file_url}")

                content = await DownloadManager.download_file(file_url)
                json_content = json.loads(content)

                #process the downloaded content (store in database, etc.)
                await DSDownload._process_dataset_content(json_content, file_to_download, "github")
                downloaded_count += 1
                networking_logger.info(f"GitHub Download: Successfully processed {file_to_download}")

            except json.JSONDecodeError:
                networking_logger.error(f"GitHub Download: Failed to parse JSON from {file_to_download}")

            except Exception as e:
                networking_logger.error(f"GitHub Download: Error downloading {file_to_download}: {str(e)}")

        total_files = len(files_to_download)
        networking_logger.info(f"GitHub Download: Completed. Downloaded and processed {downloaded_count}/{total_files} files")

        #only claim success in the message when something was actually downloaded
        if downloaded_count:
            message = f"Successfully downloaded {downloaded_count} files"
        elif total_files:
            message = f"Failed to download any of the {total_files} files listed in metadata"
        else:
            message = "No downloadable files left after filtering metadata"

        return {
            "success": downloaded_count > 0,
            "source": "github",
            "file_count": downloaded_count,
            "message": message,
        }

    except Exception as e:
        networking_logger.error(f"GitHub Download: Pipeline error - {str(e)}")
        return {"success": False, "source": "github", "file_count": 0, "message": str(e)}
94156

95157
@staticmethod
96-
async def github_download(url: str) -> str:
97-
content = await DownloadManager.download_file(url)
98-
158+
async def generic_download(url: str) -> dict:
    """
    Fetch a single dataset from an arbitrary url (typically plain text format)
    and pass its content on for processing.

    Returns:
        dict: Status information with 'success', 'source', 'file_count', and 'message'
    """

    def _status(success: bool, file_count: int, message: str) -> dict:
        #every result of this method reports its source as "unknown"
        return {"success": success, "source": "unknown", "file_count": file_count, "message": message}

    try:
        networking_logger.info(f"Generic Download: Starting download from {url}")

        payload = await DownloadManager.download_file(url)

        if payload:
            #process the downloaded content (store in database, etc.)
            await DSDownload._process_dataset_content(payload, url, "generic")

            networking_logger.info(f"Generic Download: Successfully downloaded and processed content from {url}")
            return _status(True, 1, "Successfully downloaded generic dataset")

        #empty or missing body counts as a failed download
        networking_logger.warning(f"Generic Download: No content received from {url}")
        return _status(False, 0, "No content received")

    except Exception as e:
        reason = str(e)
        networking_logger.error(f"Generic Download: Error - {reason}")
        return _status(False, 0, reason)
99188

100-
101189
@staticmethod
102-
async def generic_download(url: str) -> str:
103-
content = await DownloadManager.download_file(url)
104-
105-
return content
190+
async def _process_dataset_content(content, source_identifier: str, source_type: str) -> None:
    """
    Internal method to process downloaded dataset content.
    Handles storage to database or other processing as needed.

    Storage is not implemented yet; for now the call only logs that the content
    was received.

    Args:
        content: The downloaded content (dict for JSON, str for text)
        source_identifier: Filename or URL identifier
        source_type: 'github' or 'generic'

    Raises:
        Exception: Any error raised while processing is logged and re-raised, so
            the caller does not count a failed item as processed.
    """
    try:
        networking_logger.debug(f"Processing dataset from {source_type} source: {source_identifier}")

        # TODO: Implement actual storage logic to database

        networking_logger.debug(f"Dataset content processed: {source_identifier}")

    except Exception as e:
        networking_logger.error(f"Error processing dataset content from {source_identifier}: {str(e)}")
        #callers ignore the return value, so returning a status dict would hide the failure from them
        raise

0 commit comments

Comments
 (0)