99
1010import json
1111import logging
12+ import math
1213import time
1314from datetime import datetime
1415from http import HTTPStatus
@@ -47,86 +48,111 @@ def steps(cls):
4748 return (cls .collect_and_store_advisories ,)
4849
def fetch_data(self):
    """Fetch all advisories from the EUVD API and cache the result.

    Returns the cached list immediately when a previous call already
    fetched it. Otherwise queries the API for the total advisory count,
    then fetches every page sequentially. A page that still fails after
    all retries is skipped with a warning; an empty page ends the fetch
    early (the API has no more data).

    Returns:
        list: advisory dicts collected from the API (possibly empty).
    """
    if self._cached_data is not None:
        logger.info(f"Using cached data: {len(self._cached_data)} items")
        return self._cached_data

    all_items = []
    size = 100
    max_retries = 2

    logger.info(f"Fetching data from EUVD API: {self.url}")

    total_count = self._fetch_total_count(size, max_retries)
    if total_count is None:
        logger.error("Failed to fetch total count from API")
        return all_items

    total_pages = math.ceil(total_count / size)
    logger.info(f"Total advisories: {total_count}, Total pages: {total_pages}")

    # Page 0 goes through the same loop as every other page, so failure
    # and empty-page handling are consistent across all pages (previously
    # page 0 was special-cased: failures were skipped silently and an
    # empty first page did not stop the fetch).
    for page in range(total_pages):
        page_data = self._fetch_page(page, size, max_retries)
        if page_data is None:
            logger.warning(f"Skipping page {page} after failed retries")
            continue

        if not page_data:
            logger.info(f"No items in response for page {page}; stopping fetch.")
            break

        all_items.extend(page_data)
        logger.info(f"Fetched page {page}: {len(page_data)} items (total: {len(all_items)})")

    logger.info(f"Fetch completed successfully. Total items collected: {len(all_items)}")

    self._cached_data = all_items
    logger.info(f"Cached {len(all_items)} items for reuse")

    return all_items
def _make_request_with_retry(self, params, max_retries, context):
    """GET ``self.url`` with *params*, retrying up to *max_retries* times.

    Retries (with a 3s pause) on non-200 status, timeouts, and network
    errors; gives up immediately on a response-parsing error. *context*
    is a human-readable label used only in log messages.

    Returns:
        dict: the decoded JSON body, or None when every attempt failed.
    """
    headers = {"User-Agent": "VulnerableCode"}
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            response = requests.get(self.url, headers=headers, params=params, timeout=30)
        except requests.exceptions.Timeout:
            # Timeout must be caught before RequestException (its superclass).
            logger.warning(f"Timeout on {context} (attempt {attempt + 1}/{max_retries})")
            if attempt == last_attempt:
                return None
            time.sleep(3)
            continue
        except requests.exceptions.RequestException as e:
            logger.error(
                f"Network error on {context}: {e} (attempt {attempt + 1}/{max_retries})"
            )
            if attempt == last_attempt:
                return None
            time.sleep(3)
            continue

        if response.status_code != HTTPStatus.OK:
            logger.error(f"API returned status {response.status_code} for {context}")
            if attempt == last_attempt:
                return None
            logger.info(f"Retrying {context} (attempt {attempt + 1}/{max_retries})")
            time.sleep(3)
            continue

        try:
            return response.json()
        except (ValueError, KeyError) as e:
            # A malformed body will not improve on retry; fail fast.
            logger.error(f"Error parsing response for {context}: {e}")
            return None

    return None
132+
133+ def _fetch_total_count (self , size , max_retries ):
134+ """Fetch the total count of advisories from the API."""
135+ params = {"size" : size , "page" : 0 }
136+ data = self ._make_request_with_retry (params , max_retries , "total count" )
137+
138+ if data is None :
139+ return None
140+
141+ total = data .get ("total" )
142+ if total is None :
143+ logger .error ("No 'total' field in API response" )
144+
145+ return total
146+
147+ def _fetch_page (self , page , size , max_retries ):
148+ """Fetch a single page of advisories from the API."""
149+ params = {"size" : size , "page" : page }
150+ data = self ._make_request_with_retry (params , max_retries , f"page { page } " )
151+
152+ if data is None :
153+ return None
154+
155+ return data .get ("items" , [])
130156
def advisories_count(self) -> int:
    """Return the number of advisories available (fetching/caching as needed)."""
    advisories = self.fetch_data()
    return len(advisories)
@@ -137,7 +163,7 @@ def collect_advisories(self) -> Iterable[AdvisoryData]:
137163 advisory = self .parse_advisory (raw_data )
138164 if advisory :
139165 yield advisory
140- except Exception as e :
166+ except ( ValueError , KeyError , TypeError ) as e :
141167 logger .error (f"Failed to parse advisory: { e } " )
142168 logger .debug (f"Raw data: { raw_data } " )
143169 continue
@@ -162,7 +188,7 @@ def parse_advisory(self, raw_data: dict) -> AdvisoryData:
162188 date_published = date_published .replace (
163189 tzinfo = datetime .now ().astimezone ().tzinfo
164190 )
165- except Exception as e :
191+ except ( ValueError , TypeError ) as e :
166192 logger .warning (f"Failed to parse date '{ date_str } ': { e } " )
167193
168194 references = []
0 commit comments