diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index feec170bb14..f32a3065192 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -202,18 +202,59 @@ def _get_symbol():
     -------
     {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...}
     """
-    url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
-    try:
-        resp = requests.get(url, timeout=None)
-        resp.raise_for_status()
-    except requests.exceptions.HTTPError as e:
-        raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
+    # url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
+
+    base_url = "http://99.push2.eastmoney.com/api/qt/clist/get"
+    params = {
+        "pn": 1,  # page number
+        "pz": 100,  # page size, default to 100
+        "po": 1,
+        "np": 1,
+        "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
+        "fields": "f12",
+    }
 
-    try:
-        _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
-    except Exception as e:
-        logger.warning("An error occurred while extracting data from the response.")
-        raise
+    _symbols = []
+    page = 1
+
+    while True:
+        params["pn"] = page
+        try:
+            resp = requests.get(base_url, params=params, timeout=None)
+            resp.raise_for_status()
+            data = resp.json()
+
+            # Check if response contains valid data
+            if not data or "data" not in data or not data["data"] or "diff" not in data["data"]:
+                logger.warning(f"Invalid response structure on page {page}")
+                break
+
+            # fetch the current page data
+            current_symbols = [_v["f12"] for _v in data["data"]["diff"]]
+
+            if not current_symbols:  # It's the last page if there is no data in current page
+                logger.info(f"Last page reached: {page - 1}")
+                break
+
+            _symbols.extend(current_symbols)
+
+            # show progress
+            logger.info(
+                f"Page {page}: fetch {len(current_symbols)} stocks:[{current_symbols[0]} ... {current_symbols[-1]}]"
+            )
+
+            page += 1
+
+            # sleep time to avoid overloading the server
+            time.sleep(0.5)
+
+        except requests.exceptions.HTTPError as e:
+            raise requests.exceptions.HTTPError(
+                f"Request to {base_url} failed with status code {resp.status_code}"
+            ) from e
+        except Exception as e:
+            logger.warning("An error occurred while extracting data from the response.")
+            raise
 
     if len(_symbols) < 3900:
         raise ValueError("The complete list of stocks is not available.")