From c9d7c5882f8dfebd3e10eda9afff7ede50dbbe70 Mon Sep 17 00:00:00 2001
From: shengyuhong
Date: Fri, 28 Mar 2025 13:28:13 +0800
Subject: [PATCH 1/3] fix: resolve #1892 by retrieving the data page by page

---
 scripts/data_collector/utils.py | 69 +++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index feec170bb14..46c0e92968f 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -202,18 +202,65 @@ def _get_symbol():
         -------
         {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...}
         """
-        url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
-        try:
-            resp = requests.get(url, timeout=None)
-            resp.raise_for_status()
-        except requests.exceptions.HTTPError as e:
-            raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
+        # url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
+        # try:
+        #     resp = requests.get(url, timeout=None)
+        #     resp.raise_for_status()
+        # except requests.exceptions.HTTPError as e:
+        #     raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
+
+        # try:
+        #     _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
+        # except Exception as e:
+        #     logger.warning("An error occurred while extracting data from the response.")
+        #     raise
+
+        base_url = "http://99.push2.eastmoney.com/api/qt/clist/get"
+        params = {
+            "pz": 80,  # 每页返回200条数据
+            "po": 1,
+            "np": 1,
+            "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
+            "fields": "f12"
+        }
+
+        _symbols = []
+        page = 1
+
+        while True:
+            params["pn"] = page
+            try:
+                resp = requests.get(base_url, params=params, timeout=None)
+                resp.raise_for_status()
+                data = resp.json()
+
+                # Check if response contains valid data
+                if not data or "data" not in data or not data["data"] or "diff" not in data["data"]:
+                    logger.warning(f"Invalid response structure on page {page}")
+                    break
 
-        try:
-            _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
-        except Exception as e:
-            logger.warning("An error occurred while extracting data from the response.")
-            raise
+                # fetch the current page data
+                current_symbols = [_v["f12"] for _v in data["data"]["diff"]]
+
+                if not current_symbols:  # It's the last page if there is no data in current page
+                    logger.info(f"Last page reached: {page - 1}")
+                    break
+
+                _symbols.extend(current_symbols)
+
+                # show progress
+                logger.info(f"Page {page}: fetch {len(current_symbols)} stocks:[{current_symbols[0]} ... {current_symbols[-1]}]")
+
+                page += 1
+
+                # sleep time to avoid overloading the server
+                time.sleep(0.5)
+
+            except requests.exceptions.HTTPError as e:
+                raise requests.exceptions.HTTPError(f"Request to {base_url} failed with status code {resp.status_code}") from e
+            except Exception as e:
+                logger.warning("An error occurred while extracting data from the response.")
+                raise
 
         if len(_symbols) < 3900:
             raise ValueError("The complete list of stocks is not available.")

From 4bf9e7221910b9a3a73af6e79555cc77d4f8e4c2 Mon Sep 17 00:00:00 2001
From: shengyuhong
Date: Fri, 28 Mar 2025 13:32:36 +0800
Subject: [PATCH 2/3] fix: resolve #1892 by retrieving the data page by page

---
 scripts/data_collector/utils.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index 46c0e92968f..3a031a03911 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -203,21 +203,11 @@ def _get_symbol():
         {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...}
         """
         # url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
-        # try:
-        #     resp = requests.get(url, timeout=None)
-        #     resp.raise_for_status()
-        # except requests.exceptions.HTTPError as e:
-        #     raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
-
-        # try:
-        #     _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
-        # except Exception as e:
-        #     logger.warning("An error occurred while extracting data from the response.")
-        #     raise
 
         base_url = "http://99.push2.eastmoney.com/api/qt/clist/get"
         params = {
-            "pz": 80,  # 每页返回200条数据
+            "pn": 1,  # page number
+            "pz": 100,  # page size, default to 100
             "po": 1,
             "np": 1,
             "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
             "fields": "f12"

From d50173c7c61e174917cdaf2787d6bc99b8920349 Mon Sep 17 00:00:00 2001
From: Linlang
Date: Sun, 27 Apr 2025 13:14:12 +0800
Subject: [PATCH 3/3] reformat with black

---
 scripts/data_collector/utils.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index 3a031a03911..f32a3065192 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -211,19 +211,19 @@ def _get_symbol():
             "po": 1,
             "np": 1,
             "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
-            "fields": "f12"
+            "fields": "f12",
         }
-
+
         _symbols = []
         page = 1
-
+
         while True:
             params["pn"] = page
             try:
                 resp = requests.get(base_url, params=params, timeout=None)
                 resp.raise_for_status()
                 data = resp.json()
-
+
                 # Check if response contains valid data
                 if not data or "data" not in data or not data["data"] or "diff" not in data["data"]:
                     logger.warning(f"Invalid response structure on page {page}")
@@ -231,23 +231,27 @@ def _get_symbol():
                     break
 
                 # fetch the current page data
                 current_symbols = [_v["f12"] for _v in data["data"]["diff"]]
-
+
                 if not current_symbols:  # It's the last page if there is no data in current page
                     logger.info(f"Last page reached: {page - 1}")
                     break
-
+
                 _symbols.extend(current_symbols)
 
                 # show progress
-                logger.info(f"Page {page}: fetch {len(current_symbols)} stocks:[{current_symbols[0]} ... {current_symbols[-1]}]")
+                logger.info(
+                    f"Page {page}: fetch {len(current_symbols)} stocks:[{current_symbols[0]} ... {current_symbols[-1]}]"
+                )
 
                 page += 1
-
+
                 # sleep time to avoid overloading the server
                 time.sleep(0.5)
-
+
             except requests.exceptions.HTTPError as e:
-                raise requests.exceptions.HTTPError(f"Request to {base_url} failed with status code {resp.status_code}") from e
+                raise requests.exceptions.HTTPError(
+                    f"Request to {base_url} failed with status code {resp.status_code}"
+                ) from e
             except Exception as e:
                 logger.warning("An error occurred while extracting data from the response.")
                 raise