Skip to content

Commit e19c6a4

Browse files
עידן וילנסקי
authored and committed
feat: add ChatGPT scraper and document parsing enhancements
- Added new ChatGPT scraper that can scrape responses per given prompt
- Enhanced document download function with parsing capabilities
- Bumped version to 1.0.7
1 parent 282e842 commit e19c6a4

8 files changed

Lines changed: 6772 additions & 5 deletions

brightdata/api/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from .scraper import WebScraper
22
from .search import SearchAPI
3+
from .chatgpt import ChatGPTAPI
34

45
__all__ = [
56
'WebScraper',
6-
'SearchAPI'
7+
'SearchAPI',
8+
'ChatGPTAPI'
79
]

brightdata/api/chatgpt.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import json
2+
import requests
3+
from typing import Union, Dict, Any, List
4+
5+
from ..utils import get_logger
6+
from ..exceptions import ValidationError, APIError, AuthenticationError
7+
8+
logger = get_logger('api.chatgpt')
9+
10+
11+
class ChatGPTAPI:
    """Handles ChatGPT scraping operations using Bright Data's ChatGPT dataset API.

    The class only triggers a scraping job; the trigger response (which is
    expected to carry a ``snapshot_id``) is returned to the caller as-is.
    """

    # Endpoint and dataset identifier sent with every trigger request.
    _TRIGGER_URL = "https://api.brightdata.com/datasets/v3/trigger"
    _DATASET_ID = "gd_m7aof0k82r803d5bjm"

    def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
        """Store the HTTP session, API token and request settings.

        Parameters:
        - session: object exposing a ``post`` method compatible with
          ``requests.Session.post``
        - api_token: token placed in the ``Authorization: Bearer`` header
        - default_timeout: timeout (seconds) used when a call passes none
        - max_retries / retry_backoff: stored on the instance; not used by
          the methods visible in this class
        """
        self.session = session
        self.api_token = api_token
        self.default_timeout = default_timeout
        self.max_retries = max_retries
        self.retry_backoff = retry_backoff

    def scrape_chatgpt(
        self,
        prompts: List[str],
        countries: List[str],
        additional_prompts: List[str],
        web_searches: List[bool],
        timeout: Union[int, None] = None
    ) -> Dict[str, Any]:
        """
        Internal method to handle ChatGPT scraping API requests

        One request entry is built per prompt; entry ``i`` combines
        ``prompts[i]``, ``countries[i]``, ``additional_prompts[i]`` and
        ``web_searches[i]``. Items beyond ``len(prompts)`` in the other
        lists are ignored.

        Parameters:
        - prompts: List of prompts to send to ChatGPT
        - countries: List of country codes matching prompts
        - additional_prompts: List of follow-up prompts matching prompts
        - web_searches: List of web_search flags matching prompts
        - timeout: Request timeout in seconds (falls back to
          ``self.default_timeout`` when omitted or falsy)

        Returns:
        - Dict containing response with snapshot_id

        Raises:
        - ValidationError: a per-prompt list is shorter than ``prompts``
        - AuthenticationError: the API answered with HTTP 401
        - APIError: any other non-200 status, timeout, network failure,
          unparsable response body, or unexpected error
        """
        # Fail early with a descriptive error instead of an IndexError while
        # pairing the per-prompt lists below.
        expected = len(prompts)
        for label, values in (
            ("countries", countries),
            ("additional_prompts", additional_prompts),
            ("web_searches", web_searches),
        ):
            if len(values) < expected:
                raise ValidationError(
                    f"'{label}' has {len(values)} item(s) but {expected} prompt(s) were provided"
                )

        headers = {
            "Authorization": f"Bearer {self.api_token}",
            "Content-Type": "application/json"
        }
        params = {
            "dataset_id": self._DATASET_ID,
            "include_errors": "true"
        }

        # zip stops at the shortest input, which is `prompts` after the
        # length check above.
        data = [
            {
                "url": "https://chatgpt.com/",
                "prompt": prompt,
                "country": country,
                "additional_prompt": additional_prompt,
                "web_search": web_search
            }
            for prompt, country, additional_prompt, web_search in zip(
                prompts, countries, additional_prompts, web_searches
            )
        ]

        try:
            response = self.session.post(
                self._TRIGGER_URL,
                headers=headers,
                params=params,
                json=data,
                timeout=timeout or self.default_timeout
            )

            if response.status_code == 401:
                raise AuthenticationError("Invalid API token or insufficient permissions")
            if response.status_code != 200:
                raise APIError(
                    f"ChatGPT scraping request failed with status {response.status_code}: {response.text}"
                )

            # Parse separately: the decode error raised by requests is also a
            # RequestException, so handling it in the outer clauses would
            # misreport a malformed body as a network error.
            try:
                result = response.json()
            except ValueError as e:
                raise APIError(f"Failed to parse ChatGPT scraping response: {str(e)}") from e

            snapshot_id = result.get('snapshot_id')
            if snapshot_id:
                logger.info(
                    "ChatGPT scraping job initiated successfully for %d prompt(s)",
                    len(prompts)
                )
                # Echo the snapshot ID on stdout (blank line before and after).
                print(f"\nSnapshot ID:\n{snapshot_id}\n")

            return result

        except (ValidationError, AuthenticationError, APIError):
            # Already one of the package's own errors; propagate untouched.
            raise
        except requests.exceptions.Timeout as e:
            raise APIError("Timeout while initiating ChatGPT scraping") from e
        except requests.exceptions.RequestException as e:
            raise APIError(f"Network error during ChatGPT scraping: {str(e)}") from e
        except Exception as e:
            raise APIError(f"Unexpected error during ChatGPT scraping: {str(e)}") from e

0 commit comments

Comments
 (0)