Skip to content

Commit 6aa3161

Browse files
committed
Created using Colab
1 parent b3024d7 commit 6aa3161

1 file changed

Lines changed: 146 additions & 3 deletions

File tree

01_Web_Scraping_Fundamentals.ipynb

Lines changed: 146 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
"base_uri": "https://localhost:8080/"
4444
},
4545
"id": "e193abb6",
46-
"outputId": "07ec6fb7-fd0f-42b2-e5a3-2cb3336d2ae8"
46+
"outputId": "444ba1f5-b172-402d-fab3-0d528bdd552e"
4747
},
4848
"outputs": [
4949
{
@@ -76,14 +76,14 @@
7676
},
7777
{
7878
"cell_type": "code",
79-
"execution_count": 2,
79+
"execution_count": 5,
8080
"id": "94787508",
8181
"metadata": {
8282
"colab": {
8383
"base_uri": "https://localhost:8080/"
8484
},
8585
"id": "94787508",
86-
"outputId": "4a9083ef-fda3-4746-d408-d220808476c9"
86+
"outputId": "2216ea55-efda-444e-b6c6-6e67547ff062"
8787
},
8888
"outputs": [
8989
{
@@ -117,6 +117,149 @@
117117
"- Avoid overloading servers\n",
118118
"- Use delays"
119119
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"source": [
124+
"import requests\n",
125+
"\n",
126+
"response = requests.get(\"https://example.com\")\n",
127+
"\n",
128+
"print(response.status_code) # HTTP status code (e.g., 200)\n",
129+
"print(response.text)         # Response body decoded to a string"
130+
],
131+
"metadata": {
132+
"colab": {
133+
"base_uri": "https://localhost:8080/"
134+
},
135+
"id": "_7h6bi0IeLD-",
136+
"outputId": "b3f14d20-69a7-432e-f760-af2290c1794c"
137+
},
138+
"id": "_7h6bi0IeLD-",
139+
"execution_count": 7,
140+
"outputs": [
141+
{
142+
"output_type": "stream",
143+
"name": "stdout",
144+
"text": [
145+
"200\n",
146+
"<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style></head><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a href=\"https://iana.org/domains/example\">Learn more</a></p></div></body></html>\n",
147+
"\n"
148+
]
149+
}
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"source": [
155+
"import requests\n",
156+
"\n",
157+
"url = \"https://example.com\"\n",
158+
"\n",
159+
"headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
160+
"\n",
161+
"response = requests.get(url, headers=headers)\n",
162+
"\n",
163+
"html_content = response.text\n",
164+
"\n",
165+
"print(html_content[:500])"
166+
],
167+
"metadata": {
168+
"id": "MkGESr4NfJ4O",
169+
"outputId": "4c1ce9cb-c5a8-42f5-c4ce-e9d8269154f8",
170+
"colab": {
171+
"base_uri": "https://localhost:8080/"
172+
}
173+
},
174+
"id": "MkGESr4NfJ4O",
175+
"execution_count": 10,
176+
"outputs": [
177+
{
178+
"output_type": "stream",
179+
"name": "stdout",
180+
"text": [
181+
"<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style></head><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a href=\"https://iana.org/domains/example\">Learn more<\n"
182+
]
183+
}
184+
]
185+
},
186+
{
187+
"cell_type": "code",
188+
"source": [
189+
"import requests\n",
190+
"import random\n",
191+
"import time\n",
192+
"\n",
193+
"url = \"https://example.com\"\n",
194+
"\n",
195+
"# List of common user agents to rotate through\n",
196+
"user_agents = [\n",
197+
" \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\",\n",
198+
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\",\n",
199+
" \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36\",\n",
200+
" \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0\",\n",
201+
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15\"\n",
202+
"]\n",
203+
"\n",
204+
"response = None\n",
205+
"max_retries = 3\n",
206+
"for attempt in range(max_retries):\n",
207+
" try:\n",
208+
" # Randomly select a user agent for each request\n",
209+
" selected_user_agent = random.choice(user_agents)\n",
210+
" headers = {\"User-Agent\": selected_user_agent}\n",
211+
" print(f\"Attempt {attempt + 1}: Using User-Agent: {selected_user_agent[:50]}...\")\n",
212+
"\n",
213+
"        response = requests.get(url, headers=headers, timeout=10) # timeout: without one, requests can wait on a stalled server indefinitely\n",
214+
" response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)\n",
215+
" print(f\"Request successful with status code: {response.status_code}\")\n",
216+
" break # Exit loop if successful\n",
217+
" except requests.exceptions.HTTPError as e:\n",
218+
" print(f\"HTTP error occurred: {e}\")\n",
219+
" except requests.exceptions.ConnectionError as e:\n",
220+
" print(f\"Connection error occurred: {e}\")\n",
221+
" except requests.exceptions.Timeout as e:\n",
222+
" print(f\"Timeout error occurred: {e}\")\n",
223+
" except requests.exceptions.RequestException as e:\n",
224+
" print(f\"An unexpected request error occurred: {e}\")\n",
225+
"\n",
226+
" if attempt < max_retries - 1:\n",
227+
" wait_time = 2 ** attempt # Exponential backoff\n",
228+
" print(f\"Retrying in {wait_time} seconds...\")\n",
229+
" time.sleep(wait_time)\n",
230+
" else:\n",
231+
" print(\"Max retries reached. Could not retrieve content.\")\n",
232+
"\n",
233+
"\n",
234+
"if response and response.status_code == 200:\n",
235+
" html_content = response.text\n",
236+
" print(\"\\nFirst 500 characters of HTML content:\")\n",
237+
" print(html_content[:500])\n",
238+
"else:\n",
239+
" print(\"\\nFailed to retrieve content after multiple attempts.\")"
240+
],
241+
"metadata": {
242+
"colab": {
243+
"base_uri": "https://localhost:8080/"
244+
},
245+
"id": "SdrzlYi7egME",
246+
"outputId": "94513f72-9aef-42d9-a4f0-c90823f106c1"
247+
},
248+
"id": "SdrzlYi7egME",
249+
"execution_count": 9,
250+
"outputs": [
251+
{
252+
"output_type": "stream",
253+
"name": "stdout",
254+
"text": [
255+
"Attempt 1: Using User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Ap...\n",
256+
"Request successful with status code: 200\n",
257+
"\n",
258+
"First 500 characters of HTML content:\n",
259+
"<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style></head><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a href=\"https://iana.org/domains/example\">Learn more<\n"
260+
]
261+
}
262+
]
120263
}
121264
],
122265
"metadata": {

0 commit comments

Comments
 (0)