Skip to content

Commit 04a2e9f

Browse files
committed
Created using Colab
1 parent 6aa3161 commit 04a2e9f

1 file changed

Lines changed: 207 additions & 11 deletions

File tree

02_BeautifulSoup_Web_Scraping.ipynb

Lines changed: 207 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,23 @@
1717
"id": "252c1849"
1818
},
1919
"source": [
20-
"# BeautifulSoup Web Scraping"
20+
"# BeautifulSoup Web Scraping\n",
21+
"\n",
22+
"**Beautiful Soup** is a widely used Python library designed for parsing HTML and XML documents. It is an essential tool for web scraping, making it easy to navigate, search, and modify the parse tree to extract specific data.\n",
23+
"\n",
24+
"**PyPI:** https://pypi.org/project/beautifulsoup4/"
2125
]
2226
},
2327
{
2428
"cell_type": "code",
25-
"execution_count": 1,
29+
"execution_count": null,
2630
"id": "fa32b313",
2731
"metadata": {
2832
"id": "fa32b313"
2933
},
3034
"outputs": [],
3135
"source": [
32-
"!pip install beautifulsoup4 requests pandas -q"
36+
"# !pip install beautifulsoup4 requests pandas -q"
3337
]
3438
},
3539
{
@@ -44,14 +48,14 @@
4448
},
4549
{
4650
"cell_type": "code",
47-
"execution_count": 2,
51+
"execution_count": 1,
4852
"id": "658ebf4e",
4953
"metadata": {
5054
"colab": {
5155
"base_uri": "https://localhost:8080/"
5256
},
5357
"id": "658ebf4e",
54-
"outputId": "75efad64-b03d-4f56-fd33-fa518e35bee8"
58+
"outputId": "28bfad7e-45f1-4c41-f7b9-78284b3a6910"
5559
},
5660
"outputs": [
5761
{
@@ -88,14 +92,14 @@
8892
},
8993
{
9094
"cell_type": "code",
91-
"execution_count": 3,
95+
"execution_count": 8,
9296
"id": "283e0f3b",
9397
"metadata": {
9498
"colab": {
9599
"base_uri": "https://localhost:8080/"
96100
},
97101
"id": "283e0f3b",
98-
"outputId": "401568fa-8837-4b6a-91e8-9482a02ef594"
102+
"outputId": "274fc91c-6cf2-4c2f-f23f-b808e5b1620f"
99103
},
100104
"outputs": [
101105
{
@@ -126,14 +130,14 @@
126130
},
127131
{
128132
"cell_type": "code",
129-
"execution_count": 4,
133+
"execution_count": 3,
130134
"id": "26888879",
131135
"metadata": {
132136
"colab": {
133137
"base_uri": "https://localhost:8080/"
134138
},
135139
"id": "26888879",
136-
"outputId": "3f30e545-5cf5-4cdf-aca3-ce021b9d9d80"
140+
"outputId": "8ecd1eb9-3524-44ea-a2ea-49c300373882"
137141
},
138142
"outputs": [
139143
{
@@ -162,7 +166,7 @@
162166
},
163167
{
164168
"cell_type": "code",
165-
"execution_count": 5,
169+
"execution_count": null,
166170
"id": "234a72c0",
167171
"metadata": {
168172
"colab": {
@@ -200,7 +204,7 @@
200204
},
201205
{
202206
"cell_type": "code",
203-
"execution_count": 6,
207+
"execution_count": null,
204208
"id": "0b69b322",
205209
"metadata": {
206210
"colab": {
@@ -361,6 +365,198 @@
361365
"\n",
362366
"df.head()"
363367
]
368+
},
369+
{
370+
"cell_type": "markdown",
371+
"source": [
372+
"\n",
373+
"\n",
374+
"---\n",
375+
"\n"
376+
],
377+
"metadata": {
378+
"id": "agK7X3LNgXLF"
379+
},
380+
"id": "agK7X3LNgXLF"
381+
},
382+
{
383+
"cell_type": "markdown",
384+
"source": [
385+
"**Step 1: Fetch the page HTML with `requests`**"
386+
],
387+
"metadata": {
388+
"id": "X95tqIFJhKO1"
389+
},
390+
"id": "X95tqIFJhKO1"
391+
},
392+
{
393+
"cell_type": "code",
394+
"source": [
395+
"import requests\n",
396+
"\n",
397+
"url = \"https://lovnishverma.in\"\n",
398+
"\n",
399+
"headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
400+
"\n",
401+
"response = requests.get(url, headers=headers)\n",
402+
"\n",
403+
"html_content = response.text\n",
404+
"\n",
405+
"print(html_content[:500])"
406+
],
407+
"metadata": {
408+
"colab": {
409+
"base_uri": "https://localhost:8080/"
410+
},
411+
"id": "ZE0wdlfBgZCQ",
412+
"outputId": "98f6045b-39ff-4986-f4a0-8277f6274439"
413+
},
414+
"id": "ZE0wdlfBgZCQ",
415+
"execution_count": 6,
416+
"outputs": [
417+
{
418+
"output_type": "stream",
419+
"name": "stdout",
420+
"text": [
421+
"<!DOCTYPE html>\n",
422+
"<html lang=\"en\" data-theme=\"dark\">\n",
423+
"\n",
424+
"<head>\n",
425+
" <meta charset=\"UTF-8\" />\n",
426+
" <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\" />\n",
427+
" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n",
428+
" <meta name=\"google-site-verification\" content=\"6LXGQp1AM1BbZAD8hGIdRZosN6dBokITCRz1VKWKwus\" />\n",
429+
" <link rel=\"preconnect\" href=\"https://fonts.googleapis.com\">\n",
430+
" <link rel=\"preconnect\" href=\"https://cdn.jsdelivr.net\">\n",
431+
" <link rel=\"preconnect\" href=\"https://api.rss2jso\n"
432+
]
433+
}
434+
]
435+
},
436+
{
437+
"cell_type": "markdown",
438+
"source": [
439+
"**Step 2: Parse the HTML with BeautifulSoup**"
440+
],
441+
"metadata": {
442+
"id": "9xoXlgDUhOxM"
443+
},
444+
"id": "9xoXlgDUhOxM"
445+
},
446+
{
447+
"cell_type": "code",
448+
"source": [
449+
"from bs4 import BeautifulSoup\n",
450+
"\n",
451+
"soup = BeautifulSoup(html_content, \"html.parser\")\n",
452+
"\n",
453+
"print(type(soup))"
454+
],
455+
"metadata": {
456+
"colab": {
457+
"base_uri": "https://localhost:8080/"
458+
},
459+
"id": "v4-2rB9Qg1hM",
460+
"outputId": "89e0c9ac-97c8-4445-d0e3-86d6c5ec4957"
461+
},
462+
"id": "v4-2rB9Qg1hM",
463+
"execution_count": 9,
464+
"outputs": [
465+
{
466+
"output_type": "stream",
467+
"name": "stdout",
468+
"text": [
469+
"<class 'bs4.BeautifulSoup'>\n"
470+
]
471+
}
472+
]
473+
},
474+
{
475+
"cell_type": "markdown",
476+
"source": [
477+
"**Step 3: Extract the project titles from the `h3.project-title` elements**"
478+
],
479+
"metadata": {
480+
"id": "WdWC0u_UhQqs"
481+
},
482+
"id": "WdWC0u_UhQqs"
483+
},
484+
{
485+
"cell_type": "code",
486+
"source": [
487+
"# extract <h3 class=\"project-title\">Project Title</h3>\n",
488+
"\n",
489+
"heading = soup.find(\"h3\", class_=\"project-title\")\n",
490+
"\n",
491+
"print(heading.get_text(strip=True))"
492+
],
493+
"metadata": {
494+
"colab": {
495+
"base_uri": "https://localhost:8080/"
496+
},
497+
"id": "zNqepeGugnHs",
498+
"outputId": "51669e27-21b5-47dc-ed44-6c676a096aeb"
499+
},
500+
"id": "zNqepeGugnHs",
501+
"execution_count": 11,
502+
"outputs": [
503+
{
504+
"output_type": "stream",
505+
"name": "stdout",
506+
"text": [
507+
"NIELIT StudentHub\n"
508+
]
509+
}
510+
]
511+
},
512+
{
513+
"cell_type": "code",
514+
"metadata": {
515+
"colab": {
516+
"base_uri": "https://localhost:8080/"
517+
},
518+
"id": "dd6bb1df",
519+
"outputId": "90208177-0e63-4b05-e2e3-8f01b8577762"
520+
},
521+
"source": [
522+
"project_titles = soup.find_all(\"h3\", class_=\"project-title\")\n",
523+
"\n",
524+
"for title in project_titles:\n",
525+
" print(title.get_text(strip=True))"
526+
],
527+
"id": "dd6bb1df",
528+
"execution_count": 12,
529+
"outputs": [
530+
{
531+
"output_type": "stream",
532+
"name": "stdout",
533+
"text": [
534+
"NIELIT StudentHub\n",
535+
"NIELIT Ropar AI Assistant\n",
536+
"Brain Tumor Detection AI\n",
537+
"Diabetes Risk Predictor\n",
538+
"Employee Geo-Tracker\n",
539+
"Intelligent Book Recommender\n",
540+
"Hostel Gatepass Management System\n",
541+
"AI Mask Compliance Monitor\n",
542+
"Karsog.com - Tourism & Business Directory\n",
543+
"SmartLane AI\n",
544+
"Live DHT11 Sensor Data Dashboard\n",
545+
"SecureCert: Blockchain-Based Certificate Verification\n",
546+
"Iris Species Classifier\n",
547+
"Free QR Code Generator + Scanner\n",
548+
"Face and Emotion Recognition-Based Attendance System\n",
549+
"Flask E-Commerce Store\n",
550+
"360° NIELIT Image Gallery\n",
551+
"Movie Recommender System\n",
552+
"Face Recognition Attendance System\n",
553+
"HRTC Bus Timetable\n",
554+
"Digital Restaurant Menu\n",
555+
"FileMailer - Secure File Email Service\n",
556+
"Flask Web Scraper\n"
557+
]
558+
}
559+
]
364560
}
365561
],
366562
"metadata": {

0 commit comments

Comments
 (0)