Skip to content

Commit a1f50c2

Browse files
Understanding the Data and Loading and Exploring the Data
1 parent 8801167 commit a1f50c2

1 file changed

Lines changed: 244 additions & 18 deletions

File tree

4_data_analysis/MLProject.ipynb

Lines changed: 244 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": 11,
1414
"id": "9beb769c",
1515
"metadata": {},
1616
"outputs": [
@@ -20,34 +20,163 @@
2020
"'/home/clif_lastrophysicien/ELO2_LAPERLE_HT/4_data_analysis'"
2121
]
2222
},
23-
"execution_count": 1,
23+
"execution_count": 11,
2424
"metadata": {},
2525
"output_type": "execute_result"
2626
}
2727
],
2828
"source": [
29-
"import os\n",
30-
"import pandas as pd\n",
31-
"import numpy as np \n",
29+
"import os \n",
3230
"import sys\n",
3331
"\n",
3432
"os.getcwd()\n"
3533
]
3634
},
35+
{
36+
"cell_type": "markdown",
37+
"id": "4031f41a",
38+
"metadata": {},
39+
"source": [
40+
"Step 1: Understanding the Data"
41+
]
42+
},
3743
{
3844
"cell_type": "code",
39-
"execution_count": 2,
40-
"id": "4906fae4",
45+
"execution_count": 15,
46+
"id": "cd59ce1b",
4147
"metadata": {},
4248
"outputs": [],
4349
"source": [
44-
"data = pd.read_csv('UN_tourism_caribbean_countries_cleaned.csv')"
50+
"# Import necessary libraries\n",
51+
"import pandas as pd\n",
52+
"import numpy as np\n",
53+
"import matplotlib.pyplot as plt\n",
54+
"import seaborn as sns\n",
55+
"from sklearn.model_selection import train_test_split\n",
56+
"from sklearn.linear_model import LinearRegression\n",
57+
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
58+
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
59+
"import warnings\n",
60+
"\n",
61+
"warnings.filterwarnings(\"ignore\")\n",
62+
"\n",
63+
"# Set display options\n",
64+
"pd.set_option(\"display.max_columns\", None)\n",
65+
"pd.set_option(\"display.max_rows\", 100)\n"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 14,
71+
"id": "7c11579f",
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"# Set style for better visualizations\n",
76+
"plt.style.use(\"seaborn-v0_8-darkgrid\")\n",
77+
"sns.set_palette(\"husl\")\n"
78+
]
79+
},
80+
{
81+
"cell_type": "markdown",
82+
"id": "7dfb6d43",
83+
"metadata": {},
84+
"source": [
85+
"2: Loading and Exploring the Data"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 16,
91+
"id": "3b222fd2",
92+
"metadata": {},
93+
"outputs": [
94+
{
95+
"name": "stdout",
96+
"output_type": "stream",
97+
"text": [
98+
"=== DATASET INFORMATION ===\n",
99+
"Shape: (3025, 6)\n",
100+
"\n",
101+
"Columns: ['type_of_visitors', 'country_receiving', 'where_tourist_from', 'year', 'number_of_tourist', 'unit']\n",
102+
"\n",
103+
"Data Types:\n",
104+
"type_of_visitors object\n",
105+
"country_receiving object\n",
106+
"where_tourist_from object\n",
107+
"year int64\n",
108+
"number_of_tourist float64\n",
109+
"unit object\n",
110+
"dtype: object\n",
111+
"\n",
112+
"Missing Values:\n",
113+
"type_of_visitors 0\n",
114+
"country_receiving 0\n",
115+
"where_tourist_from 0\n",
116+
"year 0\n",
117+
"number_of_tourist 0\n",
118+
"unit 0\n",
119+
"dtype: int64\n",
120+
"\n",
121+
"First 5 rows:\n",
122+
" type_of_visitors country_receiving where_tourist_from year \\\n",
123+
"0 excursionists Antigua and Barbuda World 1995 \n",
124+
"1 excursionists Antigua and Barbuda World 1996 \n",
125+
"2 excursionists Antigua and Barbuda World 1997 \n",
126+
"3 excursionists Antigua and Barbuda World 1998 \n",
127+
"4 excursionists Antigua and Barbuda World 1999 \n",
128+
"\n",
129+
" number_of_tourist unit \n",
130+
"0 227.0 thousand trips \n",
131+
"1 270.0 thousand trips \n",
132+
"2 286.0 thousand trips \n",
133+
"3 336.0 thousand trips \n",
134+
"4 328.0 thousand trips \n",
135+
"\n",
136+
"=== BASIC STATISTICS ===\n",
137+
" year number_of_tourist\n",
138+
"count 3025.000000 3025.000000\n",
139+
"mean 2008.196033 821.920078\n",
140+
"std 7.987771 1142.864089\n",
141+
"min 1995.000000 0.100000\n",
142+
"25% 2001.000000 170.000000\n",
143+
"50% 2008.000000 424.000000\n",
144+
"75% 2015.000000 972.000000\n",
145+
"max 2024.000000 11162.229723\n",
146+
"\n",
147+
"=== UNIQUE VALUES ===\n",
148+
"Visitor types: ['excursionists' 'tourists' 'visitors_total']\n",
149+
"Number of countries: 30\n",
150+
"Years range: 1995 to 2024\n"
151+
]
152+
}
153+
],
154+
"source": [
155+
"# Load the dataset\n",
156+
"df = pd.read_csv(\"UN_tourism_caribbean_countries_cleaned.csv\")\n",
157+
"\n",
158+
"# Display basic information\n",
159+
"print(\"=== DATASET INFORMATION ===\")\n",
160+
"print(f\"Shape: {df.shape}\")\n",
161+
"print(f\"\\nColumns: {df.columns.tolist()}\")\n",
162+
"print(f\"\\nData Types:\\n{df.dtypes}\")\n",
163+
"print(f\"\\nMissing Values:\\n{df.isnull().sum()}\")\n",
164+
"print(f\"\\nFirst 5 rows:\")\n",
165+
"print(df.head())\n",
166+
"\n",
167+
"print(f\"\\n=== BASIC STATISTICS ===\")\n",
168+
"print(df.describe())\n",
169+
"\n",
170+
"print(f\"\\n=== UNIQUE VALUES ===\")\n",
171+
"print(f\"Visitor types: {df['type_of_visitors'].unique()}\")\n",
172+
"print(f\"Number of countries: {df['country_receiving'].nunique()}\")\n",
173+
"print(f\"Years range: {df['year'].min()} to {df['year'].max()}\")\n"
45174
]
46175
},
47176
{
48177
"cell_type": "code",
49-
"execution_count": 6,
50-
"id": "bf9cb96f",
178+
"execution_count": 20,
179+
"id": "acc8bd2f",
51180
"metadata": {},
52181
"outputs": [
53182
{
@@ -68,19 +197,118 @@
68197
"dtypes: float64(1), int64(1), object(4)\n",
69198
"memory usage: 141.9+ KB\n"
70199
]
200+
},
201+
{
202+
"data": {
203+
"text/html": [
204+
"<div>\n",
205+
"<style scoped>\n",
206+
" .dataframe tbody tr th:only-of-type {\n",
207+
" vertical-align: middle;\n",
208+
" }\n",
209+
"\n",
210+
" .dataframe tbody tr th {\n",
211+
" vertical-align: top;\n",
212+
" }\n",
213+
"\n",
214+
" .dataframe thead th {\n",
215+
" text-align: right;\n",
216+
" }\n",
217+
"</style>\n",
218+
"<table border=\"1\" class=\"dataframe\">\n",
219+
" <thead>\n",
220+
" <tr style=\"text-align: right;\">\n",
221+
" <th></th>\n",
222+
" <th>type_of_visitors</th>\n",
223+
" <th>country_receiving</th>\n",
224+
" <th>where_tourist_from</th>\n",
225+
" <th>year</th>\n",
226+
" <th>number_of_tourist</th>\n",
227+
" <th>unit</th>\n",
228+
" </tr>\n",
229+
" </thead>\n",
230+
" <tbody>\n",
231+
" <tr>\n",
232+
" <th>0</th>\n",
233+
" <td>excursionists</td>\n",
234+
" <td>Antigua and Barbuda</td>\n",
235+
" <td>World</td>\n",
236+
" <td>1995</td>\n",
237+
" <td>227.0</td>\n",
238+
" <td>thousand trips</td>\n",
239+
" </tr>\n",
240+
" <tr>\n",
241+
" <th>1</th>\n",
242+
" <td>excursionists</td>\n",
243+
" <td>Antigua and Barbuda</td>\n",
244+
" <td>World</td>\n",
245+
" <td>1996</td>\n",
246+
" <td>270.0</td>\n",
247+
" <td>thousand trips</td>\n",
248+
" </tr>\n",
249+
" <tr>\n",
250+
" <th>2</th>\n",
251+
" <td>excursionists</td>\n",
252+
" <td>Antigua and Barbuda</td>\n",
253+
" <td>World</td>\n",
254+
" <td>1997</td>\n",
255+
" <td>286.0</td>\n",
256+
" <td>thousand trips</td>\n",
257+
" </tr>\n",
258+
" <tr>\n",
259+
" <th>3</th>\n",
260+
" <td>excursionists</td>\n",
261+
" <td>Antigua and Barbuda</td>\n",
262+
" <td>World</td>\n",
263+
" <td>1998</td>\n",
264+
" <td>336.0</td>\n",
265+
" <td>thousand trips</td>\n",
266+
" </tr>\n",
267+
" <tr>\n",
268+
" <th>4</th>\n",
269+
" <td>excursionists</td>\n",
270+
" <td>Antigua and Barbuda</td>\n",
271+
" <td>World</td>\n",
272+
" <td>1999</td>\n",
273+
" <td>328.0</td>\n",
274+
" <td>thousand trips</td>\n",
275+
" </tr>\n",
276+
" </tbody>\n",
277+
"</table>\n",
278+
"</div>"
279+
],
280+
"text/plain": [
281+
" type_of_visitors country_receiving where_tourist_from year \\\n",
282+
"0 excursionists Antigua and Barbuda World 1995 \n",
283+
"1 excursionists Antigua and Barbuda World 1996 \n",
284+
"2 excursionists Antigua and Barbuda World 1997 \n",
285+
"3 excursionists Antigua and Barbuda World 1998 \n",
286+
"4 excursionists Antigua and Barbuda World 1999 \n",
287+
"\n",
288+
" number_of_tourist unit \n",
289+
"0 227.0 thousand trips \n",
290+
"1 270.0 thousand trips \n",
291+
"2 286.0 thousand trips \n",
292+
"3 336.0 thousand trips \n",
293+
"4 328.0 thousand trips "
294+
]
295+
},
296+
"execution_count": 20,
297+
"metadata": {},
298+
"output_type": "execute_result"
71299
}
72300
],
73301
"source": [
74-
"data.info()\n"
302+
"data.info()\n",
303+
"\n",
304+
"data.head()\n"
75305
]
76306
},
77307
{
78308
"cell_type": "markdown",
79-
"id": "7dfb6d43",
309+
"id": "c7b16997",
80310
"metadata": {},
81-
"source": [
82-
"Data Analysis and Linear Regression Model\n"
83-
]
311+
"source": []
84312
},
85313
{
86314
"cell_type": "markdown",
@@ -114,9 +342,7 @@
114342
"cell_type": "markdown",
115343
"id": "2f1c0a05",
116344
"metadata": {},
117-
"source": [
118-
"2. Data Preparation for Linear Regression"
119-
]
345+
"source": []
120346
},
121347
{
122348
"cell_type": "code",

0 commit comments

Comments
 (0)