Preparing Data for Regression

Lastrophysicien · Lastrophysicien · commit be1d4b690ee3 · 2025-12-06T10:27:29.000-05:00
diff --git a/4_data_analysis/MLProject.ipynb b/4_data_analysis/MLProject.ipynb
@@ -568,10 +568,75 @@
     "print(visitor_type_stats)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3c32b2c6",
+   "metadata": {},
+   "source": [
+    "## 5. Preparing Data for Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "142d44a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== PREPARING DATA FOR REGRESSION ===\n",
+      "Features shape: (3025, 7)\n",
+      "Target shape: (3025,)\n",
+      "\n",
+      "Training set size: 2420 samples\n",
+      "Testing set size: 605 samples\n",
+      "Features scaled using StandardScaler\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 5.1: Choose the model inputs (X) and the value to predict (y)\n",
+    "print(\"=== PREPARING DATA FOR REGRESSION ===\")\n",
+    "\n",
+    "# X keeps seven columns of df, in the order listed below;\n",
+    "# y is df's number_of_tourist column.\n",
+    "X = df[\n",
+    "    [\n",
+    "        \"year\",\n",
+    "        \"country_encoded\",\n",
+    "        \"visitor_type_encoded\",\n",
+    "        \"decade\",\n",
+    "        \"post_2000\",\n",
+    "        \"post_2010\",\n",
+    "        \"covid_period\",\n",
+    "    ]\n",
+    "]\n",
+    "y = df[\"number_of_tourist\"]\n",
+    "\n",
+    "print(f\"Features shape: {X.shape}\")\n",
+    "print(f\"Target shape: {y.shape}\")\n",
+    "\n",
+    "# Step 5.2: Hold out 20% of the rows for testing.\n",
+    "# Rows are shuffled with a fixed seed so the split is repeatable.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X,\n",
+    "    y,\n",
+    "    test_size=0.2,\n",
+    "    shuffle=True,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nTraining set size: {X_train.shape[0]} samples\")\n",
+    "print(f\"Testing set size: {X_test.shape[0]} samples\")\n",
+    "\n",
+    "# Step 5.3: Standardize the features.\n",
+    "# The scaler is fitted on the training rows only, then applied to both splits.\n",
+    "scaler = StandardScaler().fit(X_train)\n",
+    "X_train_scaled = scaler.transform(X_train)\n",
+    "X_test_scaled = scaler.transform(X_test)\n",
+    "\n",
+    "print(\"Features scaled using StandardScaler\")\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5a269c6f",
+   "id": "fceebc18",
    "metadata": {},
    "outputs": [],
    "source": []