|
87 | 87 | "print(f\"Total records in dataset: {len(df):,}\")\n", |
88 | 88 | "print(f\"Records with LOS data: {len(df_model):,}\")\n", |
89 | 89 | "print(f\"Percentage with LOS data: {len(df_model) / len(df) * 100:.2f}%\")\n", |
90 | | - "print(df_model[\"length_of_stay\"].value_counts().sort_index())\n" |
| 90 | + "print(df_model[\"length_of_stay\"].value_counts().sort_index())" |
91 | 91 | ] |
92 | 92 | }, |
93 | 93 | { |
|
166 | 166 | ")\n", |
167 | 167 | "print(\n", |
168 | 168 | " f\"Least common class: {los_counts.index[-1]} ({los_counts.values[-1] / len(df_model) * 100:.1f}%)\"\n", |
169 | | - ")\n" |
| 169 | + ")" |
170 | 170 | ] |
171 | 171 | }, |
172 | 172 | { |
|
267 | 267 | "\n", |
268 | 268 | "# Check the new balance\n", |
269 | 269 | "print(\"New Target Distribution:\")\n", |
270 | | - "print(df_model[\"length_of_stay_collapsed\"].value_counts(normalize=True))\n" |
| 270 | + "print(df_model[\"length_of_stay_collapsed\"].value_counts(normalize=True))" |
271 | 271 | ] |
272 | 272 | }, |
273 | 273 | { |
|
350 | 350 | ], |
351 | 351 | "source": [ |
352 | 352 | "X = df_model[features].copy()\n", |
353 | | - "y = df_model['length_of_stay'].copy()\n", |
| 353 | + "y = df_model[\"length_of_stay\"].copy()\n", |
354 | 354 | "\n", |
355 | 355 | "print(f\"Initial feature count: {X.shape[1]}\")\n", |
356 | 356 | "\n", |
|
360 | 360 | "if len(missing_features) > 0:\n", |
361 | 361 | " print(\"\\nFeatures with missing values:\")\n", |
362 | 362 | " for feat, count in missing_features.items():\n", |
363 | | - " print(f\" {feat:30} {count:>8,} ({count/len(X)*100:>5.2f}%)\")\n", |
364 | | - " \n", |
| 363 | + " print(f\" {feat:30} {count:>8,} ({count / len(X) * 100:>5.2f}%)\")\n", |
| 364 | + "\n", |
365 | 365 | " for col in X.columns:\n", |
366 | 366 | " if X[col].isnull().sum() > 0:\n", |
367 | | - " if X[col].dtype in ['object', 'category']:\n", |
368 | | - " X[col] = X[col].fillna('Unknown')\n", |
| 367 | + " if X[col].dtype in [\"object\", \"category\"]:\n", |
| 368 | + " X[col] = X[col].fillna(\"Unknown\")\n", |
369 | 369 | " else:\n", |
370 | 370 | " X[col] = X[col].fillna(X[col].median())\n", |
371 | 371 | "\n", |
372 | | - "categorical_cols = X.select_dtypes(include=['object', 'category']).columns\n", |
| 372 | + "categorical_cols = X.select_dtypes(include=[\"object\", \"category\"]).columns\n", |
373 | 373 | "X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)\n", |
374 | 374 | "\n", |
375 | 375 | "print(f\"\\nFeatures after encoding: {X_encoded.shape[1]}\")\n", |
|
417 | 417 | "X = df_model[features].copy()\n", |
418 | 418 | "\n", |
419 | 419 | "# Define Target\n", |
420 | | - "y = df_model[\"length_of_stay_collapsed\"].copy() \n", |
| 420 | + "y = df_model[\"length_of_stay_collapsed\"].copy()\n", |
421 | 421 | "\n", |
422 | 422 | "# Handle Missing Values & Encoding\n", |
423 | 423 | "categorical_cols = X.select_dtypes(include=[\"object\", \"category\"]).columns\n", |
|
433 | 433 | "X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)\n", |
434 | 434 | "\n", |
435 | 435 | "print(\"Target Class Distribution in Training:\")\n", |
436 | | - "print(y_train_resampled.value_counts(normalize=True))\n" |
| 436 | + "print(y_train_resampled.value_counts(normalize=True))" |
437 | 437 | ] |
438 | 438 | }, |
439 | 439 | { |
|
486 | 486 | " estimator=rf,\n", |
487 | 487 | " param_distributions=param_dist,\n", |
488 | 488 | " n_iter=20,\n", |
489 | | - " cv=3, \n", |
| 489 | + " cv=3,\n", |
490 | 490 | " verbose=2,\n", |
491 | 491 | " random_state=42,\n", |
492 | 492 | " n_jobs=-1,\n", |
|
561 | 561 | "\n", |
562 | 562 | "# This ensures consistency for training and validation splits\n", |
563 | 563 | "X_train_resampled = clean_cols_lgbm(X_train_resampled.copy())\n", |
564 | | - "X_test = clean_cols_lgbm(X_test.copy()) \n", |
| 564 | + "X_test = clean_cols_lgbm(X_test.copy())\n", |
565 | 565 | "\n", |
566 | 566 | "X_train_sub, X_val, y_train_sub, y_val = train_test_split(\n", |
567 | 567 | " X_train_resampled, y_train_resampled, test_size=0.1, random_state=42\n", |
|
600 | 600 | "\n", |
601 | 601 | "print(f\"\\nLightGBM Final Iteration: {best_iteration}\")\n", |
602 | 602 | "print(f\"LightGBM Accuracy: {accuracy_gb:.4f} ({accuracy_gb * 100:.2f}%)\")\n", |
603 | | - "print(f\"LightGBM Macro F1 Score: {f1_gb:.4f}\")\n" |
| 603 | + "print(f\"LightGBM Macro F1 Score: {f1_gb:.4f}\")" |
604 | 604 | ] |
605 | 605 | }, |
606 | 606 | { |
|
786 | 786 | "plt.title(f\"Top 15 Features Predicting Length of Stay Category ({best_name})\")\n", |
787 | 787 | "plt.gca().invert_yaxis()\n", |
788 | 788 | "plt.tight_layout()\n", |
789 | | - "plt.show()\n" |
| 789 | + "plt.show()" |
790 | 790 | ] |
791 | 791 | }, |
792 | 792 | { |
|
856 | 856 | "ax2.set_xlabel(\"Predicted\")\n", |
857 | 857 | "\n", |
858 | 858 | "plt.tight_layout()\n", |
859 | | - "plt.show()\n" |
| 859 | + "plt.show()" |
860 | 860 | ] |
861 | 861 | }, |
862 | 862 | { |
|
0 commit comments