|
14 | 14 | "id": "24d2d933", |
15 | 15 | "metadata": {}, |
16 | 16 | "source": [ |
17 | | - " # 09. Machine Learning with Numerics\n", |
| 17 | + " # 11. Machine Learning with Numerics\n", |
18 | 18 | "This notebook explores machine learning capabilities in Numerics.\n", |
19 | 19 | "\n", |
20 | 20 | "## What You'll Learn\n", |
|
552 | 552 | }, |
553 | 553 | { |
554 | 554 | "cell_type": "code", |
555 | | - "execution_count": 6, |
| 555 | + "execution_count": null, |
556 | 556 | "id": "5ebaa4fd", |
557 | 557 | "metadata": {}, |
558 | 558 | "outputs": [ |
|
752 | 752 | "sklearn_centers = kmeans_sklearn.cluster_centers_\n", |
753 | 753 | "\n", |
754 | 754 | "# Match cluster labels -- Numerics and sklearn return labels in a random order (so they will not automatically match)\n", |
755 | | - "# Sqaured Euclidean distance between each pair of centers'\n", |
| 755 | + "# Euclidean distance between each pair of centers\n", |
756 | 756 | "cost_matrix = np.linalg.norm(numerics_centers[:, None, :] - sklearn_centers[None, :, :], axis=2)\n", |
757 | 757 | "# rowIndex = Numerics cluster index, colIndex = sklearn cluster index\n", |
758 | 758 | "row_ind, col_ind = linear_sum_assignment(cost_matrix)\n", |
|
904 | 904 | }, |
905 | 905 | { |
906 | 906 | "cell_type": "code", |
907 | | - "execution_count": 8, |
| 907 | + "execution_count": null, |
908 | 908 | "id": "1c8a5985", |
909 | 909 | "metadata": {}, |
910 | 910 | "outputs": [ |
|
998 | 998 | "# Fit mixture model and pull out labels (i.e. clusters)\n", |
999 | 999 | "gmm_ms = timed_fit(lambda: gmm.Train(12345, True))\n", |
1000 | 1000 | "labels_gmm = np.array(list(gmm.Labels), dtype=int)\n", |
1001 | | - "# sklearn Rand index adjusted for chance (computes similarity measure between two clusterings)\n", |
| 1001 | + "# sklearn Rand index adjusted for chance (computes similarity measure between two clusterings)\n", |
1002 | 1002 | "ari_gmm = adjusted_rand_score(y_iris, labels_gmm) # NOTE: Numerics has no ARI equivalent yet\n", |
1003 | 1003 | "\n", |
1004 | 1004 | "n, k = X_gmm.shape[0], 3\n", |
|
1046 | 1046 | }, |
1047 | 1047 | { |
1048 | 1048 | "cell_type": "code", |
1049 | | - "execution_count": 9, |
| 1049 | + "execution_count": null, |
1050 | 1050 | "id": "3e679f8c", |
1051 | 1051 | "metadata": {}, |
1052 | 1052 | "outputs": [ |
|
1130 | 1130 | "# Split data into training and testing sets\n", |
1131 | 1131 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)\n", |
1132 | 1132 | "\n", |
1133 | | - "# Standardize by removing the mean and scaling to unit varaiance\n", |
| 1133 | + "# Standardize by removing the mean and scaling to unit variance\n", |
1134 | 1134 | "scaler = StandardScaler()\n", |
1135 | 1135 | "X_train_s = scaler.fit_transform(X_train)\n", |
1136 | 1136 | "X_test_s = scaler.transform(X_test)\n", |
|
1186 | 1186 | }, |
1187 | 1187 | { |
1188 | 1188 | "cell_type": "code", |
1189 | | - "execution_count": 10, |
| 1189 | + "execution_count": null, |
1190 | 1190 | "id": "82e51962", |
1191 | 1191 | "metadata": {}, |
1192 | 1192 | "outputs": [ |
|
1270 | 1270 | "# Split into training and testing sets\n", |
1271 | 1271 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)\n", |
1272 | 1272 | "\n", |
1273 | | - "# Standardize by removing the mean and scaling to unit varaiance\n", |
| 1273 | + "# Standardize by removing the mean and scaling to unit variance\n", |
1274 | 1274 | "scaler = StandardScaler()\n", |
1275 | 1275 | "X_train_s = scaler.fit_transform(X_train)\n", |
1276 | 1276 | "X_test_s = scaler.transform(X_test)\n", |
|
1285 | 1285 | "\n", |
1286 | 1286 | "knn_reg_ms = timed_fit(lambda: knn_reg.Predict(Matrix(X_test_net)))\n", |
1287 | 1287 | "y_pred = np.array(list(knn_reg.Predict(Matrix(X_test_net))), dtype=float)\n", |
1288 | | - "# Performace metrics\n", |
| 1288 | + "# Performance metrics\n", |
1289 | 1289 | "y_test_gof = convert_to_dotnet_array(y_test.astype(float))\n", |
1290 | 1290 | "y_pred_gof = convert_to_dotnet_array(y_pred.astype(float))\n", |
1291 | 1291 | "rmse = np.sqrt(GoodnessOfFit.MSE(y_test_gof, y_pred_gof))\n", |
|
1334 | 1334 | }, |
1335 | 1335 | { |
1336 | 1336 | "cell_type": "code", |
1337 | | - "execution_count": 11, |
| 1337 | + "execution_count": null, |
1338 | 1338 | "id": "4d31156b", |
1339 | 1339 | "metadata": {}, |
1340 | 1340 | "outputs": [ |
|
1422 | 1422 | "y_train_net = convert_to_dotnet_array(y_train)\n", |
1423 | 1423 | "X_test_net = convert_to_dotnet_2d_array(X_test)\n", |
1424 | 1424 | "\n", |
1425 | | - "# Intializie\n", |
| 1425 | + "# Initialize\n", |
1426 | 1426 | "dt_clf = DecisionTree(Matrix(X_train_net), Vector(y_train_net), 12345)\n", |
1427 | 1427 | "dt_clf.IsRegression = False\n", |
1428 | 1428 | "# Depth control is very important for decision trees\n", |
|
1476 | 1476 | }, |
1477 | 1477 | { |
1478 | 1478 | "cell_type": "code", |
1479 | | - "execution_count": 12, |
| 1479 | + "execution_count": null, |
1480 | 1480 | "id": "4ea747d7", |
1481 | 1481 | "metadata": {}, |
1482 | 1482 | "outputs": [ |
|
1574 | 1574 | "dt_reg_ms = timed_fit(lambda: dt_reg.Train())\n", |
1575 | 1575 | "y_pred = np.array(list(dt_reg.Predict(Matrix(X_test_net))), dtype=float)\n", |
1576 | 1576 | "\n", |
1577 | | - "# Performace metrics\n", |
| 1577 | + "# Performance metrics\n", |
1578 | 1578 | "y_test_gof = convert_to_dotnet_array(y_test.astype(float))\n", |
1579 | 1579 | "y_pred_gof = convert_to_dotnet_array(y_pred.astype(float))\n", |
1580 | 1580 | "rmse = np.sqrt(GoodnessOfFit.MSE(y_test_gof, y_pred_gof))\n", |
|
1624 | 1624 | }, |
1625 | 1625 | { |
1626 | 1626 | "cell_type": "code", |
1627 | | - "execution_count": 13, |
| 1627 | + "execution_count": null, |
1628 | 1628 | "id": "c4198906", |
1629 | 1629 | "metadata": {}, |
1630 | 1630 | "outputs": [ |
|
1724 | 1724 | "n = pred_raw.GetLength(0)\n", |
1725 | 1725 | "y_pred = np.array([pred_raw[i, 1] for i in range(n)], dtype=float)\n", |
1726 | 1726 | "\n", |
1727 | | - "# Performace metrics\n", |
| 1727 | + "# Performance metrics\n", |
1728 | 1728 | "y_test_gof = convert_to_dotnet_array(y_test.astype(float))\n", |
1729 | 1729 | "y_pred_gof = convert_to_dotnet_array(y_pred.astype(float))\n", |
1730 | 1730 | "acc = GoodnessOfFit.Accuracy(y_test_gof, y_pred_gof)\n", |
|
1757 | 1757 | }, |
1758 | 1758 | { |
1759 | 1759 | "cell_type": "code", |
1760 | | - "execution_count": 14, |
| 1760 | + "execution_count": null, |
1761 | 1761 | "id": "03961157", |
1762 | 1762 | "metadata": {}, |
1763 | 1763 | "outputs": [ |
|
1847 | 1847 | "# We have to ensure the Random Forest regressor knows it's a regression task (not classification) since it can do both\n", |
1848 | 1848 | "rf_reg.IsRegression = True\n", |
1849 | 1849 | "# More trees generally improve accuracy but increase runtime; 100 is used here to match scikit-learn settings\n", |
1850 | | - "# Set tree count to match defau;t scikit-learn settings\n", |
| 1850 | + "# Set tree count to match default scikit-learn settings\n", |
1851 | 1851 | "rf_reg.NumberOfTrees = 100\n", |
1852 | 1852 | "# Max Depth helps control overfitting\n", |
1853 | 1853 | "# Random Forests can often benefit from deeper trees than single Decision Trees since they average many trees\n", |
|
0 commit comments