|
36 | 36 | "| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n", |
37 | 37 | "| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node (inverted: 1 - probability) | High score = likely anomaly |\n", |
38 | 38 | "| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be less meaningful |\n", |
39 | | - "\n" |
| 39 | + "| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n" |
40 | 40 | ] |
41 | 41 | }, |
42 | 42 | { |
|
218 | 218 | " ,incomingDependencies\n", |
219 | 219 | " ,outgoingDependencies\n", |
220 | 220 | " ,incomingDependencies + outgoingDependencies AS degree\n", |
| 221 | + " ,coalesce(codeUnit.abstractness, 0.0) AS abstractness\n", |
221 | 222 | " ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding\n", |
222 | 223 | " ,codeUnit.centralityPageRank AS pageRank\n", |
223 | 224 | " ,codeUnit.centralityArticleRank AS articleRank\n", |
|
740 | 741 | " return features" |
741 | 742 | ] |
742 | 743 | }, |
| 744 | + { |
| 745 | + "cell_type": "code", |
| 746 | + "execution_count": null, |
| 747 | + "id": "fb829d75", |
| 748 | + "metadata": {}, |
| 749 | + "outputs": [], |
| 750 | + "source": [ |
| 751 | + "def get_top_anomalies(\n", |
| 752 | + " anomaly_detected_features: pd.DataFrame, \n", |
| 753 | + " anomaly_label_column: str = \"anomalyLabel\",\n", |
| 754 | + " anomaly_score_column: str = \"anomalyScore\",\n", |
| 755 | + " top_n: int = 10\n", |
| 756 | + ") -> pd.DataFrame:\n", |
| 757 | + " anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n", |
| 758 | + " return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(top_n)" |
| 759 | + ] |
| 760 | + }, |
743 | 761 | { |
744 | 762 | "cell_type": "code", |
745 | 763 | "execution_count": null, |
|
752 | 770 | " anomaly_label_column: str = \"anomalyLabel\",\n", |
753 | 771 | " anomaly_score_column: str = \"anomalyScore\"\n", |
754 | 772 | ") -> pd.DataFrame:\n", |
755 | | - " anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n", |
756 | | - " return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(10)" |
| 773 | + " return get_top_anomalies(anomaly_detected_features, anomaly_label_column, anomaly_score_column, top_n=10)" |
757 | 774 | ] |
758 | 775 | }, |
759 | 776 | { |
|
2039 | 2056 | "display(get_top_10_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))" |
2040 | 2057 | ] |
2041 | 2058 | }, |
| 2059 | + { |
| 2060 | + "cell_type": "code", |
| 2061 | + "execution_count": null, |
| 2062 | + "id": "f0e61b72", |
| 2063 | + "metadata": {}, |
| 2064 | + "outputs": [], |
| 2065 | + "source": [ |
| 2066 | + "# TODO delete when finished tweaking\n", |
| 2067 | + "top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n", |
| 2068 | + "print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))" |
| 2069 | + ] |
| 2070 | + }, |
2042 | 2071 | { |
2043 | 2072 | "cell_type": "markdown", |
2044 | 2073 | "id": "c314821d", |
|
2282 | 2311 | "outputs": [], |
2283 | 2312 | "source": [ |
2284 | 2313 | "java_type_anomaly_detection_importances_series = pd.Series(java_type_anomaly_detection_results.feature_importances, index=java_type_anomaly_detection_feature_names).sort_values(ascending=False)\n", |
2285 | | - "print(java_type_anomaly_detection_importances_series.head(10))" |
| 2314 | + "print(java_type_anomaly_detection_importances_series.head(25))" |
2286 | 2315 | ] |
2287 | 2316 | }, |
2288 | 2317 | { |
|
0 commit comments