Skip to content

Commit e4cf453

Browse files
committed
Add abstractness to anomaly detection features
1 parent 7ed8f36 commit e4cf453

11 files changed

Lines changed: 432 additions & 307 deletions

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,4 +102,8 @@ __pycache__/
102102
*.pyc
103103

104104
# Optuna (and other) Database data
105-
*.db
105+
*.db
106+
107+
# Documentation generation
108+
domains/**/documentation/package.json
109+
domains/**/documentation/package-lock.json

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ anomaly_detection_features() {
6464
# Determine the normalized difference between Page Rank and Article Rank if not already done
6565
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \
6666
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}"
67+
# Determine the "abstractness" if not already done (interfaces, annotations & type aliases = 100%, abstract classes = 70%, everything else, e.g. concrete classes & functions = 0%)
68+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
69+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_Java.cypher" "${@}"
70+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
71+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}"
72+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
73+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}"
6774
}
6875

6976
# Run queries to find anomalies in the graph.

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,13 @@ anomaly_detection_features() {
111111
# Determine the normalized difference between Page Rank and Article Rank if not already done
112112
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \
113113
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}"
114+
# Determine the "abstractness" if not already done (interfaces, annotations & type aliases = 100%, abstract classes = 70%, everything else, e.g. concrete classes & functions = 0%)
115+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
116+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_Java.cypher" "${@}"
117+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
118+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}"
119+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
120+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}"
114121
}
115122

116123
# Execute the Python scripts for anomaly detection.

domains/anomaly-detection/documentation/Architecture.gv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ digraph AnomalyDetectionPipeline {
114114
BetweennessCentrality [label="Betweenness\nCentrality"];
115115
LocalClusteringCoefficient [label="Local Clustering\nCoefficient"];
116116
Degree [label="Degree\n(in, out, sum)"];
117+
Abstractness [label="Abstractness\n(Robert C. Martin)"];
117118
}
118119

119120
// Anomaly detection model area
@@ -152,6 +153,7 @@ digraph AnomalyDetectionPipeline {
152153
BetweennessCentrality -> AnomalyStandardizer;
153154
LocalClusteringCoefficient -> AnomalyStandardizer;
154155
Degree -> AnomalyStandardizer;
156+
Abstractness -> AnomalyStandardizer;
155157

156158
// Proxy RandomForest used as a backing/tuning model for the Isolation Forest
157159
TuningAnomaly -> IsolationMinCluster;

domains/anomaly-detection/documentation/Architecture.svg

Lines changed: 302 additions & 289 deletions
Loading

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
"| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n",
3737
"| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n",
3838
"| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n",
39-
"\n"
39+
"| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n"
4040
]
4141
},
4242
{
@@ -218,6 +218,7 @@
218218
" ,incomingDependencies\n",
219219
" ,outgoingDependencies\n",
220220
" ,incomingDependencies + outgoingDependencies AS degree\n",
221+
" ,coalesce(codeUnit.abstractness, 0.0) AS abstractness\n",
221222
" ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding\n",
222223
" ,codeUnit.centralityPageRank AS pageRank\n",
223224
" ,codeUnit.centralityArticleRank AS articleRank\n",
@@ -740,6 +741,23 @@
740741
" return features"
741742
]
742743
},
744+
{
745+
"cell_type": "code",
746+
"execution_count": null,
747+
"id": "fb829d75",
748+
"metadata": {},
749+
"outputs": [],
750+
"source": [
751+
"def get_top_anomalies(\n",
752+
" anomaly_detected_features: pd.DataFrame, \n",
753+
" anomaly_label_column: str = \"anomalyLabel\",\n",
754+
" anomaly_score_column: str = \"anomalyScore\",\n",
755+
" top_n: int = 10\n",
756+
") -> pd.DataFrame:\n",
757+
" anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n",
758+
" return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(top_n)"
759+
]
760+
},
743761
{
744762
"cell_type": "code",
745763
"execution_count": null,
@@ -752,8 +770,7 @@
752770
" anomaly_label_column: str = \"anomalyLabel\",\n",
753771
" anomaly_score_column: str = \"anomalyScore\"\n",
754772
") -> pd.DataFrame:\n",
755-
" anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n",
756-
" return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(10)"
773+
" return get_top_anomalies(anomaly_detected_features, anomaly_label_column, anomaly_score_column, top_n=10)"
757774
]
758775
},
759776
{
@@ -2039,6 +2056,18 @@
20392056
"display(get_top_10_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))"
20402057
]
20412058
},
2059+
{
2060+
"cell_type": "code",
2061+
"execution_count": null,
2062+
"id": "f0e61b72",
2063+
"metadata": {},
2064+
"outputs": [],
2065+
"source": [
2066+
"# TODO delete when finished tweaking\n",
2067+
"top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n",
2068+
"print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))"
2069+
]
2070+
},
20422071
{
20432072
"cell_type": "markdown",
20442073
"id": "c314821d",
@@ -2282,7 +2311,7 @@
22822311
"outputs": [],
22832312
"source": [
22842313
"java_type_anomaly_detection_importances_series = pd.Series(java_type_anomaly_detection_results.feature_importances, index=java_type_anomaly_detection_feature_names).sort_values(ascending=False)\n",
2285-
"print(java_type_anomaly_detection_importances_series.head(10))"
2314+
"print(java_type_anomaly_detection_importances_series.head(25))"
22862315
]
22872316
},
22882317
{
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return one node with the given projection node label that already has the property "abstractness" (no result if none exists)
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.abstractness IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.abstractness AS abstractness
9+
LIMIT 1
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Calculate and set abstractness for Java Packages or Artifacts (interfaces and annotations count 1.0, abstract classes 0.7, relative to all contained types) and return its distribution in bins of width 0.1.
2+
3+
MATCH (javaCodeUnit:Java&(Package|Artifact))
4+
WITH javaCodeUnit
5+
,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Type) } AS numberTypes
6+
,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Class{abstract:true}) } AS numberAbstractClasses
7+
,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Annotation) } AS numberAnnotations
8+
,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Interface) } AS numberInterfaces
9+
WITH *
10+
,numberInterfaces + numberAnnotations + (numberAbstractClasses * 0.7) AS weightedAbstractTypes
11+
WITH *
12+
,toFloat(weightedAbstractTypes) / (numberTypes + 1E-38) AS abstractness
13+
SET javaCodeUnit.abstractness = abstractness
14+
RETURN round(abstractness, 1) AS abstractnessBin
15+
,count(*) AS packageCount
16+
,collect(javaCodeUnit.name)[0..4] AS examples
17+
ORDER BY abstractnessBin ASC
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Calculate and set abstractness for Java Types (interface or annotation = 1.0, abstract class = 0.7, otherwise 0.0) and return the distribution.
2+
3+
MATCH (javaCodeUnit:Java:Type)
4+
WITH javaCodeUnit
5+
,javaCodeUnit:Annotation AS isAnnotation
6+
,javaCodeUnit:Interface AS isInterface
7+
,(javaCodeUnit:Class AND javaCodeUnit.abstract) AS isAbstractClass
8+
WITH *
9+
,CASE WHEN isAnnotation OR isInterface THEN 1.0
10+
WHEN isAbstractClass THEN 0.7
11+
ELSE 0.0
12+
END AS abstractness
13+
SET javaCodeUnit.abstractness = abstractness
14+
RETURN abstractness
15+
,count(*) AS typeCount
16+
,collect(javaCodeUnit.name)[0..4] AS examples
17+
ORDER BY abstractness ASC
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
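// Calculate and set abstractness for TypeScript Modules (exported interfaces and type aliases count 1.0, abstract classes 0.7, relative to all exports) and return its distribution in bins of width 0.1.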
2+
3+
MATCH (module:TS:Module)
4+
WITH module
5+
,count{(module)-[:EXPORTS]->(:TS)} AS numberTypes
6+
,count{(module)-[:EXPORTS]->(:Class{abstract:true})} AS numberAbstractClasses
7+
,count{(module)-[:EXPORTS]->(:TypeAlias)} AS numberTypeAliases
8+
,count{(module)-[:EXPORTS]->(:Interface)} AS numberInterfaces
9+
WITH *
10+
,numberInterfaces + numberTypeAliases + (numberAbstractClasses * 0.7) AS numberAbstractTypes
11+
WITH *
12+
,toFloat(numberAbstractTypes) / (numberTypes + 1E-38) AS abstractness
13+
SET module.abstractness = abstractness
14+
RETURN round(abstractness, 1) AS abstractnessBin
15+
,count(*) AS packageCount
16+
,collect(module.name)[0..4] AS examples
17+
ORDER BY abstractnessBin ASC

0 commit comments

Comments
 (0)