Skip to content

Commit ea2e2e7

Browse files
committed
Add topological sort max distance from source for strongly connected components to anomaly detection features
1 parent 8620b01 commit ea2e2e7

9 files changed

Lines changed: 391 additions & 318 deletions

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ anomaly_detection_features() {
8080
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \
8181
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}"
8282
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}"
83+
# Determines topological sort max distance from source for strongly connected components if not already done
84+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \
85+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher" "${@}"
86+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \
87+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher" "${@}"
8388
}
8489

8590
# Run queries to find anomalies in the graph.

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ anomaly_detection_features() {
127127
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \
128128
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}"
129129
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}"
130+
# Determines topological sort max distance from source for strongly connected components if not already done
131+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \
132+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher" "${@}"
133+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \
134+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher" "${@}"
130135
}
131136

132137
# Execute the Python scripts for anomaly detection.

domains/anomaly-detection/documentation/Architecture.gv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,12 @@ digraph AnomalyDetectionPipeline {
117117
Abstractness [label="Abstractness\n(Robert C. Martin)"];
118118
StronglyConnectedComponents[label="StronglyConnectedComponents\n(member count)"]
119119
WeaklyConnectedComponents [label="WeaklyConnectedComponents\n(median members size)"]
120+
TopologicalSort [label="TopologicalSort\n(max component distance from source)"]
120121
}
121122

123+
// Inter graph algorithm feature connections
124+
StronglyConnectedComponents -> TopologicalSort
125+
122126
// Anomaly detection model area
123127
subgraph cluster_anomaly {
124128
label="Anomaly Detection Model";
@@ -158,6 +162,7 @@ digraph AnomalyDetectionPipeline {
158162
Abstractness -> AnomalyStandardizer;
159163
StronglyConnectedComponents -> AnomalyStandardizer;
160164
WeaklyConnectedComponents -> AnomalyStandardizer;
165+
TopologicalSort -> AnomalyStandardizer;
161166

162167
// Proxy RandomForest used as a backing/tuning model for the Isolation Forest
163168
TuningAnomaly -> IsolationMinCluster;

domains/anomaly-detection/documentation/Architecture.svg

Lines changed: 335 additions & 314 deletions
Loading

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@
3737
"| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n",
3838
"| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n",
3939
"| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n",
40-
"| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n"
40+
"| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n",
41+
"| `Max Topological Distance from Source (SCC DAG)` | Structural / Graph Topology | Longest path from any source SCC to the node’s SCC in the condensed DAG | Approximates architectural depth or layering; high values indicate deeply nested components and potential rigidity or change amplification |"
4142
]
4243
},
4344
{
@@ -236,6 +237,7 @@
236237
" ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n",
237238
" ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n",
238239
" ,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio\n",
240+
" ,coalesce(stronglyConnectedComponent.topologicalSortMaxDistanceFromSource, 0) AS topologicalComponentLayer\n",
239241
" ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX\n",
240242
" ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY\n",
241243
" \"\"\"\n",
@@ -676,6 +678,7 @@
676678
" study.enqueue_trial({'isolation_max_samples': 0.42726366840740576, 'isolation_n_estimators': 141, 'proxy_n_estimators': 190, 'proxy_max_depth': 5})\n",
677679
" study.enqueue_trial({'isolation_max_samples': 0.40638732079782663, 'isolation_n_estimators': 108, 'proxy_n_estimators': 191, 'proxy_max_depth': 9})\n",
678680
" \n",
681+
" study.enqueue_trial({'isolation_max_samples': 0.10105966483207725, 'isolation_n_estimators': 271, 'proxy_n_estimators': 237, 'proxy_max_depth': 9})\n",
679682
" study.enqueue_trial({'isolation_max_samples': 0.10010443935999927, 'isolation_n_estimators': 350, 'proxy_n_estimators': 344, 'proxy_max_depth': 8})\n",
680683
" study.enqueue_trial({'isolation_max_samples': 0.10015063610944819, 'isolation_n_estimators': 329, 'proxy_n_estimators': 314, 'proxy_max_depth': 8})\n",
681684
"\n",
@@ -2067,9 +2070,9 @@
20672070
"metadata": {},
20682071
"outputs": [],
20692072
"source": [
2070-
"# TODO delete when finished tweaking\n",
2071-
"top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n",
2072-
"print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))"
2073+
"# For debugging purposes\n",
2074+
"# top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n",
2075+
"# print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))"
20732076
]
20742077
},
20752078
{
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Existence check: yields at most one strongly connected component of the given member type that already has a "topologicalSortMaxDistanceFromSource" property
2+
3+
MATCH (stronglyConnectedComponent:StronglyConnectedComponent)
4+
WHERE stronglyConnectedComponent.topologicalSortMaxDistanceFromSource IS NOT NULL
5+
AND ($projection_node_label + 'Members') IN labels(stronglyConnectedComponent)
6+
RETURN stronglyConnectedComponent.name AS shortCodeUnitName
7+
,elementId(stronglyConnectedComponent) AS nodeElementId
8+
,stronglyConnectedComponent.topologicalSortMaxDistanceFromSource AS topologicalSortMaxDistanceFromSource
9+
LIMIT 1
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Creates a projection of the strongly connected components graph for the given member type. Components without a dependency to another component of the same member type are projected as isolated nodes. Requires: "AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher"
2+
// The source member type filter must belong to MATCH: a WHERE that follows OPTIONAL MATCH only nulls the optional part and would keep components of other member types in the projection.
3+
MATCH (sourceComponent:StronglyConnectedComponent)
4+
WHERE $projection_node_label + 'Members' IN labels(sourceComponent)
5+
OPTIONAL MATCH (sourceComponent)-[:DEPENDS_ON]->(targetComponent:StronglyConnectedComponent)
6+
WHERE $projection_node_label + 'Members' IN labels(targetComponent)
7+
WITH gds.graph.project($projection_name + '-components', sourceComponent, targetComponent) AS graph
8+
RETURN graph.graphName AS graphName
9+
,graph.nodeCount AS nodeCount
10+
,graph.relationshipCount AS relationshipCount
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Streams the topological sort of the projected strongly connected components graph and stores each component's maximum distance from a source as integer property "topologicalSortMaxDistanceFromSource". Returns how many components share each distance. Requires "AnomalyDetectionFeature-TopologicalSortComponents-Projection".
2+
// Needs graph-data-science plugin version >= 2.5.0
3+
4+
CALL gds.dag.topologicalSort.stream(
5+
$projection_name + '-components',
6+
{computeMaxDistanceFromSource: true}
7+
)
8+
YIELD nodeId, maxDistanceFromSource AS rawMaxDistance
9+
WITH gds.util.asNode(nodeId) AS sccNode
10+
,toInteger(rawMaxDistance) AS maxDistanceFromSource
11+
SET sccNode.topologicalSortMaxDistanceFromSource = maxDistanceFromSource
12+
WITH maxDistanceFromSource, count(*) AS occurrences
13+
RETURN maxDistanceFromSource, occurrences
14+
ORDER BY maxDistanceFromSource ASC

domains/anomaly-detection/tunedAnomalyDetectionExplained.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr
280280
,codeUnit.clusteringHDBSCANLabel AS clusterLabel
281281
,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid
282282
,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio
283+
,coalesce(stronglyConnectedComponent.topologicalSortMaxDistanceFromSource, 0) AS topologicalComponentLayer
283284
,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX
284285
,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY
285286
"""

0 commit comments

Comments
 (0)