Skip to content

Commit 8620b01

Browse files
committed
Add weakly/strongly connected components to anomaly detection features
1 parent e4cf453 commit 8620b01

16 files changed

Lines changed: 519 additions & 295 deletions

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ anomaly_detection_features() {
7171
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}"
7272
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
7373
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}"
74+
# Determines strongly connected components if not already done, then creates or updates their nodes and the dependencies between them
75+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher" \
76+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher" "${@}"
77+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher" "${@}"
78+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher" "${@}"
79+
# Determines weakly connected components if not already done, then creates or updates their nodes
80+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \
81+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}"
82+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}"
7483
}
7584

7685
# Run queries to find anomalies in the graph.

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,15 @@ anomaly_detection_features() {
118118
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}"
119119
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \
120120
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}"
121+
# Determines strongly connected components if not already done, then creates or updates their nodes and the dependencies between them
122+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher" \
123+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher" "${@}"
124+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher" "${@}"
125+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher" "${@}"
126+
# Determines weakly connected components if not already done, then creates or updates their nodes
127+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \
128+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}"
129+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}"
121130
}
122131

123132
# Execute the Python scripts for anomaly detection.

domains/anomaly-detection/documentation/Architecture.gv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ digraph AnomalyDetectionPipeline {
115115
LocalClusteringCoefficient [label="Local Clustering\nCoefficient"];
116116
Degree [label="Degree\n(in, out, sum)"];
117117
Abstractness [label="Abstractness\n(Robert C. Martin)"];
118+
StronglyConnectedComponents [label="StronglyConnectedComponents\n(member count)"];
119+
WeaklyConnectedComponents [label="WeaklyConnectedComponents\n(median members size)"];
118120
}
119121

120122
// Anomaly detection model area
@@ -154,6 +156,8 @@ digraph AnomalyDetectionPipeline {
154156
LocalClusteringCoefficient -> AnomalyStandardizer;
155157
Degree -> AnomalyStandardizer;
156158
Abstractness -> AnomalyStandardizer;
159+
StronglyConnectedComponents -> AnomalyStandardizer;
160+
WeaklyConnectedComponents -> AnomalyStandardizer;
157161

158162
// Proxy RandomForest used as a backing/tuning model for the Isolation Forest
159163
TuningAnomaly -> IsolationMinCluster;

domains/anomaly-detection/documentation/Architecture.svg

Lines changed: 317 additions & 293 deletions
Loading

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
"| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n",
3737
"| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n",
3838
"| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n",
39-
"| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n"
39+
"| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n",
40+
"| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n"
4041
]
4142
},
4243
{
@@ -210,6 +211,8 @@
210211
" ,coalesce(codeUnit.outgoingDependencies, 0) AS outgoingDependencies\n",
211212
" ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName\n",
212213
" ,coalesce(artifactName, projectName, \"\") AS projectName\n",
214+
" OPTIONAL MATCH (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(stronglyConnectedComponent:StronglyConnectedComponent)\n",
215+
" OPTIONAL MATCH (codeUnit)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(weaklyConnectedComponent:WeaklyConnectedComponent)\n",
213216
" RETURN DISTINCT \n",
214217
" codeUnitName\n",
215218
" ,codeUnit.name AS shortCodeUnitName\n",
@@ -232,6 +235,7 @@
232235
" ,codeUnit.clusteringHDBSCANSize AS clusterSize\n",
233236
" ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n",
234237
" ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n",
238+
" ,coalesce(toFloat(stronglyConnectedComponent.size) / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio\n",
235239
" ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX\n",
236240
" ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY\n",
237241
" \"\"\"\n",
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Create DEPENDS_ON relationships between strongly connected components by aggregating the dependencies between their member code units. Requires "AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher".
2+
3+
MATCH (sourceCodeUnit)-[codeUnitDependency:DEPENDS_ON]->(targetCodeUnit)
4+
WHERE $projection_node_label IN labels(sourceCodeUnit)
5+
AND $projection_node_label IN labels(targetCodeUnit)
6+
MATCH (sourceCodeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(sourceComponent:StronglyConnectedComponent)
7+
MATCH (targetCodeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(targetComponent:StronglyConnectedComponent)
8+
WHERE sourceComponent <> targetComponent
9+
WITH sourceComponent
10+
,targetComponent
11+
,count(*) AS weightCount
12+
,sum(codeUnitDependency.weight) AS weight
13+
,CASE $projection_weight_property
14+
WHEN '' THEN sum(codeUnitDependency.weight)
15+
ELSE sum(codeUnitDependency[$projection_weight_property])
16+
END AS weightSelected
17+
// For debugging purposes
18+
// RETURN sourceComponent.name + '-' + sourceComponent.id AS sourceComponentNameId
19+
// ,targetComponent.name + '-' + targetComponent.id AS targetComponentNameId
20+
// ,weightCount
21+
// ,weight
22+
// ,weightSelected
23+
// ,count(*) AS occurs
24+
// LIMIT 50
25+
// ORDER BY occurs DESC, sourceComponentNameId, targetComponentNameId
26+
MERGE (sourceComponent)-[componentDependency:DEPENDS_ON]->(targetComponent)
27+
SET componentDependency.weightCount = weightCount
28+
,componentDependency.weight = weight
29+
// Only write the selected weight property when a property name was given.
// NOTE(review): an empty property key is presumably rejected by Neo4j; the CASE above already treats '' as "no property selected" - confirm.
FOREACH (weightProperty IN CASE $projection_weight_property WHEN '' THEN [] ELSE [$projection_weight_property] END |
  SET componentDependency[weightProperty] = weightSelected
)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Create nodes for strongly connected components and connect them to their members. Requires "AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher".
2+
3+
// 1) Select all code units that belong to a strongly connected component
4+
// and sort them by PageRank (used later for naming the component)
5+
MATCH (codeUnit)
6+
WHERE $projection_node_label IN labels(codeUnit)
7+
AND codeUnit.communityStronglyConnectedComponentId IS NOT NULL
8+
ORDER BY codeUnit.centralityPageRank DESC
9+
// 2) Group code units by strongly connected component id
10+
WITH codeUnit.communityStronglyConnectedComponentId AS componentId
11+
,collect(codeUnit) AS members
12+
,count(codeUnit) AS componentSize
13+
// 3) Create or update the StronglyConnectedComponent node with member type label e.g. ("TypeMembers")
14+
// - size: number of code units in the component
15+
// - name: derived from the highest PageRank member
16+
MERGE (component:StronglyConnectedComponent {id: componentId})
17+
WITH *
18+
// Components with a single member are no cycle. Plain simple-CASE form ("WHEN 1") is equivalent to "WHEN = 1" and does not rely on the extended CASE syntax.
,CASE componentSize WHEN 1 THEN 'Component ' ELSE 'Cycle around ' END AS componentNamePrefix
19+
CALL apoc.create.addLabels(component, [$projection_node_label + 'Members']) YIELD node
20+
SET component.size = componentSize
21+
,component.name = componentNamePrefix + members[0].name
22+
// 4) Expand members so we can attach relationships
23+
WITH component, members
24+
UNWIND members AS codeUnit
25+
// 5) Connect the code units to the StronglyConnectedComponent they belong to.
26+
MERGE (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(component)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return the first node with a "communityStronglyConnectedComponentId" if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.communityStronglyConnectedComponentId IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.communityStronglyConnectedComponentId AS communityStronglyConnectedComponentId
9+
LIMIT 1
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Calculates and writes the Strongly Connected Components for anomaly detection
2+
3+
CALL gds.scc.write(
4+
$projection_name + '-directed-cleaned', {
5+
writeProperty: 'communityStronglyConnectedComponentId',
6+
consecutiveIds: true
7+
})
8+
YIELD componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis, componentDistribution
9+
RETURN componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
10+
,componentDistribution.min
11+
,componentDistribution.mean
12+
,componentDistribution.max
13+
,componentDistribution.p50
14+
,componentDistribution.p75
15+
,componentDistribution.p90
16+
,componentDistribution.p95
17+
,componentDistribution.p99
18+
,componentDistribution.p999
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
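// Create nodes for weakly connected components, connect them to their members and to the strongly connected components of those members, and store strongly connected component size percentiles on them. Requires "AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" and "AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher".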
2+
3+
// 1) Select all code units that belong to a weakly connected component
4+
// and sort them by PageRank (used later for naming the component)
5+
MATCH (codeUnit)
6+
WHERE $projection_node_label IN labels(codeUnit)
7+
AND codeUnit.communityWeaklyConnectedComponentId IS NOT NULL
8+
ORDER BY codeUnit.centralityPageRank DESC
9+
// 2) Group code units by weakly connected component id
10+
WITH codeUnit.communityWeaklyConnectedComponentId AS componentId
11+
,collect(codeUnit) AS members
12+
// 3) Create or update the WeaklyConnectedComponent node with member type label e.g. ("TypeMembers")
13+
// - size: number of code units in the component
14+
// - name: derived from the highest PageRank member
15+
MERGE (component:WeaklyConnectedComponent {id: componentId})
16+
WITH *
17+
CALL apoc.create.addLabels(component, [$projection_node_label + 'Members']) YIELD node
18+
SET component.size = size(members)
19+
,component.name = 'Island around ' + members[0].name
20+
// 4) Expand members so we can attach relationships and discover Strongly Connected Components
21+
WITH component, members
22+
UNWIND members AS codeUnit
23+
// 5) Connect the code units to the WeaklyConnectedComponent they belong to.
24+
// Additionally, find the StronglyConnectedComponent each code unit belongs to
25+
// and connect it to the WeaklyConnectedComponent as well.
26+
// Layers: code unit -> strongly connected component -> weakly connected component
27+
OPTIONAL MATCH (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(stronglyConnectedComponent:StronglyConnectedComponent)
28+
MERGE (codeUnit)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(component)
29+
// The OPTIONAL MATCH above may leave stronglyConnectedComponent null; MERGE fails on a null node,
// so the relationship is only merged when a StronglyConnectedComponent was actually found.
FOREACH (existingStronglyConnectedComponent IN CASE WHEN stronglyConnectedComponent IS NULL THEN [] ELSE [stronglyConnectedComponent] END |
  MERGE (existingStronglyConnectedComponent)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(component)
)
30+
// 6) Collect code units per StronglyConnectedComponent within this WeaklyConnectedComponent
31+
// (this allows us to compute StronglyConnectedComponent sizes)
32+
WITH component, stronglyConnectedComponent, collect(DISTINCT codeUnit) AS stronglyConnectedComponentMembers
33+
WHERE stronglyConnectedComponent IS NOT NULL
34+
// 7) Compute the size of each StronglyConnectedComponent within each WeaklyConnectedComponent
35+
WITH component, size(stronglyConnectedComponentMembers) AS stronglyConnectedComponentSize
36+
// 8) Compute the StronglyConnectedComponent size percentiles per WeaklyConnectedComponent
37+
WITH component
38+
,percentileDisc(stronglyConnectedComponentSize, 0.25) AS stronglyConnectedComponentSizePercentile25
39+
,percentileDisc(stronglyConnectedComponentSize, 0.50) AS stronglyConnectedComponentSizePercentile50
40+
,percentileDisc(stronglyConnectedComponentSize, 0.75) AS stronglyConnectedComponentSizePercentile75
41+
// 9) Store the computed StronglyConnectedComponent size percentiles on the WeaklyConnectedComponent node
42+
SET component.stronglyConnectedComponentSizePercentile25 = stronglyConnectedComponentSizePercentile25
43+
,component.stronglyConnectedComponentSizePercentile50 = stronglyConnectedComponentSizePercentile50
44+
,component.stronglyConnectedComponentSizePercentile75 = stronglyConnectedComponentSizePercentile75
45+
,component.stronglyConnectedComponentSizeInterQuartileRange = stronglyConnectedComponentSizePercentile75 - stronglyConnectedComponentSizePercentile25

0 commit comments

Comments
 (0)