From 26edd6653d2209ece8273eb38746de66bc3b8d2d Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 18 Jan 2026 14:09:52 +0100 Subject: [PATCH 1/9] Optimize Neo4j start --- scripts/startNeo4j.sh | 6 ++++++ scripts/waitForNeo4jHttpFunctions.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/startNeo4j.sh b/scripts/startNeo4j.sh index d1f58da1c..902a4646d 100755 --- a/scripts/startNeo4j.sh +++ b/scripts/startNeo4j.sh @@ -23,6 +23,12 @@ NEO4J_HTTP_PORT=${NEO4J_HTTP_PORT:-"7474"} SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts echo "startNeo4j: SCRIPTS_DIR=$SCRIPTS_DIR" +# Check if environment variable NEO4J_INITIAL_PASSWORD is set +if [ -z "${NEO4J_INITIAL_PASSWORD}" ]; then + echo "startNeo4j: Error: Requires environment variable NEO4J_INITIAL_PASSWORD to be set first. Use 'export NEO4J_INITIAL_PASSWORD='." + exit 1 +fi + # Check if TOOLS_DIRECTORY variable is set if [ -z "${TOOLS_DIRECTORY}" ]; then echo "startNeo4j: Requires variable TOOLS_DIRECTORY to be set. If it is the current directory, then use a dot to reflect that." 
diff --git a/scripts/waitForNeo4jHttpFunctions.sh b/scripts/waitForNeo4jHttpFunctions.sh index f07c50745..240971f7b 100644 --- a/scripts/waitForNeo4jHttpFunctions.sh +++ b/scripts/waitForNeo4jHttpFunctions.sh @@ -40,7 +40,7 @@ isDatabaseQueryable() { waitUntilDatabaseIsQueryable() { # List of wait times in seconds per retry - local WAIT_TIMES="16 1 1 2 4 8 16 32 64" + local WAIT_TIMES="4 1 1 1 1 2 2 4 8 16 32" local retries=0 local isDatabaseReady="false" From baff7f0d1d34a39006d11abb5e60b69a597387c1 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 20 Jan 2026 08:49:48 +0100 Subject: [PATCH 2/9] Fix typo in feature existence query for article to page rank feature --- .../AnomalyDetectionFeature-PageToArticleRank-Exists.cypher | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher index 0524df410..4f319874e 100644 --- a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher @@ -4,7 +4,7 @@ WHERE $projection_node_label IN labels(codeUnit) AND codeUnit.centralityPageRankToArticleRankDifference IS NOT NULL AND codeUnit.centralityPageRankNormalized IS NOT NULL - AND codeUnit.centralityPArticleRankNormalized IS NOT NULL + AND codeUnit.centralityArticleRankNormalized IS NOT NULL RETURN codeUnit.name AS shortCodeUnitName ,elementId(codeUnit) AS nodeElementId ,codeUnit.centralityPageRankToArticleRankDifference AS pageToArticleRankDifference From 0376a805779ae3eb85491706a6e3d6b79becdea6 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 18 Jan 2026 12:03:45 +0100 Subject: [PATCH 3/9] Skip PCA when exploring anomaly detection solely on node embeddings --- 
...yDetectionIsolationForestExploration.ipynb | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index c354ba369..bcb6b85a0 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -279,7 +279,14 @@ " 'clusterNoise', # highly correlated with \"clusterApproximateOutlierScore\". doesn't improve F1 score of proxy model.\n", " 'embeddingVisualizationX',\n", " 'embeddingVisualizationY',\n", - "]" + "]\n", + "\n", + "features_for_visualization_and_training: typing.List[str] = [\n", + " 'pageRank', \n", + " 'articleRank'\n", + "]\n", + "\n", + "features_for_visualization: typing.List[str] = features_for_visualization_excluded_from_training + features_for_visualization_and_training" ] }, { @@ -748,7 +755,9 @@ "id": "b2cfcc56", "metadata": {}, "source": [ - "#### 1.3b List the top 10 anomalies solely based on embeddings" + "#### 1.3b List the top 10 anomalies solely based on embeddings\n", + "\n", + "By leaving out all other features, we can see if the embeddings alone are sufficient to detect anomalies. Anomalies detected solely based on embeddings could indicate structural outliers in the graph representation of the codebase. In most cases however, combining embeddings with other features yields better results." 
] }, { @@ -758,10 +767,18 @@ "metadata": {}, "outputs": [], "source": [ - "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n", - "java_package_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_package_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n", - "java_package_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'nodeEmbeddingPCA_{i}' for i in range(java_package_embedding_anomaly_detection_input.shape[1])]\n", + "# Create a copy of the java_package features, selecting only visualization and embedding features\n", + "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization + ['embedding']].copy()\n", + "\n", + "# Skip PCA and keep the original dimensionality of the node embeddings. When only considering embeddings, there are no features that could get outperformed.\n", + "# java_package_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_package_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n", + "java_package_embedding_anomaly_detection_input = np.stack(java_package_embedding_anomaly_detection_features['embedding'].apply(np.array).tolist())\n", + "java_package_embedding_anomaly_detection_feature_names = [f'nodeEmbedding_{i}' for i in range(java_package_embedding_anomaly_detection_input.shape[1])]\n", + "\n", + "# Tune anomaly detection models using only the reduced embedding features, with automatic contamination threshold\n", "java_package_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_package_embedding_anomaly_detection_input, contamination=\"auto\")\n", + "\n", + "# Add the anomaly detection results (labels and scores) to the features dataframe with custom column names for 
embedding-based anomalies\n", "java_package_embedding_anomaly_detection_features = add_anomaly_detection_results_to_features(java_package_embedding_anomaly_detection_features, java_package_embedding_anomaly_detection_result, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore')\n", "\n", "display(get_top_10_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))" @@ -2000,7 +2017,9 @@ "id": "c314821d", "metadata": {}, "source": [ - "#### 2.3b List the top 10 anomalies solely based on embeddings" + "#### 2.3b List the top 10 anomalies solely based on embeddings\n", + "\n", + "By leaving out all other features, we can see if the embeddings alone are sufficient to detect anomalies. Anomalies detected solely based on embeddings could indicate structural outliers in the graph representation of the codebase. In most cases however, combining embeddings with other features yields better results." 
] }, { @@ -2010,12 +2029,20 @@ "metadata": {}, "outputs": [], "source": [ - "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n", - "java_type_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_type_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n", - "java_type_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'nodeEmbeddingPCA_{i}' for i in range(java_type_embedding_anomaly_detection_input.shape[1])]\n", + "# Create a copy of the java_type features, selecting only embeddings and everything needed for visualization\n", + "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization + ['embedding']].copy()\n", + "\n", + "# Skip PCA and keep the original dimensionality of the node embeddings. When only considering embeddings, there are no features that could get outperformed.\n", + "# java_type_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_type_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n", + "java_type_embedding_anomaly_detection_input = np.stack(java_type_embedding_anomaly_detection_features['embedding'].apply(np.array).tolist())\n", + "java_type_embedding_anomaly_detection_feature_names = [f'nodeEmbedding_{i}' for i in range(java_type_embedding_anomaly_detection_input.shape[1])]\n", + "\n", "java_type_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_type_embedding_anomaly_detection_input, contamination=\"auto\")\n", + "\n", + "# Add the anomaly detection results (labels and scores) to the features dataframe with custom column names for embedding-based anomalies\n", "java_type_embedding_anomaly_detection_features = 
add_anomaly_detection_results_to_features(java_type_embedding_anomaly_detection_features, java_type_embedding_anomaly_detection_result, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore')\n", "\n", + "# Display the top 10 anomalies detected based on embeddings, sorted by anomaly score in descending order, with index reset for cleaner output\n", "display(get_top_10_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))" ] }, From 1a293f63da01222a3b88b3f95024e916b49b307c Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 18 Jan 2026 12:08:36 +0100 Subject: [PATCH 4/9] Improve readability of anomaly detection feature plots --- ...yDetectionIsolationForestExploration.ipynb | 19 ++++++++++++------- .../tunedAnomalyDetectionExplained.py | 14 ++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index bcb6b85a0..ad38f3290 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -367,9 +367,6 @@ "def plot_feature_correlation_matrix(features: pd.DataFrame) -> None:\n", " \"\"\"\n", " Plots the correlation matrix of the features in the DataFrame.\n", - " \n", - " :param java_package_anomaly_detection_features: DataFrame containing the features.\n", - " :param java_package_features_to_standardize: List of feature names to include in the correlation matrix.\n", " \"\"\"\n", " correlation_matrix = features.corr()\n", "\n", @@ -381,7 +378,7 @@ " axis.set_xticklabels(correlation_matrix.columns, rotation=90)\n", " axis.set_yticklabels(correlation_matrix.index)\n", 
" for (i, j), correlation_value in np.ndenumerate(correlation_matrix.values):\n", - " axis.text(j, i, f\"{correlation_value:.2f}\", ha='center', va='center', color='black', bbox=dict(facecolor='white', alpha=0.3, edgecolor='none'))\n", + " axis.text(j, i, f\"{correlation_value:.2f}\", ha='center', va='center', color='black', bbox=dict(facecolor='white', alpha=0.3, edgecolor='none'), fontsize=6)\n", " plot.title(\"Feature Correlation Matrix (excluding embeddings)\", fontsize=10)\n", " plot.tight_layout()\n", " plot.show()" @@ -946,8 +943,8 @@ " x_position_column: str = 'embeddingVisualizationX',\n", " y_position_column: str = 'embeddingVisualizationY',\n", " annotate_top_n_anomalies: int = 10,\n", - " annotate_top_n_non_anomalies: int = 5,\n", - " annotate_top_n_clusters: int = 20,\n", + " annotate_top_n_non_anomalies: int = 3,\n", + " annotate_top_n_clusters: int = 10,\n", " percentile_of_distance_to_center: float = 0.8,\n", " no_cluster_coloring: bool = False,\n", ") -> None:\n", @@ -995,7 +992,11 @@ " cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n", "\n", " plot.figure(figsize=(10, 10))\n", - " plot.title(f\"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)\", pad=20)\n", + " plot.title(\n", + " label=f\"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)\", \n", + " pad=30,\n", + " bbox=dict(facecolor='white', edgecolor='none', pad=2, alpha=0.6)\n", + " )\n", "\n", " # Plot noise (from clustering)\n", " plot.scatter(\n", @@ -1111,6 +1112,8 @@ " **plot_annotation_style\n", " )\n", "\n", + " plot.tight_layout(pad=0.2)\n", + " plot.axis('off')\n", " plot.show()" ] }, @@ -1263,6 +1266,8 @@ " **plot_annotation_style\n", " )\n", "\n", + " plot.tight_layout(pad=0.2)\n", + " plot.axis('off')\n", " plot.show()" ] }, diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py 
b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index fda74f9a8..49e644e71 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -621,8 +621,8 @@ def plot_anomalies( return annotate_top_n_anomalies: int = 10 - annotate_top_n_non_anomalies: int = 5 - annotate_top_n_clusters: int = 20 + annotate_top_n_non_anomalies: int = 3 + annotate_top_n_clusters: int = 10 features_to_visualize_zoomed=zoom_into_center_while_preserving_top_scores( features_to_visualize, @@ -647,8 +647,11 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1] plot.figure(figsize=(10, 10)) - plot.title(f"{title_prefix} Anomalies (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)", pad=20) - + plot.title( + label=f"{title_prefix} Anomalies (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)", + pad=30, + bbox=dict(facecolor='white', edgecolor='none', pad=2, alpha=0.6) + ) # Plot noise (from clustering) plot.scatter( **get_common_plot_parameters(cluster_noise), @@ -712,6 +715,9 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: color="red", ) + plot.tight_layout(pad=0.2) + plot.axis('off') + plot.savefig(plot_file_path) plot.close() From 27c82e67b5441d0a4278af249c6de8b9c3a03725 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 25 Jan 2026 11:16:19 +0100 Subject: [PATCH 5/9] Remove constant features from anomaly detection --- ...yDetectionIsolationForestExploration.ipynb | 32 ++++++++++++++++--- .../tunedAnomalyDetectionExplained.py | 15 +++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index ad38f3290..530fe7606 
100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -407,6 +407,26 @@ " return scaler.fit_transform(features_to_scale)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "04d510a6", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_constant_features(features: pd.DataFrame, feature_names: list[str]) -> list[str]:\n", + " \"\"\"\n", + " Removes constant features from the feature list.\n", + " \"\"\"\n", + " non_constant_features = []\n", + " for feature in feature_names:\n", + " if features[feature].nunique() > 1:\n", + " non_constant_features.append(feature)\n", + " else:\n", + " print(f\"Removed constant feature: {feature}\")\n", + " return non_constant_features" + ] + }, { "cell_type": "code", "execution_count": null, @@ -414,7 +434,8 @@ "metadata": {}, "outputs": [], "source": [ - "java_package_anomaly_detection_features_standardized = standardize_features(java_package_anomaly_detection_features, java_package_features_to_standardize)" + "java_package_anomaly_detection_feature_names_to_standardize = remove_constant_features(java_package_anomaly_detection_features, java_package_features_to_standardize)\n", + "java_package_anomaly_detection_features_standardized = standardize_features(java_package_anomaly_detection_features, java_package_anomaly_detection_feature_names_to_standardize)" ] }, { @@ -491,7 +512,7 @@ "outputs": [], "source": [ "java_package_anomaly_detection_features_prepared = np.hstack([java_package_anomaly_detection_features_standardized, java_package_anomaly_detection_node_embeddings_reduced])\n", - "java_package_anomaly_detection_feature_names = list(java_package_features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_package_anomaly_detection_node_embeddings_reduced.shape[1])]" + "java_package_anomaly_detection_feature_names = 
list(java_package_anomaly_detection_feature_names_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_package_anomaly_detection_node_embeddings_reduced.shape[1])]" ] }, { @@ -1986,13 +2007,14 @@ "outputs": [], "source": [ "validate_data(java_type_anomaly_detection_features)\n", - "java_type_anomaly_detection_features_standardized = standardize_features(java_type_anomaly_detection_features, java_type_features_to_standardize)\n", + "java_type_anomaly_detection_feature_names_to_standardize = remove_constant_features(java_type_anomaly_detection_features, java_type_features_to_standardize )\n", + "java_type_anomaly_detection_features_standardized = standardize_features(java_type_anomaly_detection_features, java_type_anomaly_detection_feature_names_to_standardize)\n", "java_type_anomaly_detection_node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(java_type_anomaly_detection_features, max_dimensions=35)\n", "\n", "java_type_anomaly_detection_features_prepared = np.hstack([java_type_anomaly_detection_features_standardized, java_type_anomaly_detection_node_embeddings_reduced])\n", - "java_type_anomaly_detection_feature_names = list(java_type_features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_type_anomaly_detection_node_embeddings_reduced.shape[1])]\n", + "java_type_anomaly_detection_feature_names = list(java_type_anomaly_detection_feature_names_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_type_anomaly_detection_node_embeddings_reduced.shape[1])]\n", "\n", - "plot_feature_correlation_matrix(java_type_anomaly_detection_features[java_type_features_to_standardize])" + "plot_feature_correlation_matrix(java_type_anomaly_detection_features[java_type_anomaly_detection_feature_names_to_standardize])" ] }, { diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 49e644e71..53c5563ea 100755 --- 
a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -296,6 +296,20 @@ def standardize_features(features: pd.DataFrame, feature_list: list[str]) -> num return scaler.fit_transform(features_to_scale) +def remove_constant_features(features: pd.DataFrame, feature_names: list[str], is_verbose: bool = False) -> list[str]: + """ + Removes constant features from the feature list. + """ + non_constant_features = [] + for feature in feature_names: + if features[feature].nunique() > 1: + non_constant_features.append(feature) + else: + if is_verbose: + print(f"tunedAnomalyDetectionExplained: Removing constant feature {feature}") + return non_constant_features + + def reduce_dimensionality_of_node_embeddings( features: pd.DataFrame, min_dimensions: int = 20, @@ -1162,6 +1176,7 @@ def output_top_shap_explained_global_features_as_markdown_table( sys.exit(0) features_to_standardize = features.columns.drop(features_for_visualization_to_exclude_from_training + ['embedding']).to_list() +features_to_standardize = remove_constant_features(features, features_to_standardize, is_verbose=parameters.is_verbose()) features_standardized = standardize_features(features, features_to_standardize) node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(features) features_prepared = np.hstack([features_standardized, node_embeddings_reduced]) From 7ed8f369912dc13aa00bfc06b0a283f021ef4860 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 18 Jan 2026 16:10:39 +0100 Subject: [PATCH 6/9] Consider abstract classes to have 70% abstractness. They usually also contain some implementations and are ideally weighted for calculating the abstractness. 
--- .../Calculate_and_set_Abstractness_for_Java.cypher | 14 +++++++++----- ...tractness_for_Java_including_Subpackages.cypher | 8 +++++++- ...late_and_set_Abstractness_for_Typescript.cypher | 14 +++++++++----- cypher/Metrics/Clear_all_metrics.cypher | 1 + 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/cypher/Metrics/Calculate_and_set_Abstractness_for_Java.cypher b/cypher/Metrics/Calculate_and_set_Abstractness_for_Java.cypher index 9b42121c7..d0c268190 100644 --- a/cypher/Metrics/Calculate_and_set_Abstractness_for_Java.cypher +++ b/cypher/Metrics/Calculate_and_set_Abstractness_for_Java.cypher @@ -9,16 +9,20 @@ MATCH (artifact:Artifact)-[:CONTAINS]->(package) ,count{(package)-[:CONTAINS]->(:Annotation)} AS numberAnnotations ,count{(package)-[:CONTAINS]->(:Interface)} AS numberInterfaces WITH * - ,numberInterfaces + numberAnnotations + numberAbstractClasses AS numberAbstractTypes + ,numberInterfaces + numberAnnotations + numberAbstractClasses AS numberAbstractTypes + ,numberInterfaces + numberAnnotations + (numberAbstractClasses * 0.7) AS weightedAbstractTypes WITH * - ,toFloat(numberAbstractTypes) / (numberTypes + 1E-38) AS abstractness - SET package.abstractness = abstractness - ,package.numberOfAbstractTypes = numberAbstractTypes - ,package.numberOfTypes = numberTypes + ,toFloat(weightedAbstractTypes) / (numberTypes + 1E-38) AS abstractness + SET package.abstractness = abstractness + ,package.numberOfAbstractTypes = numberAbstractTypes + ,package.numberOfAbstractClasses = numberAbstractClasses + ,package.numberOfTypes = numberTypes RETURN artifactName ,package.fqn AS fullQualifiedPackageName ,package.name AS packageName ,abstractness ,numberAbstractTypes ,numberTypes + ,numberAbstractClasses + ,weightedAbstractTypes ORDER BY abstractness ASC, numberTypes DESC \ No newline at end of file diff --git a/cypher/Metrics/Calculate_and_set_Abstractness_for_Java_including_Subpackages.cypher 
b/cypher/Metrics/Calculate_and_set_Abstractness_for_Java_including_Subpackages.cypher index 386b9ce95..6958d9f3d 100644 --- a/cypher/Metrics/Calculate_and_set_Abstractness_for_Java_including_Subpackages.cypher +++ b/cypher/Metrics/Calculate_and_set_Abstractness_for_Java_including_Subpackages.cypher @@ -13,10 +13,14 @@ MATCH (artifact:Artifact)-[:CONTAINS]->(package) ,package ,sum(subpackage.numberOfTypes) AS numberTypes ,sum(subpackage.numberOfAbstractTypes) AS numberAbstractTypes + ,sum(subpackage.numberOfAbstractClasses) AS numberAbstractClasses ,count(path) - 1 AS numberOfIncludedSubPackages ,max(length(path)) AS maxSubpackageDepth WITH * - ,toFloat(numberAbstractTypes) / (numberTypes + 1E-38) AS abstractness + // Calculate abstract classes out of abstract types and then add 70% of them back in (weighted) + ,numberAbstractTypes - (numberAbstractClasses * 0.3) AS weightedAbstractTypes + WITH * + ,toFloat(weightedAbstractTypes) / (numberTypes + 1E-38) AS abstractness SET package.abstractnessIncludingSubpackages = abstractness ,package.numberOfAbstractTypesIncludingSubpackages = numberAbstractTypes ,package.numberOfTypesIncludingSubpackages = numberTypes @@ -26,6 +30,8 @@ RETURN artifactName ,abstractness ,numberAbstractTypes ,numberTypes + ,numberAbstractClasses + ,weightedAbstractTypes ,numberOfIncludedSubPackages ,maxSubpackageDepth ORDER BY abstractness ASC, maxSubpackageDepth DESC, numberTypes DESC \ No newline at end of file diff --git a/cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher b/cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher index 199e2cf84..2289a2626 100644 --- a/cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher +++ b/cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher @@ -10,16 +10,20 @@ OPTIONAL MATCH (projectdir:Directory)<-[:HAS_ROOT]-(project:TS:Project)-[:CONTAI ,count{(module)-[:EXPORTS]->(:TypeAlias)} AS numberTypeAliases ,count{(module)-[:EXPORTS]->(:Interface)} AS 
numberInterfaces WITH * - ,numberInterfaces + numberTypeAliases + numberAbstractClasses AS numberAbstractTypes + ,numberInterfaces + numberTypeAliases + numberAbstractClasses AS numberAbstractTypes + ,numberInterfaces + numberTypeAliases + (numberAbstractClasses * 0.7) AS weightedAbstractTypes WITH * - ,toFloat(numberAbstractTypes) / (numberTypes + 1E-38) AS abstractness - SET module.abstractness = abstractness - ,module.numberOfAbstractTypes = numberAbstractTypes - ,module.numberOfTypes = numberTypes + ,toFloat(weightedAbstractTypes) / (numberTypes + 1E-38) AS abstractness + SET module.abstractness = abstractness + ,module.numberOfAbstractTypes = numberAbstractTypes + ,module.numberOfAbstractClasses = numberAbstractClasses + ,module.numberOfTypes = numberTypes RETURN projectName ,module.globalFqn AS fullQualifiedModuleName ,module.name AS moduleName ,abstractness ,numberAbstractTypes ,numberTypes + ,numberAbstractClasses + ,weightedAbstractTypes ORDER BY abstractness ASC, numberTypes DESC \ No newline at end of file diff --git a/cypher/Metrics/Clear_all_metrics.cypher b/cypher/Metrics/Clear_all_metrics.cypher index 931932d42..7cf4082b4 100644 --- a/cypher/Metrics/Clear_all_metrics.cypher +++ b/cypher/Metrics/Clear_all_metrics.cypher @@ -28,6 +28,7 @@ REMOVE package.incomingDependencies ,package.abstractness ,package.numberOfTypes ,package.numberOfAbstractTypes + ,package.numberOfAbstractClasses ,package.abstractnessIncludingSubpackages ,package.numberOfAbstractTypesIncludingSubpackages ,package.numberOfTypesIncludingSubpackages From e4cf453699dd050a2a54782d29c926ab640c2854 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 18 Jan 2026 20:58:10 +0100 Subject: [PATCH 7/9] Add abstractness to anomaly detection features --- .gitignore | 6 +- .../anomaly-detection/anomalyDetectionCsv.sh | 7 + .../anomalyDetectionPython.sh | 7 + .../documentation/Architecture.gv | 2 + .../documentation/Architecture.svg | 591 +++++++++--------- 
...yDetectionIsolationForestExploration.ipynb | 37 +- ...etectionFeature-Abstractness-Exists.cypher | 9 + ...yDetectionFeature_Abstractness_Java.cypher | 17 + ...ectionFeature_Abstractness_JavaType.cypher | 17 + ...ature_Abstractness_TypeScriptModule.cypher | 17 + .../tunedAnomalyDetectionExplained.py | 29 +- 11 files changed, 432 insertions(+), 307 deletions(-) create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-Abstractness-Exists.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_Java.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_JavaType.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_TypeScriptModule.cypher diff --git a/.gitignore b/.gitignore index e393661b2..006ba2e6a 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,8 @@ __pycache__/ *.pyc # Optuna (and other) Database data -*.db \ No newline at end of file +*.db + +# Documentation generation +domains/**/documentation/package.json +domains/**/documentation/package-lock.json \ No newline at end of file diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index b4e276dcd..022292a9b 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -64,6 +64,13 @@ anomaly_detection_features() { # Determine the normalized difference between Page Rank and Article Rank if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}" + # Determine the "abstractness" (interfaces = 100%, abstract classes = 70%, classes & functions = 0%) + execute_cypher_queries_until_results 
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_Java.cypher" "${@}" + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}" + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}" } # Run queries to find anomalies in the graph. diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index 67e0465a3..c11a0eb1c 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -111,6 +111,13 @@ anomaly_detection_features() { # Determine the normalized difference between Page Rank and Article Rank if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}" + # Determine the "abstractness" (interfaces = 100%, abstract classes = 70%, classes & functions = 0%) + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_Java.cypher" "${@}" + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}" + execute_cypher_queries_until_results 
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}" } # Execute the Python scripts for anomaly detection. diff --git a/domains/anomaly-detection/documentation/Architecture.gv b/domains/anomaly-detection/documentation/Architecture.gv index 24ce04d5d..61bc691a7 100644 --- a/domains/anomaly-detection/documentation/Architecture.gv +++ b/domains/anomaly-detection/documentation/Architecture.gv @@ -114,6 +114,7 @@ digraph AnomalyDetectionPipeline { BetweennessCentrality [label="Betweenness\nCentrality"]; LocalClusteringCoefficient [label="Local Clustering\nCoefficient"]; Degree [label="Degree\n(in, out, sum)"]; + Abstractness [label="Abstractness\n(Robert C. Martin)"]; } // Anomaly detection model area @@ -152,6 +153,7 @@ digraph AnomalyDetectionPipeline { BetweennessCentrality -> AnomalyStandardizer; LocalClusteringCoefficient -> AnomalyStandardizer; Degree -> AnomalyStandardizer; + Abstractness -> AnomalyStandardizer; // Proxy RandomForest used as a backing/tuning model for the Isolation Forest TuningAnomaly -> IsolationMinCluster; diff --git a/domains/anomaly-detection/documentation/Architecture.svg b/domains/anomaly-detection/documentation/Architecture.svg index e59b1662b..0360932b5 100644 --- a/domains/anomaly-detection/documentation/Architecture.svg +++ b/domains/anomaly-detection/documentation/Architecture.svg @@ -1,701 +1,714 @@ - - - + + AnomalyDetectionPipeline - + cluster_leiden - -Leiden Community Detection + +Leiden Community Detection cluster_fastRP - -Fast Random Projection (FastRP) + +Fast Random Projection (FastRP) cluster_UMAP - -Uniform Manifold Approximation and Projection (UMAP) -Dimensionality Reduction for Visualization + +Uniform Manifold Approximation and Projection (UMAP) +Dimensionality Reduction for Visualization cluster_hdbscan - -Hierarchical Density-Based Spatial Clustering (HDBSCAN) + 
+Hierarchical Density-Based Spatial Clustering (HDBSCAN) cluster_graph_features - -Graph (Algorithm) Features + +Graph (Algorithm) Features cluster_anomaly - -Anomaly Detection Model + +Anomaly Detection Model cluster_explainability - -Explainable AI (SHAP) + +Explainable AI (SHAP) Tuning_Leiden - -Tuning -(Optuna) + +Tuning +(Optuna) Leiden_Gamma - -gamma + +gamma Tuning_Leiden->Leiden_Gamma - - + + Leiden_Theta - -theta + +theta Tuning_Leiden->Leiden_Theta - - + + Leiden_Algorithm - -Leiden Community Detection + +Leiden Community Detection Leiden_Gamma->Leiden_Algorithm - - + + Leiden_Theta->Leiden_Algorithm - - + + Leiden_Algorithm->Tuning_Leiden - - -modularity + + +modularity Leiden_Algorithm->Tuning_Leiden - - -size + + +size CommunityId - -Community + +Community Leiden_Algorithm->CommunityId - - + + Tuning_HDBSCAN - -Tuning -(Optuna) + +Tuning +(Optuna) CommunityId->Tuning_HDBSCAN - - -reference + + +reference Tuning_FastRP - -Tuning -(Optuna) + +Tuning +(Optuna) FastRP_Dimension - -dimension + +dimension Tuning_FastRP->FastRP_Dimension - - + + FastRP_Normalization_Strength - -normalization strength + +normalization strength Tuning_FastRP->FastRP_Normalization_Strength - - + + FastRP_Forth_Iteration_Weight - -forth iteration weight + +forth iteration weight Tuning_FastRP->FastRP_Forth_Iteration_Weight - - + + FastRP_Algorithm - -FastRP + +FastRP FastRP_Dimension->FastRP_Algorithm - - + + FastRP_Normalization_Strength->FastRP_Algorithm - - + + FastRP_Forth_Iteration_Weight->FastRP_Algorithm - - + + FastRP_Algorithm->Tuning_FastRP - - -adjusted mutual info score -(incl. preview clustering) + + +adjusted mutual info score +(incl. 
preview clustering) NodeEmbeddings - -Node Embeddings + +Node Embeddings FastRP_Algorithm->NodeEmbeddings - - + + UMAP_Algorithm - -UMAP + +UMAP NodeEmbeddings->UMAP_Algorithm - - + + HDBSCAN_Node - -HDBSCAN + +HDBSCAN NodeEmbeddings->HDBSCAN_Node - - + + - + AnomalyPCA - -Principal Component -Analysis (PCA) + +Principal Component +Analysis (PCA) NodeEmbeddings->AnomalyPCA - - + + UMAP_Coordinates - -2D Coordinates + +2D Coordinates UMAP_Algorithm->UMAP_Coordinates - - + + HDBSCAN_Min_Cluster_Size - -Min Cluster Size + +Min Cluster Size Tuning_HDBSCAN->HDBSCAN_Min_Cluster_Size - - + + HDBSCAN_Min_Samples - -Min Samples + +Min Samples Tuning_HDBSCAN->HDBSCAN_Min_Samples - - + + HDBSCAN_Node->Tuning_HDBSCAN - - -adjusted mutual info score + + +adjusted mutual info score ClusterLabel - -Label + +Label HDBSCAN_Node->ClusterLabel - - + + ClusterRadius - -Radius -(avg,max) + +Radius +(avg,max) HDBSCAN_Node->ClusterRadius - - + + ClusterSize - -Size + +Size HDBSCAN_Node->ClusterSize - - + + NormDistToMedoid - -Normalized Distance -To Medoid + +Normalized Distance +To Medoid HDBSCAN_Node->NormDistToMedoid - - + + ClusterNoise - -Noise -(label=-1) + +Noise +(label=-1) HDBSCAN_Node->ClusterNoise - - + + ClusterProbability - -Probability + +Probability HDBSCAN_Node->ClusterProbability - - + + ClusterApproximationOutlierScore - -Approximation -OutlierScore -(= 1 - Probability) + +Approximation +OutlierScore +(= 1 - Probability) HDBSCAN_Node->ClusterApproximationOutlierScore - - + + HDBSCAN_Min_Cluster_Size->HDBSCAN_Node - - + + HDBSCAN_Min_Samples->HDBSCAN_Node - - + + - + AnomalyStandardizer - -Standardizer + +Standardizer ClusterRadius->AnomalyStandardizer - - + + NormDistToMedoid->AnomalyStandardizer - - + + ClusterApproximationOutlierScore->AnomalyStandardizer - - + + ArticleRank - -ArticleRank + +ArticleRank ArticleRank->AnomalyStandardizer - - + + PageRank - -PageRank + +PageRank PageRank->AnomalyStandardizer - - + + PageRank_minus_ArticleRank - -PageRank - -ArticleRank 
+ +PageRank - +ArticleRank PageRank_minus_ArticleRank->AnomalyStandardizer - - + + BetweennessCentrality - -Betweenness -Centrality + +Betweenness +Centrality BetweennessCentrality->AnomalyStandardizer - - + + LocalClusteringCoefficient - -Local Clustering -Coefficient + +Local Clustering +Coefficient LocalClusteringCoefficient->AnomalyStandardizer - - + + Degree - -Degree -(in, out, sum) + +Degree +(in, out, sum) Degree->AnomalyStandardizer - - + + - + +Abstractness + +Abstractness +(Robert C. Martin) + + + +Abstractness->AnomalyStandardizer + + + + + TuningAnomaly - -Tuning -(Optuna) + +Tuning +(Optuna) - + IsolationMinCluster - -Min Cluster Size + +Min Cluster Size - + TuningAnomaly->IsolationMinCluster - - + + - + IsolationEstimators - -n estimators + +n estimators - + TuningAnomaly->IsolationEstimators - - + + - + ProxyEstimators - -n estimators + +n estimators - + TuningAnomaly->ProxyEstimators - - + + - + ProxyMaxDepth - -max depth + +max depth - + TuningAnomaly->ProxyMaxDepth - - + + - + IsolationForest - -Isolation Forest -Anomaly Detector + +Isolation Forest +Anomaly Detector - + IsolationMinCluster->IsolationForest - - + + - + IsolationEstimators->IsolationForest - - + + - + ProxyRandomForest - -RandomForest -(Proxy) + +RandomForest +(Proxy) - + ProxyEstimators->ProxyRandomForest - - + + - + ProxyMaxDepth->ProxyRandomForest - - + + - + AnomalyStandardizer->IsolationForest - - + + - + AnomalyPCA->IsolationForest - - + + - + IsolationForest->ProxyRandomForest - - -reference + + +reference - + AnomalyScore - -Score + +Score - + IsolationForest->AnomalyScore - - + + - + AnomalyLabel - -Label + +Label - + IsolationForest->AnomalyLabel - - + + - + ProxyRandomForest->TuningAnomaly - - -f1 score -(cross validation) + + +f1 score +(cross validation) - + SHAP - - - -SHAP TreeExplainer + + + +SHAP TreeExplainer - + ProxyRandomForest->SHAP - - + + - + SHAP_Values - -Top SHAP Values + +Top SHAP Values - + SHAP->SHAP_Values - - + + - + SHAP_Features - -Top Features + 
+Top Features - + SHAP->SHAP_Features - - + + - + SHAP_Embedding_Sum - -Node Embeddings -SHAP Sum + +Node Embeddings +SHAP Sum - + SHAP->SHAP_Embedding_Sum - - + + diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index 530fe7606..c2ab9a593 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -36,7 +36,7 @@ "| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n", "| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n", "| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n", - "\n" + "| `Abstractness` (Robert C. 
Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n" ] }, { @@ -218,6 +218,7 @@ " ,incomingDependencies\n", " ,outgoingDependencies\n", " ,incomingDependencies + outgoingDependencies AS degree\n", + " ,coalesce(codeUnit.abstractness, 0.0) AS abstractness\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding\n", " ,codeUnit.centralityPageRank AS pageRank\n", " ,codeUnit.centralityArticleRank AS articleRank\n", @@ -740,6 +741,23 @@ " return features" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb829d75", + "metadata": {}, + "outputs": [], + "source": [ + "def get_top_anomalies(\n", + " anomaly_detected_features: pd.DataFrame, \n", + " anomaly_label_column: str = \"anomalyLabel\",\n", + " anomaly_score_column: str = \"anomalyScore\",\n", + " top_n: int = 10\n", + ") -> pd.DataFrame:\n", + " anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n", + " return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(top_n)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -752,8 +770,7 @@ " anomaly_label_column: str = \"anomalyLabel\",\n", " anomaly_score_column: str = \"anomalyScore\"\n", ") -> pd.DataFrame:\n", - " anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n", - " return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(10)" + " return get_top_anomalies(anomaly_detected_features, anomaly_label_column, anomaly_score_column, top_n=10)" ] }, { @@ -2039,6 +2056,18 @@ "display(get_top_10_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0e61b72", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO delete when finished tweaking\n", + 
"top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n", + "print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))" + ] + }, { "cell_type": "markdown", "id": "c314821d", @@ -2282,7 +2311,7 @@ "outputs": [], "source": [ "java_type_anomaly_detection_importances_series = pd.Series(java_type_anomaly_detection_results.feature_importances, index=java_type_anomaly_detection_feature_names).sort_values(ascending=False)\n", - "print(java_type_anomaly_detection_importances_series.head(10))" + "print(java_type_anomaly_detection_importances_series.head(25))" ] }, { diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-Abstractness-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-Abstractness-Exists.cypher new file mode 100644 index 000000000..9aa666ef6 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-Abstractness-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with the property "abstractness" if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.abstractness IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.abstractness AS abstractness + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_Java.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_Java.cypher new file mode 100644 index 000000000..6f4cbd350 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_Java.cypher @@ -0,0 +1,17 @@ +// Calculate and set abstractness for Java Code Packages or Artifacts and return a 0.1 ranged bin distribution. 
+ +MATCH (javaCodeUnit:Java&(Package|Artifact)) + WITH javaCodeUnit + ,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Type) } AS numberTypes + ,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Class{abstract:true}) } AS numberAbstractClasses + ,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Annotation) } AS numberAnnotations + ,COUNT{ (javaCodeUnit)-[:CONTAINS]->(:Interface) } AS numberInterfaces + WITH * + ,numberInterfaces + numberAnnotations + (numberAbstractClasses * 0.7) AS weightedAbstractTypes + WITH * + ,toFloat(weightedAbstractTypes) / (numberTypes + 1E-38) AS abstractness + SET javaCodeUnit.abstractness = abstractness +RETURN round(abstractness, 1) AS abstractnessBin + ,count(*) AS packageCount + ,collect(javaCodeUnit.name)[0..4] AS examples +ORDER BY abstractnessBin ASC \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_JavaType.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_JavaType.cypher new file mode 100644 index 000000000..54c5f5694 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_JavaType.cypher @@ -0,0 +1,17 @@ +// Calculate and set abstractness for Java Types and returns the distribution. 
+ +MATCH (javaCodeUnit:Java:Type) + WITH javaCodeUnit + ,javaCodeUnit:Annotation AS isAnnotation + ,javaCodeUnit:Interface AS isInterface + ,(javaCodeUnit:Class AND javaCodeUnit.abstract) AS isAbstractClass + WITH * + ,CASE WHEN isAnnotation OR isInterface THEN 1.0 + WHEN isAbstractClass THEN 0.7 + ELSE 0.0 + END AS abstractness + SET javaCodeUnit.abstractness = abstractness +RETURN abstractness + ,count(*) AS typeCount + ,collect(javaCodeUnit.name)[0..4] AS examples +ORDER BY abstractness ASC \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_TypeScriptModule.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_TypeScriptModule.cypher new file mode 100644 index 000000000..fbec722f9 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature_Abstractness_TypeScriptModule.cypher @@ -0,0 +1,17 @@ +//Calculate and set Abstractness for TypeScript Modules. + +MATCH (module:TS:Module) + WITH module + ,count{(module)-[:EXPORTS]->(:TS)} AS numberTypes + ,count{(module)-[:EXPORTS]->(:Class{abstract:true})} AS numberAbstractClasses + ,count{(module)-[:EXPORTS]->(:TypeAlias)} AS numberTypeAliases + ,count{(module)-[:EXPORTS]->(:Interface)} AS numberInterfaces + WITH * + ,numberInterfaces + numberTypeAliases + (numberAbstractClasses * 0.7) AS numberAbstractTypes + WITH * + ,toFloat(numberAbstractTypes) / (numberTypes + 1E-38) AS abstractness + SET module.abstractness = abstractness +RETURN round(abstractness, 1) AS abstractnessBin + ,count(*) AS packageCount + ,collect(module.name)[0..4] AS examples +ORDER BY abstractnessBin ASC \ No newline at end of file diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 53c5563ea..0b9eb7374 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -263,6 +263,7 
@@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr ,incomingDependencies ,outgoingDependencies ,incomingDependencies + outgoingDependencies AS degree + ,coalesce(codeUnit.abstractness, 0.0) AS abstractness ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding ,codeUnit.centralityPageRank AS pageRank ,codeUnit.centralityArticleRank AS articleRank @@ -596,22 +597,24 @@ def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detectio return anomaly_detection_results -def get_top_10_anomalies( +def get_top_n_anomalies( anomaly_detected_features: pd.DataFrame, anomaly_label_column: str = "anomalyLabel", - anomaly_score_column: str = "anomalyScore" + anomaly_score_column: str = "anomalyScore", + top_n: int = 10 ) -> pd.DataFrame: anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1] - return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(10) + return anomalies.sort_values(by=anomaly_score_column, ascending=False).head(top_n) -def get_top_10_non_anomalies( +def get_top_n_non_anomalies( anomaly_detected_features: pd.DataFrame, anomaly_label_column: str = "anomalyLabel", - anomaly_score_column: str = "anomalyScore" + anomaly_score_column: str = "anomalyScore", + top_n: int = 10 ) -> pd.DataFrame: anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] != 1] - return anomalies.sort_values(by=anomaly_score_column, ascending=True).head(10) + return anomalies.sort_values(by=anomaly_score_column, ascending=True).head(top_n) def plot_anomalies( @@ -1189,10 +1192,10 @@ def output_top_shap_explained_global_features_as_markdown_table( features = add_anomaly_detection_results_to_features(features, anomaly_detection_results) if parameters.is_verbose(): - print("tunedAnomalyDetectionExplained: Top 10 anomalies:") - print(get_top_10_anomalies(features).reset_index(drop=True)) - print("tunedAnomalyDetectionExplained: Top 10 
non-anomalies:") - print(get_top_10_non_anomalies(features).reset_index(drop=True)) + print("tunedAnomalyDetectionExplained: Top 20 anomalies:") + print(get_top_n_anomalies(features, top_n=20).reset_index(drop=True)) + print("tunedAnomalyDetectionExplained: Top 20 non-anomalies:") + print(get_top_n_non_anomalies(features, top_n=20).reset_index(drop=True)) plot_anomalies( features_to_visualize=features, @@ -1213,8 +1216,8 @@ def output_top_shap_explained_global_features_as_markdown_table( if parameters.is_verbose(): feature_importances = pd.Series(anomaly_detection_results.feature_importances, index=feature_names).sort_values(ascending=False) - print("tunedAnomalyDetectionExplained: Most influential features for anomaly detection according to the proxy model directly without SHAP (top 10):") - print(feature_importances.head(10)) + print("tunedAnomalyDetectionExplained: Most influential features for anomaly detection according to the proxy model directly without SHAP (top 20):") + print(feature_importances.head(20)) explanation_results = explain_anomalies_with_shap( random_forest_model=anomaly_detection_results.random_forest_classifier, @@ -1231,7 +1234,7 @@ def output_top_shap_explained_global_features_as_markdown_table( ) plot_all_shap_explained_local_feature_importance( - data=get_top_10_anomalies(features), + data=get_top_n_anomalies(features), explanation_results=explanation_results, prepared_features=features_prepared, feature_names=feature_names, From 8620b013b7e2d5ed65bfed2997644f0078e2d449 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 20 Jan 2026 08:50:13 +0100 Subject: [PATCH 8/9] Add weakly/strongly connected components to anomaly detection features --- .../anomaly-detection/anomalyDetectionCsv.sh | 9 + .../anomalyDetectionPython.sh | 9 + .../documentation/Architecture.gv | 4 + .../documentation/Architecture.svg | 610 +++++++++--------- ...yDetectionIsolationForestExploration.ipynb | 6 +- 
...onnectedComponents-CreateDependency.cypher | 29 + ...onglyConnectedComponents-CreateNode.cypher | 26 + ...-StronglyConnectedComponents-Exists.cypher | 9 + ...e-StronglyConnectedComponents-Write.cypher | 18 + ...eaklyConnectedComponents-CreateNode.cypher | 45 ++ ...re-WeaklyConnectedComponents-Exists.cypher | 9 + ...ure-WeaklyConnectedComponents-Write.cypher | 18 + .../AnomalyDetectionReset-Algorithms.cypher | 4 +- ...onReset-StronglyConnectedComponents.cypher | 7 + ...tionReset-WeaklyConnectedComponents.cypher | 7 + .../tunedAnomalyDetectionExplained.py | 4 + 16 files changed, 519 insertions(+), 295 deletions(-) create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher create mode 100644 domains/anomaly-detection/reset/AnomalyDetectionReset-StronglyConnectedComponents.cypher create mode 100644 domains/anomaly-detection/reset/AnomalyDetectionReset-WeaklyConnectedComponents.cypher diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index 022292a9b..d8d5b1629 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -71,6 +71,15 @@ anomaly_detection_features() { 
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}" execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}" + # Determines strongly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher" "${@}" + # Determines weakly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}" } # Run queries to find anomalies in the graph. 
diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index c11a0eb1c..049817eec 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -118,6 +118,15 @@ anomaly_detection_features() { "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_JavaType.cypher" "${@}" execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Abstractness-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature_Abstractness_TypeScriptModules.cypher" "${@}" + # Determines strongly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher" "${@}" + # Determines weakly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}" } # Execute the Python scripts for anomaly detection. 
diff --git a/domains/anomaly-detection/documentation/Architecture.gv b/domains/anomaly-detection/documentation/Architecture.gv index 61bc691a7..3f0dbb829 100644 --- a/domains/anomaly-detection/documentation/Architecture.gv +++ b/domains/anomaly-detection/documentation/Architecture.gv @@ -115,6 +115,8 @@ digraph AnomalyDetectionPipeline { LocalClusteringCoefficient [label="Local Clustering\nCoefficient"]; Degree [label="Degree\n(in, out, sum)"]; Abstractness [label="Abstractness\n(Robert C. Martin)"]; + StronglyConnectedComponents[label="StronglyConnectedComponents\n(member count)"] + WeaklyConnectedComponents [label="WeaklyConnectedComponents\n(median members size)"] } // Anomaly detection model area @@ -154,6 +156,8 @@ digraph AnomalyDetectionPipeline { LocalClusteringCoefficient -> AnomalyStandardizer; Degree -> AnomalyStandardizer; Abstractness -> AnomalyStandardizer; + StronglyConnectedComponents -> AnomalyStandardizer; + WeaklyConnectedComponents -> AnomalyStandardizer; // Proxy RandomForest used as a backing/tuning model for the Isolation Forest TuningAnomaly -> IsolationMinCluster; diff --git a/domains/anomaly-detection/documentation/Architecture.svg b/domains/anomaly-detection/documentation/Architecture.svg index 0360932b5..db526b9ea 100644 --- a/domains/anomaly-detection/documentation/Architecture.svg +++ b/domains/anomaly-detection/documentation/Architecture.svg @@ -4,711 +4,735 @@ - - + + AnomalyDetectionPipeline - + cluster_leiden - -Leiden Community Detection + +Leiden Community Detection cluster_fastRP - -Fast Random Projection (FastRP) + +Fast Random Projection (FastRP) cluster_UMAP - -Uniform Manifold Approximation and Projection (UMAP) -Dimensionality Reduction for Visualization + +Uniform Manifold Approximation and Projection (UMAP) +Dimensionality Reduction for Visualization cluster_hdbscan - -Hierarchical Density-Based Spatial Clustering (HDBSCAN) + +Hierarchical Density-Based Spatial Clustering (HDBSCAN) cluster_graph_features - -Graph 
(Algorithm) Features + +Graph (Algorithm) Features cluster_anomaly - -Anomaly Detection Model + +Anomaly Detection Model cluster_explainability - -Explainable AI (SHAP) + +Explainable AI (SHAP) Tuning_Leiden - -Tuning -(Optuna) + +Tuning +(Optuna) Leiden_Gamma - -gamma + +gamma Tuning_Leiden->Leiden_Gamma - - + + Leiden_Theta - -theta + +theta Tuning_Leiden->Leiden_Theta - - + + Leiden_Algorithm - -Leiden Community Detection + +Leiden Community Detection Leiden_Gamma->Leiden_Algorithm - - + + Leiden_Theta->Leiden_Algorithm - - + + Leiden_Algorithm->Tuning_Leiden - - -modularity + + +modularity Leiden_Algorithm->Tuning_Leiden - - -size + + +size CommunityId - -Community + +Community Leiden_Algorithm->CommunityId - - + + Tuning_HDBSCAN - -Tuning -(Optuna) + +Tuning +(Optuna) CommunityId->Tuning_HDBSCAN - - -reference + + +reference Tuning_FastRP - -Tuning -(Optuna) + +Tuning +(Optuna) FastRP_Dimension - -dimension + +dimension Tuning_FastRP->FastRP_Dimension - - + + FastRP_Normalization_Strength - -normalization strength + +normalization strength Tuning_FastRP->FastRP_Normalization_Strength - - + + FastRP_Forth_Iteration_Weight - -forth iteration weight + +forth iteration weight Tuning_FastRP->FastRP_Forth_Iteration_Weight - - + + FastRP_Algorithm - -FastRP + +FastRP FastRP_Dimension->FastRP_Algorithm - - + + FastRP_Normalization_Strength->FastRP_Algorithm - - + + FastRP_Forth_Iteration_Weight->FastRP_Algorithm - - + + FastRP_Algorithm->Tuning_FastRP - - -adjusted mutual info score -(incl. preview clustering) + + +adjusted mutual info score +(incl. 
preview clustering) NodeEmbeddings - -Node Embeddings + +Node Embeddings FastRP_Algorithm->NodeEmbeddings - - + + UMAP_Algorithm - -UMAP + +UMAP NodeEmbeddings->UMAP_Algorithm - - + + HDBSCAN_Node - -HDBSCAN + +HDBSCAN NodeEmbeddings->HDBSCAN_Node - - + + - + AnomalyPCA - -Principal Component -Analysis (PCA) + +Principal Component +Analysis (PCA) NodeEmbeddings->AnomalyPCA - - + + UMAP_Coordinates - -2D Coordinates + +2D Coordinates UMAP_Algorithm->UMAP_Coordinates - - + + HDBSCAN_Min_Cluster_Size - -Min Cluster Size + +Min Cluster Size Tuning_HDBSCAN->HDBSCAN_Min_Cluster_Size - - + + HDBSCAN_Min_Samples - -Min Samples + +Min Samples Tuning_HDBSCAN->HDBSCAN_Min_Samples - - + + HDBSCAN_Node->Tuning_HDBSCAN - - -adjusted mutual info score + + +adjusted mutual info score ClusterLabel - -Label + +Label HDBSCAN_Node->ClusterLabel - - + + ClusterRadius - -Radius -(avg,max) + +Radius +(avg,max) HDBSCAN_Node->ClusterRadius - - + + ClusterSize - -Size + +Size HDBSCAN_Node->ClusterSize - - + + NormDistToMedoid - -Normalized Distance -To Medoid + +Normalized Distance +To Medoid HDBSCAN_Node->NormDistToMedoid - - + + ClusterNoise - -Noise -(label=-1) + +Noise +(label=-1) HDBSCAN_Node->ClusterNoise - - + + ClusterProbability - -Probability + +Probability HDBSCAN_Node->ClusterProbability - - + + ClusterApproximationOutlierScore - -Approximation -OutlierScore -(= 1 - Probability) + +Approximation +OutlierScore +(= 1 - Probability) HDBSCAN_Node->ClusterApproximationOutlierScore - - + + HDBSCAN_Min_Cluster_Size->HDBSCAN_Node - - + + HDBSCAN_Min_Samples->HDBSCAN_Node - - + + - + AnomalyStandardizer - -Standardizer + +Standardizer ClusterRadius->AnomalyStandardizer - - + + NormDistToMedoid->AnomalyStandardizer - - + + ClusterApproximationOutlierScore->AnomalyStandardizer - - + + ArticleRank - -ArticleRank + +ArticleRank ArticleRank->AnomalyStandardizer - - + + PageRank - -PageRank + +PageRank PageRank->AnomalyStandardizer - - + + PageRank_minus_ArticleRank - -PageRank - -ArticleRank 
+ +PageRank - +ArticleRank PageRank_minus_ArticleRank->AnomalyStandardizer - - + + BetweennessCentrality - -Betweenness -Centrality + +Betweenness +Centrality BetweennessCentrality->AnomalyStandardizer - - + + LocalClusteringCoefficient - -Local Clustering -Coefficient + +Local Clustering +Coefficient LocalClusteringCoefficient->AnomalyStandardizer - - + + Degree - -Degree -(in, out, sum) + +Degree +(in, out, sum) Degree->AnomalyStandardizer - - + + Abstractness - -Abstractness -(Robert C. Martin) + +Abstractness +(Robert C. Martin) Abstractness->AnomalyStandardizer - - + + - + +StronglyConnectedComponents + +StronglyConnectedComponents(member count) + + + +StronglyConnectedComponents->AnomalyStandardizer + + + + + +WeaklyConnectedComponents + +WeaklyConnectedComponents(median members size) + + + +WeaklyConnectedComponents->AnomalyStandardizer + + + + + TuningAnomaly - -Tuning -(Optuna) + +Tuning +(Optuna) - + IsolationMinCluster - -Min Cluster Size + +Min Cluster Size - + TuningAnomaly->IsolationMinCluster - - + + - + IsolationEstimators - -n estimators + +n estimators - + TuningAnomaly->IsolationEstimators - - + + - + ProxyEstimators - -n estimators + +n estimators - + TuningAnomaly->ProxyEstimators - - + + - + ProxyMaxDepth - -max depth + +max depth - + TuningAnomaly->ProxyMaxDepth - - + + - + IsolationForest - -Isolation Forest -Anomaly Detector + +Isolation Forest +Anomaly Detector - + IsolationMinCluster->IsolationForest - - + + - + IsolationEstimators->IsolationForest - - + + - + ProxyRandomForest - -RandomForest -(Proxy) + +RandomForest +(Proxy) - + ProxyEstimators->ProxyRandomForest - - + + - + ProxyMaxDepth->ProxyRandomForest - - + + - + AnomalyStandardizer->IsolationForest - - + + - + AnomalyPCA->IsolationForest - - + + - + IsolationForest->ProxyRandomForest - - -reference + + +reference - + AnomalyScore - -Score + +Score - + IsolationForest->AnomalyScore - - + + - + AnomalyLabel - -Label + +Label - + IsolationForest->AnomalyLabel - - + + - + 
ProxyRandomForest->TuningAnomaly - - -f1 score -(cross validation) + + +f1 score +(cross validation) - + SHAP - - - -SHAP TreeExplainer + + + +SHAP TreeExplainer - + ProxyRandomForest->SHAP - - + + - + SHAP_Values - -Top SHAP Values + +Top SHAP Values - + SHAP->SHAP_Values - - + + - + SHAP_Features - -Top Features + +Top Features - + SHAP->SHAP_Features - - + + - + SHAP_Embedding_Sum - -Node Embeddings -SHAP Sum + +Node Embeddings +SHAP Sum - + SHAP->SHAP_Embedding_Sum - - + + diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index c2ab9a593..36a0d0990 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -36,7 +36,8 @@ "| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n", "| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n", "| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n", - "| `Abstractness` (Robert C. Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n" + "| `Abstractness` (Robert C. 
Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n", + "| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n" ] }, { @@ -210,6 +211,8 @@ " ,coalesce(codeUnit.outgoingDependencies, 0) AS outgoingDependencies\n", " ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName\n", " ,coalesce(artifactName, projectName, \"\") AS projectName\n", + " OPTIONAL MATCH (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(stronglyConnectedComponent:StronglyConnectedComponent)\n", + " OPTIONAL MATCH (codeUnit)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(weaklyConnectedComponent:WeaklyConnectedComponent)\n", " RETURN DISTINCT \n", " codeUnitName\n", " ,codeUnit.name AS shortCodeUnitName\n", @@ -232,6 +235,7 @@ " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + " ,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY\n", " \"\"\"\n", diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher new file 
mode 100644 index 000000000..67f040ff0 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher @@ -0,0 +1,29 @@ +// Create nodes for strongly connected components and connect them to their members. Requires "AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode". + + MATCH (sourceCodeUnit)-[codeUnitDependency:DEPENDS_ON]->(targetCodeUnit) + WHERE $projection_node_label IN labels(sourceCodeUnit) + AND $projection_node_label IN labels(targetCodeUnit) + MATCH (sourceCodeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(sourceComponent:StronglyConnectedComponent) + MATCH (targetCodeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(targetComponent:StronglyConnectedComponent) + WHERE sourceComponent <> targetComponent + WITH sourceComponent + ,targetComponent + ,count(*) AS weightCount + ,sum(codeUnitDependency.weight) AS weight + ,CASE $projection_weight_property + WHEN '' THEN sum(codeUnitDependency.weight) + ELSE sum(codeUnitDependency[$projection_weight_property]) + END AS weightSelected +// For debugging purposes + // RETURN sourceComponent.name + '-' + sourceComponent.id AS sourceComponentNameId + // ,targetComponent.name + '-' + targetComponent.id AS targetComponentNameId + // ,weightCount + // ,weight + // ,weightSelected + // ,count(*) AS occurs + // LIMIT 50 + // ORDER BY occurs DESC, sourceComponentNameId, targetComponentNameId + MERGE (sourceComponent)-[componentDependency:DEPENDS_ON]->(targetComponent) + SET componentDependency.weightCount = weightCount + ,componentDependency.weight = weight + ,componentDependency[$projection_weight_property] = weightSelected \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher new file mode 100644 index 000000000..fc3b0b319 --- /dev/null +++ 
b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher @@ -0,0 +1,26 @@ +// Create nodes for strongly connected components and connect them to their members. Requires "AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher". + +// 1) Select all code units that belong to a strongly connected component +// and sort them by PageRank (used later for naming the component) MATCH (codeUnit) + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityStronglyConnectedComponentId IS NOT NULL + ORDER BY codeUnit.centralityPageRank DESC +// 2) Group code units by strongly connected component id + WITH codeUnit.communityStronglyConnectedComponentId AS componentId + ,collect(codeUnit) AS members + ,count(codeUnit) AS componentSize +// 3) Create or update the StronglyConnectedComponent node with member type label e.g. ("TypeMembers") +// - size: number of code units in the component +// - name: derived from the highest PageRank member + MERGE (component:StronglyConnectedComponent {id: componentId}) + WITH * + ,CASE componentSize WHEN = 1 THEN 'Component ' ELSE 'Cycle around ' END AS componentNamePrefix + CALL apoc.create.addLabels(component, [$projection_node_label + 'Members']) YIELD node + SET component.size = componentSize + ,component.name = componentNamePrefix + members[0].name +// 4) Expand members so we can attach relationships + WITH component, members + UNWIND members AS codeUnit +// 5) Connect the code units to the StronglyConnectedComponent they belong to. 
+ MERGE (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(component) \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher new file mode 100644 index 000000000..b73542f0e --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a "communityStronglyConnectedComponentId" if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityStronglyConnectedComponentId IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.communityStronglyConnectedComponentId AS communityStronglyConnectedComponentId + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher new file mode 100644 index 000000000..309d46973 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-StronglyConnectedComponents-Write.cypher @@ -0,0 +1,18 @@ +// Calculates and writes the Strongly Connected Components for anomaly detection + +CALL gds.scc.write( + $projection_name + '-directed-cleaned', { + writeProperty: 'communityStronglyConnectedComponentId', + consecutiveIds: true +}) + YIELD componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis, componentDistribution +RETURN componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis + ,componentDistribution.min + ,componentDistribution.mean + ,componentDistribution.max + ,componentDistribution.p50 + ,componentDistribution.p75 + ,componentDistribution.p90 
+ ,componentDistribution.p95 + ,componentDistribution.p99 + ,componentDistribution.p999 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher new file mode 100644 index 000000000..08aa334cf --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher @@ -0,0 +1,45 @@ +// Create nodes for weakly connected components and connect them to their members. Requires "AnomalyDetectionFeature-StronglyConnectedComponents-CreateNode.cypher". + +// 1) Select all code units that belong to a weakly connected component +// and sort them by PageRank (used later for naming the component) MATCH (codeUnit) + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityWeaklyConnectedComponentId IS NOT NULL + ORDER BY codeUnit.centralityPageRank DESC +// 2) Group code units by weakly connected component id + WITH codeUnit.communityWeaklyConnectedComponentId AS componentId + ,collect(codeUnit) AS members +// 3) Create or update the WeaklyConnectedComponent node with member type label e.g. ("TypeMembers") +// - size: number of code units in the component +// - name: derived from the highest PageRank member + MERGE (component:WeaklyConnectedComponent {id: componentId}) + WITH * + CALL apoc.create.addLabels(component, [$projection_node_label + 'Members']) YIELD node + SET component.size = size(members) + ,component.name = 'Island around ' + members[0].name +// 4) Expand members so we can attach relationships and discover Strongly Connected Components + WITH component, members + UNWIND members AS codeUnit +// 5) Connect the code units to the WeaklyConnectedComponent they belong to. 
+// Additionally, find the StronglyConnectedComponent each code unit belongs to +// and connect it to the WeaklyConnectedComponent as well. +// Layers: code unit -> strongly connected component -> weakly connected component +OPTIONAL MATCH (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(stronglyConnectedComponent:StronglyConnectedComponent) + MERGE (codeUnit)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(component) + MERGE (stronglyConnectedComponent)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(component) +// 6) Collect code units per StronglyConnectedComponent within this WeaklyConnectedComponent +// (this allows us to compute StronglyConnectedComponent sizes) + WITH component, stronglyConnectedComponent, collect(DISTINCT codeUnit) AS stronglyConnectedComponentMembers + WHERE stronglyConnectedComponent IS NOT NULL +// 7) Compute the size of each StronglyConnectedComponent within each WeaklyConnectedComponent + WITH component, size(stronglyConnectedComponentMembers) AS stronglyConnectedComponentSize +// 8) Compute the StronglyConnectedComponent size percentiles per WeaklyConnectedComponent + WITH component + ,percentileDisc(stronglyConnectedComponentSize, 0.25) AS stronglyConnectedComponentSizePercentile25 + ,percentileDisc(stronglyConnectedComponentSize, 0.50) AS stronglyConnectedComponentSizePercentile50 + ,percentileDisc(stronglyConnectedComponentSize, 0.75) AS stronglyConnectedComponentSizePercentile75 +// 9) Store the computed StronglyConnectedComponent size percentiles on the WeaklyConnectedComponent node + SET component.stronglyConnectedComponentSizePercentile25 = stronglyConnectedComponentSizePercentile25 + ,component.stronglyConnectedComponentSizePercentile50 = stronglyConnectedComponentSizePercentile50 + ,component.stronglyConnectedComponentSizePercentile75 = stronglyConnectedComponentSizePercentile75 + ,component.stronglyConnectedComponentSizeInterQuartileRange = stronglyConnectedComponentSizePercentile75 - stronglyConnectedComponentSizePercentile25 \ No newline at end of 
file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher new file mode 100644 index 000000000..f46e7c3f3 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a "communityWeaklyConnectedComponentId" if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityWeaklyConnectedComponentId IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.communityWeaklyConnectedComponentId AS communityWeaklyConnectedComponentId + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher new file mode 100644 index 000000000..374e17fc8 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher @@ -0,0 +1,18 @@ +// Calculates and writes the Weakly Connected Components for anomaly detection + +CALL gds.wcc.write( + $projection_name + '-cleaned', { + writeProperty: 'communityWeaklyConnectedComponentId', + consecutiveIds: true +}) + YIELD componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis, componentDistribution +RETURN componentCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis + ,componentDistribution.min + ,componentDistribution.mean + ,componentDistribution.max + ,componentDistribution.p50 + ,componentDistribution.p75 + ,componentDistribution.p90 + ,componentDistribution.p95 + ,componentDistribution.p99 + ,componentDistribution.p999 \ No newline at end of file diff 
--git a/domains/anomaly-detection/reset/AnomalyDetectionReset-Algorithms.cypher b/domains/anomaly-detection/reset/AnomalyDetectionReset-Algorithms.cypher index 34638f4c3..60b5773fb 100644 --- a/domains/anomaly-detection/reset/AnomalyDetectionReset-Algorithms.cypher +++ b/domains/anomaly-detection/reset/AnomalyDetectionReset-Algorithms.cypher @@ -5,4 +5,6 @@ REMOVE codeUnit.communityLocalClusteringCoefficient ,codeUnit.centralityArticleRank ,codeUnit.centralityPageRank - ,codeUnit.centralityBetweenness \ No newline at end of file + ,codeUnit.centralityBetweenness + ,codeUnit.communityStronglyConnectedComponentId + ,codeUnit.communityWeaklyConnectedComponentId \ No newline at end of file diff --git a/domains/anomaly-detection/reset/AnomalyDetectionReset-StronglyConnectedComponents.cypher b/domains/anomaly-detection/reset/AnomalyDetectionReset-StronglyConnectedComponents.cypher new file mode 100644 index 000000000..bf5ab81b0 --- /dev/null +++ b/domains/anomaly-detection/reset/AnomalyDetectionReset-StronglyConnectedComponents.cypher @@ -0,0 +1,7 @@ +// Reset all StronglyConnectedComponent nodes and their relationships + + MATCH (component:StronglyConnectedComponent) + CALL { WITH component + DETACH DELETE component + } IN TRANSACTIONS OF 1000 ROWS + RETURN count(component) as numberOfDeletedComponents \ No newline at end of file diff --git a/domains/anomaly-detection/reset/AnomalyDetectionReset-WeaklyConnectedComponents.cypher b/domains/anomaly-detection/reset/AnomalyDetectionReset-WeaklyConnectedComponents.cypher new file mode 100644 index 000000000..326c44b1e --- /dev/null +++ b/domains/anomaly-detection/reset/AnomalyDetectionReset-WeaklyConnectedComponents.cypher @@ -0,0 +1,7 @@ +// Reset all WeaklyConnectedComponent nodes and their relationships + + MATCH (component:WeaklyConnectedComponent) + CALL { WITH component + DETACH DELETE component + } IN TRANSACTIONS OF 1000 ROWS + RETURN count(component) as numberOfDeletedComponents \ No newline at end of file diff --git 
a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 0b9eb7374..105ec1d7b 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -255,6 +255,8 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr ,coalesce(codeUnit.outgoingDependencies, 0) AS outgoingDependencies ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName ,coalesce(artifactName, projectName, "") AS projectName + OPTIONAL MATCH (codeUnit)-[:IN_STRONGLY_CONNECTED_COMPONENT]->(stronglyConnectedComponent:StronglyConnectedComponent) + OPTIONAL MATCH (codeUnit)-[:IN_WEAKLY_CONNECTED_COMPONENT]->(weaklyConnectedComponent:WeaklyConnectedComponent) RETURN DISTINCT codeUnitName ,codeUnit.name AS shortCodeUnitName @@ -277,6 +279,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr ,codeUnit.clusteringHDBSCANSize AS clusterSize ,codeUnit.clusteringHDBSCANLabel AS clusterLabel ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid + ,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY """ @@ -500,6 +503,7 @@ def objective(trial) -> float: study.enqueue_trial({'isolation_max_samples': 0.42726366840740576, 'isolation_n_estimators': 141, 'proxy_n_estimators': 190, 'proxy_max_depth': 5}) study.enqueue_trial({'isolation_max_samples': 0.40638732079782663, 'isolation_n_estimators': 108, 'proxy_n_estimators': 191, 'proxy_max_depth': 9}) + study.enqueue_trial({'isolation_max_samples': 0.10105966483207725, 'isolation_n_estimators': 271, 
'proxy_n_estimators': 237, 'proxy_max_depth': 9}) study.enqueue_trial({'isolation_max_samples': 0.10010443935999927, 'isolation_n_estimators': 350, 'proxy_n_estimators': 344, 'proxy_max_depth': 8}) study.enqueue_trial({'isolation_max_samples': 0.10015063610944819, 'isolation_n_estimators': 329, 'proxy_n_estimators': 314, 'proxy_max_depth': 8}) From ea2e2e720167b596962ddb9268d633def9dfe128 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Wed, 21 Jan 2026 08:27:16 +0100 Subject: [PATCH 9/9] Add topological sort max distance from source for strongly connected components to anomaly detection features --- .../anomaly-detection/anomalyDetectionCsv.sh | 5 + .../anomalyDetectionPython.sh | 5 + .../documentation/Architecture.gv | 5 + .../documentation/Architecture.svg | 649 +++++++++--------- ...yDetectionIsolationForestExploration.ipynb | 11 +- ...re-TopologicalSortComponents-Exists.cypher | 9 + ...opologicalSortComponents-Projection.cypher | 10 + ...ure-TopologicalSortComponents-Write.cypher | 14 + .../tunedAnomalyDetectionExplained.py | 1 + 9 files changed, 391 insertions(+), 318 deletions(-) create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher create mode 100644 domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index d8d5b1629..2eaf031cf 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -80,6 +80,11 @@ anomaly_detection_features() { execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \ 
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}" execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}" + # Determines topological sort max distance from source for strongly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher" "${@}" + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher" "${@}" } # Run queries to find anomalies in the graph. diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index 049817eec..087e9da2c 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -127,6 +127,11 @@ anomaly_detection_features() { execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-Write.cypher" "${@}" execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-WeaklyConnectedComponents-CreateNode.cypher" "${@}" + # Determines topological sort max distance from source for strongly connected components if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \ + 
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher" "${@}" + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher" "${@}" } # Execute the Python scripts for anomaly detection. diff --git a/domains/anomaly-detection/documentation/Architecture.gv b/domains/anomaly-detection/documentation/Architecture.gv index 3f0dbb829..d2138f817 100644 --- a/domains/anomaly-detection/documentation/Architecture.gv +++ b/domains/anomaly-detection/documentation/Architecture.gv @@ -117,8 +117,12 @@ digraph AnomalyDetectionPipeline { Abstractness [label="Abstractness\n(Robert C. Martin)"]; StronglyConnectedComponents[label="StronglyConnectedComponents\n(member count)"] WeaklyConnectedComponents [label="WeaklyConnectedComponents\n(median members size)"] + TopologicalSort [label="TopologicalSort\n(max component distance from source)"] } + // Inter graph algorithm feature connections + StronglyConnectedComponents -> TopologicalSort + // Anomaly detection model area subgraph cluster_anomaly { label="Anomaly Detection Model"; @@ -158,6 +162,7 @@ digraph AnomalyDetectionPipeline { Abstractness -> AnomalyStandardizer; StronglyConnectedComponents -> AnomalyStandardizer; WeaklyConnectedComponents -> AnomalyStandardizer; + TopologicalSort -> AnomalyStandardizer; // Proxy RandomForest used as a backing/tuning model for the Isolation Forest TuningAnomaly -> IsolationMinCluster; diff --git a/domains/anomaly-detection/documentation/Architecture.svg b/domains/anomaly-detection/documentation/Architecture.svg index db526b9ea..55cf9ef40 100644 --- a/domains/anomaly-detection/documentation/Architecture.svg +++ b/domains/anomaly-detection/documentation/Architecture.svg @@ -4,735 +4,756 @@ - - + + AnomalyDetectionPipeline - + cluster_leiden - -Leiden 
Community Detection + +Leiden Community Detection cluster_fastRP - -Fast Random Projection (FastRP) + +Fast Random Projection (FastRP) cluster_UMAP - -Uniform Manifold Approximation and Projection (UMAP) -Dimensionality Reduction for Visualization + +Uniform Manifold Approximation and Projection (UMAP) +Dimensionality Reduction for Visualization cluster_hdbscan - -Hierarchical Density-Based Spatial Clustering (HDBSCAN) + +Hierarchical Density-Based Spatial Clustering (HDBSCAN) cluster_graph_features - -Graph (Algorithm) Features + +Graph (Algorithm) Features cluster_anomaly - -Anomaly Detection Model + +Anomaly Detection Model cluster_explainability - -Explainable AI (SHAP) + +Explainable AI (SHAP) Tuning_Leiden - -Tuning -(Optuna) + +Tuning +(Optuna) Leiden_Gamma - -gamma + +gamma Tuning_Leiden->Leiden_Gamma - - + + Leiden_Theta - -theta + +theta Tuning_Leiden->Leiden_Theta - - + + Leiden_Algorithm - -Leiden Community Detection + +Leiden Community Detection Leiden_Gamma->Leiden_Algorithm - - + + Leiden_Theta->Leiden_Algorithm - - + + Leiden_Algorithm->Tuning_Leiden - - -modularity + + +modularity Leiden_Algorithm->Tuning_Leiden - - -size + + +size CommunityId - -Community + +Community Leiden_Algorithm->CommunityId - - + + Tuning_HDBSCAN - -Tuning -(Optuna) + +Tuning +(Optuna) CommunityId->Tuning_HDBSCAN - - -reference + + +reference Tuning_FastRP - -Tuning -(Optuna) + +Tuning +(Optuna) FastRP_Dimension - -dimension + +dimension Tuning_FastRP->FastRP_Dimension - - + + FastRP_Normalization_Strength - -normalization strength + +normalization strength Tuning_FastRP->FastRP_Normalization_Strength - - + + FastRP_Forth_Iteration_Weight - -forth iteration weight + +forth iteration weight Tuning_FastRP->FastRP_Forth_Iteration_Weight - - + + FastRP_Algorithm - -FastRP + +FastRP FastRP_Dimension->FastRP_Algorithm - - + + FastRP_Normalization_Strength->FastRP_Algorithm - - + + FastRP_Forth_Iteration_Weight->FastRP_Algorithm - - + + FastRP_Algorithm->Tuning_FastRP - - 
-adjusted mutual info score -(incl. preview clustering) + + +adjusted mutual info score +(incl. preview clustering) NodeEmbeddings - -Node Embeddings + +Node Embeddings FastRP_Algorithm->NodeEmbeddings - - + + UMAP_Algorithm - -UMAP + +UMAP NodeEmbeddings->UMAP_Algorithm - - + + HDBSCAN_Node - -HDBSCAN + +HDBSCAN NodeEmbeddings->HDBSCAN_Node - - + + - + AnomalyPCA - -Principal Component -Analysis (PCA) + +Principal Component +Analysis (PCA) - + NodeEmbeddings->AnomalyPCA - - + + UMAP_Coordinates - -2D Coordinates + +2D Coordinates UMAP_Algorithm->UMAP_Coordinates - - + + HDBSCAN_Min_Cluster_Size - -Min Cluster Size + +Min Cluster Size Tuning_HDBSCAN->HDBSCAN_Min_Cluster_Size - - + + HDBSCAN_Min_Samples - -Min Samples + +Min Samples Tuning_HDBSCAN->HDBSCAN_Min_Samples - - + + HDBSCAN_Node->Tuning_HDBSCAN - - -adjusted mutual info score + + +adjusted mutual info score ClusterLabel - -Label + +Label HDBSCAN_Node->ClusterLabel - - + + ClusterRadius - -Radius -(avg,max) + +Radius +(avg,max) HDBSCAN_Node->ClusterRadius - - + + ClusterSize - -Size + +Size HDBSCAN_Node->ClusterSize - - + + NormDistToMedoid - -Normalized Distance -To Medoid + +Normalized Distance +To Medoid HDBSCAN_Node->NormDistToMedoid - - + + ClusterNoise - -Noise -(label=-1) + +Noise +(label=-1) HDBSCAN_Node->ClusterNoise - - + + ClusterProbability - -Probability + +Probability HDBSCAN_Node->ClusterProbability - - + + ClusterApproximationOutlierScore - -Approximation -OutlierScore -(= 1 - Probability) + +Approximation +OutlierScore +(= 1 - Probability) HDBSCAN_Node->ClusterApproximationOutlierScore - - + + HDBSCAN_Min_Cluster_Size->HDBSCAN_Node - - + + HDBSCAN_Min_Samples->HDBSCAN_Node - - + + - + AnomalyStandardizer - -Standardizer + +Standardizer - + ClusterRadius->AnomalyStandardizer - - + + - + NormDistToMedoid->AnomalyStandardizer - - + + - + ClusterApproximationOutlierScore->AnomalyStandardizer - - + + ArticleRank - -ArticleRank + +ArticleRank - + ArticleRank->AnomalyStandardizer - - + + PageRank 
- -PageRank + +PageRank - + PageRank->AnomalyStandardizer - - + + PageRank_minus_ArticleRank - -PageRank - -ArticleRank + +PageRank - +ArticleRank - + PageRank_minus_ArticleRank->AnomalyStandardizer - - + + BetweennessCentrality - -Betweenness -Centrality + +Betweenness +Centrality - + BetweennessCentrality->AnomalyStandardizer - - + + LocalClusteringCoefficient - -Local Clustering -Coefficient + +Local Clustering +Coefficient - + LocalClusteringCoefficient->AnomalyStandardizer - - + + Degree - -Degree -(in, out, sum) + +Degree +(in, out, sum) - + Degree->AnomalyStandardizer - - + + Abstractness - -Abstractness -(Robert C. Martin) + +Abstractness +(Robert C. Martin) - + Abstractness->AnomalyStandardizer - - + + StronglyConnectedComponents - -StronglyConnectedComponents(member count) + +StronglyConnectedComponents +(member count) + + + +TopologicalSort + +TopologicalSort +(max component distance from source) + + + +StronglyConnectedComponents->TopologicalSort + + - + StronglyConnectedComponents->AnomalyStandardizer - - + + WeaklyConnectedComponents - -WeaklyConnectedComponents(median members size) + +WeaklyConnectedComponents +(median members size) - + WeaklyConnectedComponents->AnomalyStandardizer - - + + + + + +TopologicalSort->AnomalyStandardizer + + - + TuningAnomaly - -Tuning -(Optuna) + +Tuning +(Optuna) - + IsolationMinCluster - -Min Cluster Size + +Min Cluster Size - + TuningAnomaly->IsolationMinCluster - - + + - + IsolationEstimators - -n estimators + +n estimators - + TuningAnomaly->IsolationEstimators - - + + - + ProxyEstimators - -n estimators + +n estimators - + TuningAnomaly->ProxyEstimators - - + + - + ProxyMaxDepth - -max depth + +max depth - + TuningAnomaly->ProxyMaxDepth - - + + - + IsolationForest - -Isolation Forest -Anomaly Detector + +Isolation Forest +Anomaly Detector - + IsolationMinCluster->IsolationForest - - + + - + IsolationEstimators->IsolationForest - - + + - + ProxyRandomForest - -RandomForest -(Proxy) + +RandomForest +(Proxy) - + 
ProxyEstimators->ProxyRandomForest - - + + - + ProxyMaxDepth->ProxyRandomForest - - + + - + AnomalyStandardizer->IsolationForest - - + + - + AnomalyPCA->IsolationForest - - + + - + IsolationForest->ProxyRandomForest - - -reference + + +reference - + AnomalyScore - -Score + +Score - + IsolationForest->AnomalyScore - - + + - + AnomalyLabel - -Label + +Label - + IsolationForest->AnomalyLabel - - + + - + ProxyRandomForest->TuningAnomaly - - -f1 score -(cross validation) + + +f1 score +(cross validation) - + SHAP - - - -SHAP TreeExplainer + + + +SHAP TreeExplainer - + ProxyRandomForest->SHAP - - + + - + SHAP_Values - -Top SHAP Values + +Top SHAP Values - + SHAP->SHAP_Values - - + + - + SHAP_Features - -Top Features + +Top Features - + SHAP->SHAP_Features - - + + - + SHAP_Embedding_Sum - -Node Embeddings -SHAP Sum + +Node Embeddings +SHAP Sum - + SHAP->SHAP_Embedding_Sum - - + + diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index 36a0d0990..08b81cc8d 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -37,7 +37,8 @@ "| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n", "| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n", "| `Abstractness` (Robert C. 
Martin) | Design / OO Metric | Ratio of abstract types (interfaces, abstract classes) to total types | Indicates architectural intent; supports Dependency Inversion Principle and stability balance |\n", - "| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n" + "| `Relative Strong Component Size (vs WCC Median)` | Structural / Graph Topology | Size of the node’s strongly connected component normalized by the median SCC size within its weakly connected component | Highlights unusually large cyclic dependency groups relative to local context; high values often indicate architectural tangles or stability issues |\n", + "| `Max Topological Distance from Source (SCC DAG)` | Structural / Graph Topology | Longest path from any source SCC to the node’s SCC in the condensed DAG | Approximates architectural depth or layering; high values indicate deeply nested components and potential rigidity or change amplification |" ] }, { @@ -236,6 +237,7 @@ " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", " ,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio\n", + " ,coalesce(stronglyConnectedComponent.topologicalSortMaxDistanceFromSource, 0) AS topologicalComponentLayer\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY\n", " \"\"\"\n", @@ -676,6 +678,7 @@ " study.enqueue_trial({'isolation_max_samples': 0.42726366840740576, 'isolation_n_estimators': 141, 
'proxy_n_estimators': 190, 'proxy_max_depth': 5})\n", " study.enqueue_trial({'isolation_max_samples': 0.40638732079782663, 'isolation_n_estimators': 108, 'proxy_n_estimators': 191, 'proxy_max_depth': 9})\n", " \n", + " study.enqueue_trial({'isolation_max_samples': 0.10105966483207725, 'isolation_n_estimators': 271, 'proxy_n_estimators': 237, 'proxy_max_depth': 9})\n", " study.enqueue_trial({'isolation_max_samples': 0.10010443935999927, 'isolation_n_estimators': 350, 'proxy_n_estimators': 344, 'proxy_max_depth': 8})\n", " study.enqueue_trial({'isolation_max_samples': 0.10015063610944819, 'isolation_n_estimators': 329, 'proxy_n_estimators': 314, 'proxy_max_depth': 8})\n", "\n", @@ -2067,9 +2070,9 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO delete when finished tweaking\n", - "top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n", - "print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))" + "# For debugging purposes\n", + "# top10=get_top_anomalies(java_type_anomaly_detection_features, top_n=25).reset_index(drop=True)\n", + "# print(top10.to_csv(index=False, columns=['shortCodeUnitName', 'anomalyScore']))" ] }, { diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher new file mode 100644 index 000000000..b7c3ab0ed --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a "topologicalSortMaxDistanceFromSource" if it exists + + MATCH (component:StronglyConnectedComponent) + WHERE $projection_node_label + 'Members' IN labels(component) + AND component.topologicalSortMaxDistanceFromSource IS NOT NULL + RETURN component.name AS shortCodeUnitName + ,elementId(component) AS nodeElementId + 
,component.topologicalSortMaxDistanceFromSource AS topologicalSortMaxDistanceFromSource + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher new file mode 100644 index 000000000..f7458df14 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Projection.cypher @@ -0,0 +1,10 @@ +// Creates a projection of the strongly connected components graph for the given member type. Requires: "AnomalyDetectionFeature-StronglyConnectedComponents-CreateDependency.cypher" + + MATCH (sourceComponent:StronglyConnectedComponent) +OPTIONAL MATCH (sourceComponent)-[:DEPENDS_ON]->(targetComponent:StronglyConnectedComponent) + WHERE $projection_node_label + 'Members' IN labels(sourceComponent) + AND $projection_node_label + 'Members' IN labels(targetComponent) + WITH gds.graph.project($projection_name + '-components', sourceComponent, targetComponent) AS graph + RETURN graph.graphName AS graphName + ,graph.nodeCount AS nodeCount + ,graph.relationshipCount AS relationshipCount \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher new file mode 100644 index 000000000..d24c36ec5 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-TopologicalSortComponents-Write.cypher @@ -0,0 +1,14 @@ +// Topological Sort to write the property "topologicalSortMaxDistanceFromSource" (e.g. build order) for strongly connected components into the graph. Requires "AnomalyDetectionFeature-TopologicalSortComponents-Projection". 
+// Needs graph-data-science plugin version >= 2.5.0 + +CALL gds.dag.topologicalSort.stream( + $projection_name + '-components', { + computeMaxDistanceFromSource: true +}) YIELD nodeId, maxDistanceFromSource + WITH nodeId + ,gds.util.asNode(nodeId) AS component + ,toInteger(maxDistanceFromSource) AS maxDistanceFromSource + SET component.topologicalSortMaxDistanceFromSource = maxDistanceFromSource + WITH maxDistanceFromSource, count(*) AS occurrences + RETURN maxDistanceFromSource, occurrences + ORDER BY maxDistanceFromSource \ No newline at end of file diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 105ec1d7b..a40fc466a 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -280,6 +280,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr ,codeUnit.clusteringHDBSCANLabel AS clusterLabel ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid ,coalesce(stronglyConnectedComponent.size / weaklyConnectedComponent.stronglyConnectedComponentSizePercentile50, 1.0) AS stronglyConnectedComponentSizeRatio + ,coalesce(stronglyConnectedComponent.topologicalSortMaxDistanceFromSource, 0) AS topologicalComponentLayer ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY """