Print more anomaly detection feature distributions

JohT · JohT · commit 4cf3efe6dbc0 · 2025-12-26T10:25:38.000+01:00
diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
@@ -378,16 +378,37 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
     plot.savefig(plot_file_path)
 
 
-def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
+def plot_feature_distribution(
+        feature_values: pd.Series,
+        feature_name: str,
+        short_names: pd.Series,
+        title: str,
+        plot_file_path: str,
+        log_y_axis: bool = False,
+        standard_deviation_lines: bool = True,
+        number_of_outliers_to_annotate: int = 4,
+) -> None:
     """
     Plots the distribution of feature's values.
     
     Parameters
     ----------
     feature_values : pd.Series
         Series containing feature values.
-    text_prefix: str
-        Text at the beginning of the title
+    feature_name : str
+        Name of the feature to be displayed on the x-axis.
+    short_names : pd.Series
+        Series containing the short names of the data points for annotation.
+    title : str
+        Title of the plot.
+    plot_file_path : str
+        File path to save the plot.
+    log_y_axis : bool
+        Whether to use logarithmic scale for the y-axis.
+    standard_deviation_lines : bool
+        Whether to plot standard deviation lines.
+    number_of_outliers_to_annotate : int
+        Number of outliers to annotate on each side of the distribution.
     """
     if feature_values.empty:
         print("No data available to plot.")
@@ -400,18 +421,50 @@ def plot_feature_distribution(feature_values: pd.Series, feature_name: str, titl
     plot.xlabel(feature_name)
     plot.ylabel('Frequency')
     plot.xlim(left=feature_values.min(), right=feature_values.max())
-    # plot.yscale('log')  # Use logarithmic scale for better visibility of differences
+    if log_y_axis:
+        plot.yscale('log')  # Use logarithmic scale for better visibility of differences
     plot.grid(True)
 
     mean = feature_values.mean()
     standard_deviation = feature_values.std()
 
-    # Vertical line for the mean
-    plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
-    # Vertical line for 1 x standard deviations + mean (=z-score of 1)
-    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
-    # Vertical line for 2 x standard deviations + mean (=z-score of 2)
-    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
+    if standard_deviation_lines:
+        # Vertical line for the mean
+        plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
+        # Vertical line for 1 x standard deviations + mean (=z-score of 1)
+        plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
+        # Vertical line for 2 x standard deviations + mean (=z-score of 2)
+        plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
+
+    def annotate_distribution_outliers(  # nested helper; reads `mean` from the enclosing function scope
+        outliers: pd.Series,  # values to label, iterated in the given order
+        names: pd.Series,  # looked up by the index labels of `outliers`
+    ) -> None:
+        if outliers.empty:
+            return
+        rank = 0  # becomes the 1-based position within `outliers` once incremented below
+        for item_index, value in outliers.items():
+            index = typing.cast(int, item_index)  # type-checker hint only; no runtime conversion
+            rank = rank + 1
+            short_name = names[index]  # label-based lookup; assumes `names` shares the index of `outliers` -- TODO confirm
+            
+            x_index_offset = (rank % 6) * 10  # cycles through 10, 20, 30, 40, 50, 0 as rank increases
+            if value > mean:
+                x_index_offset = -x_index_offset  # values above the mean get the mirrored (negative) x offset
+
+            plot.annotate(
+                text=f'{short_name}',
+                xy=(value, 1),  # anchor at the feature value on x and at y=1
+                xytext=(x_index_offset, 60),  # offset in points (uses 'textcoords': 'offset points')
+                rotation=90,
+                **plot_annotation_style,  # shared style; not defined in the visible hunks
+            )
+
+    positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)  # largest values first
+    annotate_distribution_outliers(positive_outliers, short_names)
+
+    negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)  # smallest values first
+    annotate_distribution_outliers(negative_outliers, short_names)  # NOTE(review): with few data points both selections can overlap, labelling a point twice
 
     plot.tight_layout()
     plot.savefig(plot_file_path)
@@ -835,17 +888,48 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
 plot_feature_distribution(
     feature_values=data['clusteringCoefficient'],
     feature_name='Clustering Coefficient',
+    short_names=data['shortCodeUnitName'],
     title=f"{title_prefix} clustering coefficient distribution",
-    plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
+    standard_deviation_lines=False,  # skip the mean / standard deviation marker lines for this plot
+    plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters),
 )
 
 plot_feature_distribution(
     feature_values=data['betweenness'],
     feature_name='Betweenness',
+    short_names=data['shortCodeUnitName'],
     title=f"{title_prefix} betweenness centrality distribution",
+    log_y_axis=True,  # logarithmic frequency axis
     plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
 )
 
+plot_feature_distribution(
+    feature_values=data['degree'],
+    feature_name='Degree',
+    short_names=data['shortCodeUnitName'],
+    title=f"{title_prefix} degree distribution",
+    log_y_axis=True,  # logarithmic frequency axis
+    plot_file_path=get_file_path("Degree_distribution", parameters)
+)
+
+plot_feature_distribution(
+    feature_values=data['incomingDependencies'],
+    feature_name='incomingDependencies',  # NOTE(review): raw column name used as x-axis label; the calls above pass human-readable labels
+    short_names=data['shortCodeUnitName'],
+    title=f"{title_prefix} incoming dependencies distribution",
+    log_y_axis=True,  # logarithmic frequency axis
+    plot_file_path=get_file_path("IncomingDependencies_distribution", parameters)
+)
+
+plot_feature_distribution(
+    feature_values=data['outgoingDependencies'],
+    feature_name='outgoingDependencies',  # NOTE(review): raw column name used as x-axis label; the calls above pass human-readable labels
+    short_names=data['shortCodeUnitName'],
+    title=f"{title_prefix} outgoing dependencies distribution",
+    log_y_axis=True,  # logarithmic frequency axis
+    plot_file_path=get_file_path("OutgoingDependencies_distribution", parameters)
+)
+
 plot_clustering_coefficient_vs_page_rank(
     data['clusteringCoefficient'],
     data['pageRank'],
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb
@@ -644,16 +644,34 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n",
+    "def plot_feature_distribution(\n",
+    "        feature_values: pd.Series,\n",
+    "        feature_name: str,\n",
+    "        short_names: pd.Series,\n",
+    "        title_prefix: str,\n",
+    "        log_y_axis: bool = False,\n",
+    "        standard_deviation_lines: bool = True,\n",
+    "        number_of_outliers_to_annotate: int = 4,\n",
+    "    ) -> None:\n",
     "    \"\"\"\n",
     "    Plots the distribution of feature's values.\n",
     "    \n",
     "    Parameters\n",
     "    ----------\n",
     "    feature_values : pd.Series\n",
     "        Series containing feature values.\n",
-    "    text_prefix: str\n",
-    "        Text at the beginning of the title\n",
+    "    feature_name : str\n",
+    "        Name of the feature to be displayed on the x-axis.\n",
+    "    short_names : pd.Series\n",
+    "        Series containing the short names of the data points for annotation.\n",
+    "    title_prefix : str\n",
+    "        Prefix for the title of the plot.\n",
+    "    log_y_axis : bool\n",
+    "        Whether to use logarithmic scale for the y-axis.\n",
+    "    standard_deviation_lines : bool\n",
+    "        Whether to plot standard deviation lines.\n",
+    "    number_of_outliers_to_annotate : int\n",
+    "        Number of outliers to annotate on each side of the distribution.\n",
     "    \"\"\"\n",
     "    if feature_values.empty:\n",
     "        print(\"No data available to plot.\")\n",
@@ -666,52 +684,69 @@
     "    plot.xlabel(feature_name)\n",
     "    plot.ylabel('Frequency')\n",
     "    plot.xlim(left=feature_values.min(), right=feature_values.max())\n",
-    "    # plot.yscale('log')  # Use logarithmic scale for better visibility of differences\n",
+    "    if log_y_axis:\n",
+    "        plot.yscale('log')  # Use logarithmic scale for better visibility of differences\n",
     "    plot.grid(True)\n",
     "    plot.tight_layout()\n",
     "\n",
     "    mean = feature_values.mean()\n",
     "    standard_deviation = feature_values.std()\n",
     "\n",
-    "    # Vertical line for the mean\n",
-    "    plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
-    "    # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
-    "    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
-    "    # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
-    "    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
+    "    if standard_deviation_lines:\n",
+    "        # Vertical line for the mean\n",
+    "        plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
+    "        # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
+    "        plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
+    "        # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
+    "        plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
+    "\n",
+    "    def annotate_distribution_outliers(\n",
+    "        outliers: pd.Series,\n",
+    "        names: pd.Series,\n",
+    "    ) -> None:\n",
+    "        if outliers.empty:\n",
+    "            return\n",
+    "        rank = 0\n",
+    "        for item_index, value in outliers.items():\n",
+    "            index = typing.cast(int, item_index)\n",
+    "            rank = rank + 1\n",
+    "            short_name = names[index]\n",
+    "            \n",
+    "            x_index_offset = (rank % 6) * 10\n",
+    "            if value > mean:\n",
+    "                x_index_offset = -x_index_offset\n",
+    "\n",
+    "            plot.annotate(\n",
+    "                text=f'{short_name}',\n",
+    "                xy=(value, 1),\n",
+    "                xytext=(x_index_offset, 60),\n",
+    "                rotation=90,\n",
+    "                **plot_annotation_style,\n",
+    "            )\n",
+    "\n",
+    "    positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)\n",
+    "    annotate_distribution_outliers(positive_outliers, short_names)\n",
+    "\n",
+    "    negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)\n",
+    "    annotate_distribution_outliers(negative_outliers, short_names)\n",
     "\n",
     "    plot.show()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ed900c59",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
-    "    \"\"\"\n",
-    "    Plots the distribution of clustering coefficients.\n",
-    "    \n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    clustering_coefficients : pd.Series\n",
-    "        Series containing clustering coefficient values.\n",
-    "    text_prefix: str\n",
-    "        Text at the beginning of the title\n",
-    "    \"\"\"\n",
-    "    plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "92aff8d9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")"
+    "plot_feature_distribution(\n",
+    "    java_package_features['clusteringCoefficient'],\n",
+    "    'Clustering Coefficient', \n",
+    "    java_package_features['shortCodeUnitName'], \n",
+    "    title_prefix=\"Java Package\",\n",
+    "    standard_deviation_lines=False\n",
+    ")"
    ]
   },
   {
@@ -836,7 +871,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")"
+    "plot_feature_distribution(\n",
+    "    java_package_features['betweenness'], \n",
+    "    'Betweenness', \n",
+    "    java_package_features['shortCodeUnitName'],\n",
+    "    title_prefix=\"Java Package\",\n",
+    "    log_y_axis=True,\n",
+    ")"
    ]
   },
   {
@@ -1443,7 +1484,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")"
+    "plot_feature_distribution(\n",
+    "    java_type_features['clusteringCoefficient'],\n",
+    "    'Clustering Coefficient',\n",
+    "    java_type_features['shortCodeUnitName'],\n",
+    "    title_prefix=\"Java Type\",\n",
+    "    standard_deviation_lines=False\n",
+    ")"
    ]
   },
   {
@@ -1477,7 +1524,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")"
+    "plot_feature_distribution(\n",
+    "    java_type_features['betweenness'],\n",
+    "    'Betweenness',\n",
+    "    java_type_features['shortCodeUnitName'],\n",
+    "    log_y_axis=True,\n",
+    "    title_prefix=\"Java Type\"\n",
+    ")"
    ]
   },
   {