Skip to content

Commit 4cf3efe

Browse files
committed
Print more anomaly detection feature distributions
1 parent 2b081a0 commit 4cf3efe

2 files changed

Lines changed: 183 additions & 46 deletions

File tree

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 95 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -378,16 +378,37 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
378378
plot.savefig(plot_file_path)
379379

380380

381-
def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
381+
def plot_feature_distribution(
382+
feature_values: pd.Series,
383+
feature_name: str,
384+
short_names: pd.Series,
385+
title: str,
386+
plot_file_path: str,
387+
log_y_axis: bool = False,
388+
standard_deviation_lines: bool = True,
389+
number_of_outliers_to_annotate: int = 4,
390+
) -> None:
382391
"""
383392
Plots the distribution of feature's values.
384393
385394
Parameters
386395
----------
387396
feature_values : pd.Series
388397
Series containing feature values.
389-
text_prefix: str
390-
Text at the beginning of the title
398+
feature_name : str
399+
Name of the feature to be displayed on the x-axis.
400+
short_names : pd.Series
401+
Series containing the short names of the data points for annotation.
402+
title : str
403+
Title of the plot.
404+
plot_file_path : str
405+
File path to save the plot.
406+
log_y_axis : bool
407+
Whether to use logarithmic scale for the y-axis.
408+
standard_deviation_lines : bool
409+
Whether to plot standard deviation lines.
410+
number_of_outliers_to_annotate : int
411+
Number of outliers to annotate on each side of the distribution.
391412
"""
392413
if feature_values.empty:
393414
print("No data available to plot.")
@@ -400,18 +421,50 @@ def plot_feature_distribution(feature_values: pd.Series, feature_name: str, titl
400421
plot.xlabel(feature_name)
401422
plot.ylabel('Frequency')
402423
plot.xlim(left=feature_values.min(), right=feature_values.max())
403-
# plot.yscale('log') # Use logarithmic scale for better visibility of differences
424+
if log_y_axis:
425+
plot.yscale('log') # Use logarithmic scale for better visibility of differences
404426
plot.grid(True)
405427

406428
mean = feature_values.mean()
407429
standard_deviation = feature_values.std()
408430

409-
# Vertical line for the mean
410-
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
411-
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
412-
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
413-
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
414-
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
431+
if standard_deviation_lines:
432+
# Vertical line for the mean
433+
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
434+
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
435+
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
436+
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
437+
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
438+
439+
def annotate_distribution_outliers(
440+
outliers: pd.Series,
441+
names: pd.Series,
442+
) -> None:
443+
if outliers.empty:
444+
return
445+
rank = 0
446+
for item_index, value in outliers.items():
447+
index = typing.cast(int, item_index)
448+
rank = rank + 1
449+
short_name = names[index]
450+
451+
x_index_offset = (rank % 6) * 10
452+
if value > mean:
453+
x_index_offset = -x_index_offset
454+
455+
plot.annotate(
456+
text=f'{short_name}',
457+
xy=(value, 1),
458+
xytext=(x_index_offset, 60), # offset in points (uses 'textcoords': 'offset points')
459+
rotation=90,
460+
**plot_annotation_style,
461+
)
462+
463+
positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)
464+
annotate_distribution_outliers(positive_outliers, short_names)
465+
466+
negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)
467+
annotate_distribution_outliers(negative_outliers, short_names)
415468

416469
plot.tight_layout()
417470
plot.savefig(plot_file_path)
@@ -835,17 +888,48 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
835888
plot_feature_distribution(
836889
feature_values=data['clusteringCoefficient'],
837890
feature_name='Clustering Coefficient',
891+
short_names=data['shortCodeUnitName'],
838892
title=f"{title_prefix} clustering coefficient distribution",
839-
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
893+
standard_deviation_lines=False,
894+
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters),
840895
)
841896

842897
plot_feature_distribution(
843898
feature_values=data['betweenness'],
844899
feature_name='Betweenness',
900+
short_names=data['shortCodeUnitName'],
845901
title=f"{title_prefix} betweenness centrality distribution",
902+
log_y_axis=True,
846903
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
847904
)
848905

906+
plot_feature_distribution(
907+
feature_values=data['degree'],
908+
feature_name='Degree',
909+
short_names=data['shortCodeUnitName'],
910+
title=f"{title_prefix} degree distribution",
911+
log_y_axis=True,
912+
plot_file_path=get_file_path("Degree_distribution", parameters)
913+
)
914+
915+
plot_feature_distribution(
916+
feature_values=data['incomingDependencies'],
917+
feature_name='incomingDependencies',
918+
short_names=data['shortCodeUnitName'],
919+
title=f"{title_prefix} incoming dependencies distribution",
920+
log_y_axis=True,
921+
plot_file_path=get_file_path("IncomingDependencies_distribution", parameters)
922+
)
923+
924+
plot_feature_distribution(
925+
feature_values=data['outgoingDependencies'],
926+
feature_name='outgoingDependencies',
927+
short_names=data['shortCodeUnitName'],
928+
title=f"{title_prefix} outgoing dependencies distribution",
929+
log_y_axis=True,
930+
plot_file_path=get_file_path("OutgoingDependencies_distribution", parameters)
931+
)
932+
849933
plot_clustering_coefficient_vs_page_rank(
850934
data['clusteringCoefficient'],
851935
data['pageRank'],

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 88 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -644,16 +644,34 @@
644644
"metadata": {},
645645
"outputs": [],
646646
"source": [
647-
"def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n",
647+
"def plot_feature_distribution(\n",
648+
" feature_values: pd.Series,\n",
649+
" feature_name: str,\n",
650+
" short_names: pd.Series,\n",
651+
" title_prefix: str,\n",
652+
" log_y_axis: bool = False,\n",
653+
" standard_deviation_lines: bool = True,\n",
654+
" number_of_outliers_to_annotate: int = 4,\n",
655+
" ) -> None:\n",
648656
" \"\"\"\n",
649657
" Plots the distribution of feature's values.\n",
650658
" \n",
651659
" Parameters\n",
652660
" ----------\n",
653661
" feature_values : pd.Series\n",
654662
" Series containing feature values.\n",
655-
" text_prefix: str\n",
656-
" Text at the beginning of the title\n",
663+
" feature_name : str\n",
664+
" Name of the feature to be displayed on the x-axis.\n",
665+
" short_names : pd.Series\n",
666+
" Series containing the short names of the data points for annotation.\n",
667+
" title_prefix : str\n",
668+
" Prefix for the title of the plot.\n",
669+
" log_y_axis : bool\n",
670+
" Whether to use logarithmic scale for the y-axis.\n",
671+
" standard_deviation_lines : bool\n",
672+
" Whether to plot standard deviation lines.\n",
673+
" number_of_outliers_to_annotate : int\n",
674+
" Number of outliers to annotate on each side of the distribution.\n",
657675
" \"\"\"\n",
658676
" if feature_values.empty:\n",
659677
" print(\"No data available to plot.\")\n",
@@ -666,52 +684,69 @@
666684
" plot.xlabel(feature_name)\n",
667685
" plot.ylabel('Frequency')\n",
668686
" plot.xlim(left=feature_values.min(), right=feature_values.max())\n",
669-
" # plot.yscale('log') # Use logarithmic scale for better visibility of differences\n",
687+
" if log_y_axis:\n",
688+
" plot.yscale('log') # Use logarithmic scale for better visibility of differences\n",
670689
" plot.grid(True)\n",
671690
" plot.tight_layout()\n",
672691
"\n",
673692
" mean = feature_values.mean()\n",
674693
" standard_deviation = feature_values.std()\n",
675694
"\n",
676-
" # Vertical line for the mean\n",
677-
" plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
678-
" # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
679-
" plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
680-
" # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
681-
" plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
695+
" if standard_deviation_lines:\n",
696+
" # Vertical line for the mean\n",
697+
" plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
698+
" # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
699+
" plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
700+
" # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
701+
" plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
702+
"\n",
703+
" def annotate_distribution_outliers(\n",
704+
" outliers: pd.Series,\n",
705+
" names: pd.Series,\n",
706+
" ) -> None:\n",
707+
" if outliers.empty:\n",
708+
" return\n",
709+
" rank = 0\n",
710+
" for item_index, value in outliers.items():\n",
711+
" index = typing.cast(int, item_index)\n",
712+
" rank = rank + 1\n",
713+
" short_name = names[index]\n",
714+
" \n",
715+
" x_index_offset = (rank % 6) * 10\n",
716+
" if value > mean:\n",
717+
" x_index_offset = -x_index_offset\n",
718+
"\n",
719+
" plot.annotate(\n",
720+
" text=f'{short_name}',\n",
721+
" xy=(value, 1),\n",
722+
" xytext=(x_index_offset, 60),\n",
723+
" rotation=90,\n",
724+
" **plot_annotation_style,\n",
725+
" )\n",
726+
"\n",
727+
" positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)\n",
728+
" annotate_distribution_outliers(positive_outliers, short_names)\n",
729+
"\n",
730+
" negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)\n",
731+
" annotate_distribution_outliers(negative_outliers, short_names)\n",
682732
"\n",
683733
" plot.show()"
684734
]
685735
},
686-
{
687-
"cell_type": "code",
688-
"execution_count": null,
689-
"id": "ed900c59",
690-
"metadata": {},
691-
"outputs": [],
692-
"source": [
693-
"def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
694-
" \"\"\"\n",
695-
" Plots the distribution of clustering coefficients.\n",
696-
" \n",
697-
" Parameters\n",
698-
" ----------\n",
699-
" clustering_coefficients : pd.Series\n",
700-
" Series containing clustering coefficient values.\n",
701-
" text_prefix: str\n",
702-
" Text at the beginning of the title\n",
703-
" \"\"\"\n",
704-
" plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)"
705-
]
706-
},
707736
{
708737
"cell_type": "code",
709738
"execution_count": null,
710739
"id": "92aff8d9",
711740
"metadata": {},
712741
"outputs": [],
713742
"source": [
714-
"plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")"
743+
"plot_feature_distribution(\n",
744+
" java_package_features['clusteringCoefficient'],\n",
745+
" 'Clustering Coefficient', \n",
746+
" java_package_features['shortCodeUnitName'], \n",
747+
" title_prefix=\"Java Package\",\n",
748+
" standard_deviation_lines=False\n",
749+
")"
715750
]
716751
},
717752
{
@@ -836,7 +871,13 @@
836871
"metadata": {},
837872
"outputs": [],
838873
"source": [
839-
"plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")"
874+
"plot_feature_distribution(\n",
875+
" java_package_features['betweenness'], \n",
876+
" 'Betweenness', \n",
877+
" java_package_features['shortCodeUnitName'],\n",
878+
" title_prefix=\"Java Package\",\n",
879+
" log_y_axis=True,\n",
880+
")"
840881
]
841882
},
842883
{
@@ -1443,7 +1484,13 @@
14431484
"metadata": {},
14441485
"outputs": [],
14451486
"source": [
1446-
"plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")"
1487+
"plot_feature_distribution(\n",
1488+
" java_type_features['clusteringCoefficient'],\n",
1489+
" 'Clustering Coefficient',\n",
1490+
" java_type_features['shortCodeUnitName'],\n",
1491+
" title_prefix=\"Java Type\",\n",
1492+
" standard_deviation_lines=False\n",
1493+
")"
14471494
]
14481495
},
14491496
{
@@ -1477,7 +1524,13 @@
14771524
"metadata": {},
14781525
"outputs": [],
14791526
"source": [
1480-
"plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")"
1527+
"plot_feature_distribution(\n",
1528+
" java_type_features['betweenness'],\n",
1529+
" 'Betweenness',\n",
1530+
" java_type_features['shortCodeUnitName'],\n",
1531+
" log_y_axis=True,\n",
1532+
" title_prefix=\"Java Type\"\n",
1533+
")"
14811534
]
14821535
},
14831536
{

0 commit comments

Comments
 (0)